|
tesseract 3.04.01
|
00001 00002 // File: language_model.cpp 00003 // Description: Functions that utilize the knowledge about the properties, 00004 // structure and statistics of the language to help recognition. 00005 // Author: Daria Antonova 00006 // Created: Mon Nov 11 11:26:43 PST 2009 00007 // 00008 // (C) Copyright 2009, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #include <math.h> 00022 00023 #include "language_model.h" 00024 00025 #include "dawg.h" 00026 #include "freelist.h" 00027 #include "intproto.h" 00028 #include "helpers.h" 00029 #include "lm_state.h" 00030 #include "lm_pain_points.h" 00031 #include "matrix.h" 00032 #include "params.h" 00033 #include "params_training_featdef.h" 00034 00035 #if defined(_MSC_VER) || defined(ANDROID) 00036 double log2(double n) { 00037 return log(n) / log(2.0); 00038 } 00039 #endif // _MSC_VER 00040 00041 namespace tesseract { 00042 00043 const float LanguageModel::kMaxAvgNgramCost = 25.0f; 00044 00045 LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, 00046 Dict *dict) 00047 : INT_MEMBER(language_model_debug_level, 0, "Language model debug level", 00048 dict->getCCUtil()->params()), 00049 BOOL_INIT_MEMBER(language_model_ngram_on, false, 00050 "Turn on/off the use of character ngram model", 00051 dict->getCCUtil()->params()), 00052 INT_MEMBER(language_model_ngram_order, 8, 00053 "Maximum order of the character ngram model", 00054 dict->getCCUtil()->params()), 00055 INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10, 00056 "Maximum number of prunable (those for which" 00057 " PrunablePath() is true) entries in each viterbi list" 00058 " recorded in BLOB_CHOICEs", 00059 dict->getCCUtil()->params()), 00060 INT_MEMBER(language_model_viterbi_list_max_size, 500, 00061 "Maximum size of viterbi lists recorded in BLOB_CHOICEs", 00062 dict->getCCUtil()->params()), 00063 double_MEMBER(language_model_ngram_small_prob, 0.000001, 00064 "To avoid overly small denominators use this as the " 00065 "floor of the probability returned by the ngram model.", 00066 dict->getCCUtil()->params()), 00067 double_MEMBER(language_model_ngram_nonmatch_score, -40.0, 00068 "Average classifier score of a non-matching unichar.", 00069 dict->getCCUtil()->params()), 00070 BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false, 00071 "Use only the first UTF8 step of the given string" 00072 " when computing log probabilities.", 00073 dict->getCCUtil()->params()), 00074 double_MEMBER(language_model_ngram_scale_factor, 0.03, 00075 "Strength of the character ngram model relative to the" 00076 " character classifier ", 00077 dict->getCCUtil()->params()), 00078 double_MEMBER(language_model_ngram_rating_factor, 16.0, 00079 "Factor to bring log-probs into the same range as ratings" 00080 " when multiplied by outline length ", 00081 dict->getCCUtil()->params()), 00082 BOOL_MEMBER(language_model_ngram_space_delimited_language, true, 00083 "Words are delimited by space", 00084 dict->getCCUtil()->params()), 00085 INT_MEMBER(language_model_min_compound_length, 3, 00086 "Minimum length of compound words", 00087 dict->getCCUtil()->params()), 00088 double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1, 00089 "Penalty for words not in the frequent word dictionary", 00090 dict->getCCUtil()->params()), 00091 double_MEMBER(language_model_penalty_non_dict_word, 0.15, 00092 "Penalty for non-dictionary words", 00093 dict->getCCUtil()->params()), 00094 double_MEMBER(language_model_penalty_punc, 0.2, 00095 "Penalty for inconsistent punctuation", 00096 dict->getCCUtil()->params()), 00097 double_MEMBER(language_model_penalty_case, 0.1, 00098 "Penalty for inconsistent case", 00099 dict->getCCUtil()->params()), 00100 double_MEMBER(language_model_penalty_script, 0.5, 00101 "Penalty for inconsistent script", 00102 dict->getCCUtil()->params()), 00103 double_MEMBER(language_model_penalty_chartype, 0.3, 00104 "Penalty for inconsistent character type", 00105 dict->getCCUtil()->params()), 00106 // TODO(daria, rays): enable font consistency checking 00107 // after improving font analysis. 00108 double_MEMBER(language_model_penalty_font, 0.00, 00109 "Penalty for inconsistent font", 00110 dict->getCCUtil()->params()), 00111 double_MEMBER(language_model_penalty_spacing, 0.05, 00112 "Penalty for inconsistent spacing", 00113 dict->getCCUtil()->params()), 00114 double_MEMBER(language_model_penalty_increment, 0.01, 00115 "Penalty increment", 00116 dict->getCCUtil()->params()), 00117 INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations", 00118 dict->getCCUtil()->params()), 00119 BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false, 00120 "Use sigmoidal score for certainty", 00121 dict->getCCUtil()->params()), 00122 fontinfo_table_(fontinfo_table), dict_(dict), 00123 fixed_pitch_(false), max_char_wh_ratio_(0.0), 00124 acceptable_choice_found_(false) { 00125 ASSERT_HOST(dict_ != NULL); 00126 dawg_args_ = new DawgArgs(NULL, new DawgPositionVector(), NO_PERM); 00127 very_beginning_active_dawgs_ = new DawgPositionVector(); 00128 beginning_active_dawgs_ = new DawgPositionVector(); 00129 } 00130 00131 LanguageModel::~LanguageModel() { 00132 delete very_beginning_active_dawgs_; 00133 delete beginning_active_dawgs_; 00134 delete dawg_args_->updated_dawgs; 00135 delete dawg_args_; 00136 } 00137 00138 void LanguageModel::InitForWord(const WERD_CHOICE *prev_word, 00139 bool fixed_pitch, float max_char_wh_ratio, 00140 float rating_cert_scale) { 00141 fixed_pitch_ = fixed_pitch; 00142 max_char_wh_ratio_ = max_char_wh_ratio; 00143 rating_cert_scale_ = rating_cert_scale; 00144 acceptable_choice_found_ = false; 00145 correct_segmentation_explored_ = false; 00146 00147 // Initialize vectors with beginning DawgInfos. 00148 very_beginning_active_dawgs_->clear(); 00149 dict_->init_active_dawgs(very_beginning_active_dawgs_, false); 00150 beginning_active_dawgs_->clear(); 00151 dict_->default_dawgs(beginning_active_dawgs_, false); 00152 00153 // Fill prev_word_str_ with the last language_model_ngram_order 00154 // unichars from prev_word. 00155 if (language_model_ngram_on) { 00156 if (prev_word != NULL && prev_word->unichar_string() != NULL) { 00157 prev_word_str_ = prev_word->unichar_string(); 00158 if (language_model_ngram_space_delimited_language) prev_word_str_ += ' '; 00159 } else { 00160 prev_word_str_ = " "; 00161 } 00162 const char *str_ptr = prev_word_str_.string(); 00163 const char *str_end = str_ptr + prev_word_str_.length(); 00164 int step; 00165 prev_word_unichar_step_len_ = 0; 00166 while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) { 00167 str_ptr += step; 00168 ++prev_word_unichar_step_len_; 00169 } 00170 ASSERT_HOST(str_ptr == str_end); 00171 } 00172 } 00173 00178 static void ScanParentsForCaseMix(const UNICHARSET& unicharset, 00179 LanguageModelState* parent_node) { 00180 if (parent_node == NULL) return; 00181 ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries); 00182 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) { 00183 ViterbiStateEntry* vse = vit.data(); 00184 vse->competing_vse = NULL; 00185 UNICHAR_ID unichar_id = vse->curr_b->unichar_id(); 00186 if (unicharset.get_isupper(unichar_id) || 00187 unicharset.get_islower(unichar_id)) { 00188 UNICHAR_ID other_case = unicharset.get_other_case(unichar_id); 00189 if (other_case == unichar_id) continue; // Not in unicharset. 00190 // Find other case in same list. There could be multiple entries with 00191 // the same unichar_id, but in theory, they should all point to the 00192 // same BLOB_CHOICE, and that is what we will be using to decide 00193 // which to keep. 00194 ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries); 00195 for (vit2.mark_cycle_pt(); !vit2.cycled_list() && 00196 vit2.data()->curr_b->unichar_id() != other_case; 00197 vit2.forward()) {} 00198 if (!vit2.cycled_list()) { 00199 vse->competing_vse = vit2.data(); 00200 } 00201 } 00202 } 00203 } 00204 00209 static bool HasBetterCaseVariant(const UNICHARSET& unicharset, 00210 const BLOB_CHOICE* choice, 00211 BLOB_CHOICE_LIST* choices) { 00212 UNICHAR_ID choice_id = choice->unichar_id(); 00213 UNICHAR_ID other_case = unicharset.get_other_case(choice_id); 00214 if (other_case == choice_id || other_case == INVALID_UNICHAR_ID) 00215 return false; // Not upper or lower or not in unicharset. 00216 if (unicharset.SizesDistinct(choice_id, other_case)) 00217 return false; // Can be separated by size. 00218 BLOB_CHOICE_IT bc_it(choices); 00219 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) { 00220 BLOB_CHOICE* better_choice = bc_it.data(); 00221 if (better_choice->unichar_id() == other_case) 00222 return true; // Found an earlier instance of other_case. 00223 else if (better_choice == choice) 00224 return false; // Reached the original choice. 00225 } 00226 return false; // Should never happen, but just in case. 00227 } 00228 00255 bool LanguageModel::UpdateState( 00256 bool just_classified, 00257 int curr_col, int curr_row, 00258 BLOB_CHOICE_LIST *curr_list, 00259 LanguageModelState *parent_node, 00260 LMPainPoints *pain_points, 00261 WERD_RES *word_res, 00262 BestChoiceBundle *best_choice_bundle, 00263 BlamerBundle *blamer_bundle) { 00264 if (language_model_debug_level > 0) { 00265 tprintf("\nUpdateState: col=%d row=%d %s", 00266 curr_col, curr_row, just_classified ? "just_classified" : ""); 00267 if (language_model_debug_level > 5) 00268 tprintf("(parent=%p)\n", parent_node); 00269 else 00270 tprintf("\n"); 00271 } 00272 // Initialize helper variables. 00273 bool word_end = (curr_row+1 >= word_res->ratings->dimension()); 00274 bool new_changed = false; 00275 float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f; 00276 const UNICHARSET& unicharset = dict_->getUnicharset(); 00277 BLOB_CHOICE *first_lower = NULL; 00278 BLOB_CHOICE *first_upper = NULL; 00279 BLOB_CHOICE *first_digit = NULL; 00280 bool has_alnum_mix = false; 00281 if (parent_node != NULL) { 00282 int result = SetTopParentLowerUpperDigit(parent_node); 00283 if (result < 0) { 00284 if (language_model_debug_level > 0) 00285 tprintf("No parents found to process\n"); 00286 return false; 00287 } 00288 if (result > 0) 00289 has_alnum_mix = true; 00290 } 00291 if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper, 00292 &first_digit)) 00293 has_alnum_mix = false;; 00294 ScanParentsForCaseMix(unicharset, parent_node); 00295 if (language_model_debug_level > 3 && parent_node != NULL) { 00296 parent_node->Print("Parent viterbi list"); 00297 } 00298 LanguageModelState *curr_state = best_choice_bundle->beam[curr_row]; 00299 00300 // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry. 00301 ViterbiStateEntry_IT vit; 00302 BLOB_CHOICE_IT c_it(curr_list); 00303 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { 00304 BLOB_CHOICE* choice = c_it.data(); 00305 // TODO(antonova): make sure commenting this out if ok for ngram 00306 // model scoring (I think this was introduced to fix ngram model quirks). 00307 // Skip NULL unichars unless it is the only choice. 00308 //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue; 00309 UNICHAR_ID unichar_id = choice->unichar_id(); 00310 if (unicharset.get_fragment(unichar_id)) { 00311 continue; // Skip fragments. 00312 } 00313 // Set top choice flags. 00314 LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag; 00315 if (c_it.at_first() || !new_changed) 00316 blob_choice_flags |= kSmallestRatingFlag; 00317 if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag; 00318 if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag; 00319 if (first_digit == choice) blob_choice_flags |= kDigitFlag; 00320 00321 if (parent_node == NULL) { 00322 // Process the beginning of a word. 00323 // If there is a better case variant that is not distinguished by size, 00324 // skip this blob choice, as we have no choice but to accept the result 00325 // of the character classifier to distinguish between them, even if 00326 // followed by an upper case. 00327 // With words like iPoc, and other CamelBackWords, the lower-upper 00328 // transition can only be achieved if the classifier has the correct case 00329 // as the top choice, and leaving an initial I lower down the list 00330 // increases the chances of choosing IPoc simply because it doesn't 00331 // include such a transition. iPoc will beat iPOC and ipoc because 00332 // the other words are baseline/x-height inconsistent. 00333 if (HasBetterCaseVariant(unicharset, choice, curr_list)) 00334 continue; 00335 // Upper counts as lower at the beginning of a word. 00336 if (blob_choice_flags & kUpperCaseFlag) 00337 blob_choice_flags |= kLowerCaseFlag; 00338 new_changed |= AddViterbiStateEntry( 00339 blob_choice_flags, denom, word_end, curr_col, curr_row, 00340 choice, curr_state, NULL, pain_points, 00341 word_res, best_choice_bundle, blamer_bundle); 00342 } else { 00343 // Get viterbi entries from each parent ViterbiStateEntry. 00344 vit.set_to_list(&parent_node->viterbi_state_entries); 00345 int vit_counter = 0; 00346 vit.mark_cycle_pt(); 00347 ViterbiStateEntry* parent_vse = NULL; 00348 LanguageModelFlagsType top_choice_flags; 00349 while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix, 00350 c_it.data(), blob_choice_flags, 00351 unicharset, word_res, &vit, 00352 &top_choice_flags)) != NULL) { 00353 // Skip pruned entries and do not look at prunable entries if already 00354 // examined language_model_viterbi_list_max_num_prunable of those. 00355 if (PrunablePath(*parent_vse) && 00356 (++vit_counter > language_model_viterbi_list_max_num_prunable || 00357 (language_model_ngram_on && parent_vse->ngram_info->pruned))) { 00358 continue; 00359 } 00360 // If the parent has no alnum choice, (ie choice is the first in a 00361 // string of alnum), and there is a better case variant that is not 00362 // distinguished by size, skip this blob choice/parent, as with the 00363 // initial blob treatment above. 00364 if (!parent_vse->HasAlnumChoice(unicharset) && 00365 HasBetterCaseVariant(unicharset, choice, curr_list)) 00366 continue; 00367 // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data() 00368 // looks good according to the Dawgs or character ngram model. 00369 new_changed |= AddViterbiStateEntry( 00370 top_choice_flags, denom, word_end, curr_col, curr_row, 00371 c_it.data(), curr_state, parent_vse, pain_points, 00372 word_res, best_choice_bundle, blamer_bundle); 00373 } 00374 } 00375 } 00376 return new_changed; 00377 } 00378 00385 bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, 00386 BLOB_CHOICE **first_lower, 00387 BLOB_CHOICE **first_upper, 00388 BLOB_CHOICE **first_digit) const { 00389 BLOB_CHOICE_IT c_it(curr_list); 00390 const UNICHARSET &unicharset = dict_->getUnicharset(); 00391 BLOB_CHOICE *first_unichar = NULL; 00392 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { 00393 UNICHAR_ID unichar_id = c_it.data()->unichar_id(); 00394 if (unicharset.get_fragment(unichar_id)) continue; // skip fragments 00395 if (first_unichar == NULL) first_unichar = c_it.data(); 00396 if (*first_lower == NULL && unicharset.get_islower(unichar_id)) { 00397 *first_lower = c_it.data(); 00398 } 00399 if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) && 00400 !unicharset.get_islower(unichar_id)) { 00401 *first_upper = c_it.data(); 00402 } 00403 if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) { 00404 *first_digit = c_it.data(); 00405 } 00406 } 00407 ASSERT_HOST(first_unichar != NULL); 00408 bool mixed = (*first_lower != NULL || *first_upper != NULL) && 00409 *first_digit != NULL; 00410 if (*first_lower == NULL) *first_lower = first_unichar; 00411 if (*first_upper == NULL) *first_upper = first_unichar; 00412 if (*first_digit == NULL) *first_digit = first_unichar; 00413 return mixed; 00414 } 00415 00425 int LanguageModel::SetTopParentLowerUpperDigit( 00426 LanguageModelState *parent_node) const { 00427 if (parent_node == NULL) return -1; 00428 UNICHAR_ID top_id = INVALID_UNICHAR_ID; 00429 ViterbiStateEntry* top_lower = NULL; 00430 ViterbiStateEntry* top_upper = NULL; 00431 ViterbiStateEntry* top_digit = NULL; 00432 ViterbiStateEntry* top_choice = NULL; 00433 float lower_rating = 0.0f; 00434 float upper_rating = 0.0f; 00435 float digit_rating = 0.0f; 00436 float top_rating = 0.0f; 00437 const UNICHARSET &unicharset = dict_->getUnicharset(); 00438 ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries); 00439 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) { 00440 ViterbiStateEntry* vse = vit.data(); 00441 // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan 00442 // back to the real character if needed. 00443 ViterbiStateEntry* unichar_vse = vse; 00444 UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id(); 00445 float rating = unichar_vse->curr_b->rating(); 00446 while (unichar_id == INVALID_UNICHAR_ID && 00447 unichar_vse->parent_vse != NULL) { 00448 unichar_vse = unichar_vse->parent_vse; 00449 unichar_id = unichar_vse->curr_b->unichar_id(); 00450 rating = unichar_vse->curr_b->rating(); 00451 } 00452 if (unichar_id != INVALID_UNICHAR_ID) { 00453 if (unicharset.get_islower(unichar_id)) { 00454 if (top_lower == NULL || lower_rating > rating) { 00455 top_lower = vse; 00456 lower_rating = rating; 00457 } 00458 } else if (unicharset.get_isalpha(unichar_id)) { 00459 if (top_upper == NULL || upper_rating > rating) { 00460 top_upper = vse; 00461 upper_rating = rating; 00462 } 00463 } else if (unicharset.get_isdigit(unichar_id)) { 00464 if (top_digit == NULL || digit_rating > rating) { 00465 top_digit = vse; 00466 digit_rating = rating; 00467 } 00468 } 00469 } 00470 if (top_choice == NULL || top_rating > rating) { 00471 top_choice = vse; 00472 top_rating = rating; 00473 top_id = unichar_id; 00474 } 00475 } 00476 if (top_choice == NULL) return -1; 00477 bool mixed = (top_lower != NULL || top_upper != NULL) && 00478 top_digit != NULL; 00479 if (top_lower == NULL) top_lower = top_choice; 00480 top_lower->top_choice_flags |= kLowerCaseFlag; 00481 if (top_upper == NULL) top_upper = top_choice; 00482 top_upper->top_choice_flags |= kUpperCaseFlag; 00483 if (top_digit == NULL) top_digit = top_choice; 00484 top_digit->top_choice_flags |= kDigitFlag; 00485 top_choice->top_choice_flags |= kSmallestRatingFlag; 00486 if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) && 00487 (top_choice->top_choice_flags & 00488 (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) { 00489 // If the compound marker top choice carries any of the top alnum flags, 00490 // then give it all of them, allowing words like I-295 to be chosen. 00491 top_choice->top_choice_flags |= 00492 kLowerCaseFlag | kUpperCaseFlag | kDigitFlag; 00493 } 00494 return mixed ? 1 : 0; 00495 } 00496 00502 ViterbiStateEntry* LanguageModel::GetNextParentVSE( 00503 bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc, 00504 LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset, 00505 WERD_RES* word_res, ViterbiStateEntry_IT* vse_it, 00506 LanguageModelFlagsType* top_choice_flags) const { 00507 for (; !vse_it->cycled_list(); vse_it->forward()) { 00508 ViterbiStateEntry* parent_vse = vse_it->data(); 00509 // Only consider the parent if it has been updated or 00510 // if the current ratings cell has just been classified. 00511 if (!just_classified && !parent_vse->updated) continue; 00512 if (language_model_debug_level > 2) 00513 parent_vse->Print("Considering"); 00514 // If the parent is non-alnum, then upper counts as lower. 00515 *top_choice_flags = blob_choice_flags; 00516 if ((blob_choice_flags & kUpperCaseFlag) && 00517 !parent_vse->HasAlnumChoice(unicharset)) { 00518 *top_choice_flags |= kLowerCaseFlag; 00519 } 00520 *top_choice_flags &= parent_vse->top_choice_flags; 00521 UNICHAR_ID unichar_id = bc->unichar_id(); 00522 const BLOB_CHOICE* parent_b = parent_vse->curr_b; 00523 UNICHAR_ID parent_id = parent_b->unichar_id(); 00524 // Digits do not bind to alphas if there is a mix in both parent and current 00525 // or if the alpha is not the top choice. 00526 if (unicharset.get_isdigit(unichar_id) && 00527 unicharset.get_isalpha(parent_id) && 00528 (mixed_alnum || *top_choice_flags == 0)) 00529 continue; // Digits don't bind to alphas. 00530 // Likewise alphas do not bind to digits if there is a mix in both or if 00531 // the digit is not the top choice. 00532 if (unicharset.get_isalpha(unichar_id) && 00533 unicharset.get_isdigit(parent_id) && 00534 (mixed_alnum || *top_choice_flags == 0)) 00535 continue; // Alphas don't bind to digits. 00536 // If there is a case mix of the same alpha in the parent list, then 00537 // competing_vse is non-null and will be used to determine whether 00538 // or not to bind the current blob choice. 00539 if (parent_vse->competing_vse != NULL) { 00540 const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b; 00541 UNICHAR_ID other_id = competing_b->unichar_id(); 00542 if (language_model_debug_level >= 5) { 00543 tprintf("Parent %s has competition %s\n", 00544 unicharset.id_to_unichar(parent_id), 00545 unicharset.id_to_unichar(other_id)); 00546 } 00547 if (unicharset.SizesDistinct(parent_id, other_id)) { 00548 // If other_id matches bc wrt position and size, and parent_id, doesn't, 00549 // don't bind to the current parent. 00550 if (bc->PosAndSizeAgree(*competing_b, word_res->x_height, 00551 language_model_debug_level >= 5) && 00552 !bc->PosAndSizeAgree(*parent_b, word_res->x_height, 00553 language_model_debug_level >= 5)) 00554 continue; // Competing blobchoice has a better vertical match. 00555 } 00556 } 00557 vse_it->forward(); 00558 return parent_vse; // This one is good! 00559 } 00560 return NULL; // Ran out of possibilities. 00561 } 00562 00563 bool LanguageModel::AddViterbiStateEntry( 00564 LanguageModelFlagsType top_choice_flags, 00565 float denom, 00566 bool word_end, 00567 int curr_col, int curr_row, 00568 BLOB_CHOICE *b, 00569 LanguageModelState *curr_state, 00570 ViterbiStateEntry *parent_vse, 00571 LMPainPoints *pain_points, 00572 WERD_RES *word_res, 00573 BestChoiceBundle *best_choice_bundle, 00574 BlamerBundle *blamer_bundle) { 00575 ViterbiStateEntry_IT vit; 00576 if (language_model_debug_level > 1) { 00577 tprintf("AddViterbiStateEntry for unichar %s rating=%.4f" 00578 " certainty=%.4f top_choice_flags=0x%x", 00579 dict_->getUnicharset().id_to_unichar(b->unichar_id()), 00580 b->rating(), b->certainty(), top_choice_flags); 00581 if (language_model_debug_level > 5) 00582 tprintf(" parent_vse=%p\n", parent_vse); 00583 else 00584 tprintf("\n"); 00585 } 00586 // Check whether the list is full. 00587 if (curr_state != NULL && 00588 curr_state->viterbi_state_entries_length >= 00589 language_model_viterbi_list_max_size) { 00590 if (language_model_debug_level > 1) { 00591 tprintf("AddViterbiStateEntry: viterbi list is full!\n"); 00592 } 00593 return false; 00594 } 00595 00596 // Invoke Dawg language model component. 00597 LanguageModelDawgInfo *dawg_info = 00598 GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse); 00599 00600 float outline_length = 00601 AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b); 00602 // Invoke Ngram language model component. 00603 LanguageModelNgramInfo *ngram_info = NULL; 00604 if (language_model_ngram_on) { 00605 ngram_info = GenerateNgramInfo( 00606 dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(), 00607 denom, curr_col, curr_row, outline_length, parent_vse); 00608 ASSERT_HOST(ngram_info != NULL); 00609 } 00610 bool liked_by_language_model = dawg_info != NULL || 00611 (ngram_info != NULL && !ngram_info->pruned); 00612 // Quick escape if not liked by the language model, can't be consistent 00613 // xheight, and not top choice. 00614 if (!liked_by_language_model && top_choice_flags == 0) { 00615 if (language_model_debug_level > 1) { 00616 tprintf("Language model components very early pruned this entry\n"); 00617 } 00618 delete ngram_info; 00619 delete dawg_info; 00620 return false; 00621 } 00622 00623 // Check consistency of the path and set the relevant consistency_info. 00624 LMConsistencyInfo consistency_info( 00625 parent_vse != NULL ? &parent_vse->consistency_info : NULL); 00626 // Start with just the x-height consistency, as it provides significant 00627 // pruning opportunity. 00628 consistency_info.ComputeXheightConsistency( 00629 b, dict_->getUnicharset().get_ispunctuation(b->unichar_id())); 00630 // Turn off xheight consistent flag if not consistent. 00631 if (consistency_info.InconsistentXHeight()) { 00632 top_choice_flags &= ~kXhtConsistentFlag; 00633 } 00634 00635 // Quick escape if not liked by the language model, not consistent xheight, 00636 // and not top choice. 00637 if (!liked_by_language_model && top_choice_flags == 0) { 00638 if (language_model_debug_level > 1) { 00639 tprintf("Language model components early pruned this entry\n"); 00640 } 00641 delete ngram_info; 00642 delete dawg_info; 00643 return false; 00644 } 00645 00646 // Compute the rest of the consistency info. 00647 FillConsistencyInfo(curr_col, word_end, b, parent_vse, 00648 word_res, &consistency_info); 00649 if (dawg_info != NULL && consistency_info.invalid_punc) { 00650 consistency_info.invalid_punc = false; // do not penalize dict words 00651 } 00652 00653 // Compute cost of associating the blobs that represent the current unichar. 00654 AssociateStats associate_stats; 00655 ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_, 00656 parent_vse, word_res, &associate_stats); 00657 if (parent_vse != NULL) { 00658 associate_stats.shape_cost += parent_vse->associate_stats.shape_cost; 00659 associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape; 00660 } 00661 00662 // Create the new ViterbiStateEntry compute the adjusted cost of the path. 00663 ViterbiStateEntry *new_vse = new ViterbiStateEntry( 00664 parent_vse, b, 0.0, outline_length, 00665 consistency_info, associate_stats, top_choice_flags, dawg_info, 00666 ngram_info, (language_model_debug_level > 0) ? 00667 dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL); 00668 new_vse->cost = ComputeAdjustedPathCost(new_vse); 00669 if (language_model_debug_level >= 3) 00670 tprintf("Adjusted cost = %g\n", new_vse->cost); 00671 00672 // Invoke Top Choice language model component to make the final adjustments 00673 // to new_vse->top_choice_flags. 00674 if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) { 00675 GenerateTopChoiceInfo(new_vse, parent_vse, curr_state); 00676 } 00677 00678 // If language model components did not like this unichar - return. 00679 bool keep = new_vse->top_choice_flags || liked_by_language_model; 00680 if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths 00681 consistency_info.inconsistent_script) { // with inconsistent script 00682 keep = false; 00683 } 00684 if (!keep) { 00685 if (language_model_debug_level > 1) { 00686 tprintf("Language model components did not like this entry\n"); 00687 } 00688 delete new_vse; 00689 return false; 00690 } 00691 00692 // Discard this entry if it represents a prunable path and 00693 // language_model_viterbi_list_max_num_prunable such entries with a lower 00694 // cost have already been recorded. 00695 if (PrunablePath(*new_vse) && 00696 (curr_state->viterbi_state_entries_prunable_length >= 00697 language_model_viterbi_list_max_num_prunable) && 00698 new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) { 00699 if (language_model_debug_level > 1) { 00700 tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n", 00701 new_vse->cost, 00702 curr_state->viterbi_state_entries_prunable_max_cost); 00703 } 00704 delete new_vse; 00705 return false; 00706 } 00707 00708 // Update best choice if needed. 00709 if (word_end) { 00710 UpdateBestChoice(new_vse, pain_points, word_res, 00711 best_choice_bundle, blamer_bundle); 00712 // Discard the entry if UpdateBestChoice() found flaws in it. 00713 if (new_vse->cost >= WERD_CHOICE::kBadRating && 00714 new_vse != best_choice_bundle->best_vse) { 00715 if (language_model_debug_level > 1) { 00716 tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost); 00717 } 00718 delete new_vse; 00719 return false; 00720 } 00721 } 00722 00723 // Add the new ViterbiStateEntry and to curr_state->viterbi_state_entries. 00724 curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare, 00725 false, new_vse); 00726 curr_state->viterbi_state_entries_length++; 00727 if (PrunablePath(*new_vse)) { 00728 curr_state->viterbi_state_entries_prunable_length++; 00729 } 00730 00731 // Update lms->viterbi_state_entries_prunable_max_cost and clear 00732 // top_choice_flags of entries with ratings_sum than new_vse->ratings_sum. 00733 if ((curr_state->viterbi_state_entries_prunable_length >= 00734 language_model_viterbi_list_max_num_prunable) || 00735 new_vse->top_choice_flags) { 00736 ASSERT_HOST(!curr_state->viterbi_state_entries.empty()); 00737 int prunable_counter = language_model_viterbi_list_max_num_prunable; 00738 vit.set_to_list(&(curr_state->viterbi_state_entries)); 00739 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) { 00740 ViterbiStateEntry *curr_vse = vit.data(); 00741 // Clear the appropriate top choice flags of the entries in the 00742 // list that have cost higher thank new_entry->cost 00743 // (since they will not be top choices any more). 00744 if (curr_vse->top_choice_flags && curr_vse != new_vse && 00745 curr_vse->cost > new_vse->cost) { 00746 curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags); 00747 } 00748 if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter; 00749 // Update curr_state->viterbi_state_entries_prunable_max_cost. 00750 if (prunable_counter == 0) { 00751 curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost; 00752 if (language_model_debug_level > 1) { 00753 tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n", 00754 curr_state->viterbi_state_entries_prunable_max_cost); 00755 } 00756 prunable_counter = -1; // stop counting 00757 } 00758 } 00759 } 00760 00761 // Print the newly created ViterbiStateEntry. 00762 if (language_model_debug_level > 2) { 00763 new_vse->Print("New"); 00764 if (language_model_debug_level > 5) 00765 curr_state->Print("Updated viterbi list"); 00766 } 00767 00768 return true; 00769 } 00770 00771 void LanguageModel::GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, 00772 const ViterbiStateEntry *parent_vse, 00773 LanguageModelState *lms) { 00774 ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries)); 00775 for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags && 00776 new_vse->cost >= vit.data()->cost; vit.forward()) { 00777 // Clear the appropriate flags if the list already contains 00778 // a top choice entry with a lower cost. 00779 new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags); 00780 } 00781 if (language_model_debug_level > 2) { 00782 tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n", 00783 new_vse->top_choice_flags); 00784 } 00785 } 00786 00787 LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo( 00788 bool word_end, 00789 int curr_col, int curr_row, 00790 const BLOB_CHOICE &b, 00791 const ViterbiStateEntry *parent_vse) { 00792 // Initialize active_dawgs from parent_vse if it is not NULL. 00793 // Otherwise use very_beginning_active_dawgs_. 00794 if (parent_vse == NULL) { 00795 dawg_args_->active_dawgs = very_beginning_active_dawgs_; 00796 dawg_args_->permuter = NO_PERM; 00797 } else { 00798 if (parent_vse->dawg_info == NULL) return NULL; // not a dict word path 00799 dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs; 00800 dawg_args_->permuter = parent_vse->dawg_info->permuter; 00801 } 00802 00803 // Deal with hyphenated words. 00804 if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) { 00805 if (language_model_debug_level > 0) tprintf("Hyphenated word found\n"); 00806 return new LanguageModelDawgInfo(dawg_args_->active_dawgs, 00807 COMPOUND_PERM); 00808 } 00809 00810 // Deal with compound words. 00811 if (dict_->compound_marker(b.unichar_id()) && 00812 (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) { 00813 if (language_model_debug_level > 0) tprintf("Found compound marker\n"); 00814 // Do not allow compound operators at the beginning and end of the word. 00815 // Do not allow more than one compound operator per word. 00816 // Do not allow compounding of words with lengths shorter than 00817 // language_model_min_compound_length 00818 if (parent_vse == NULL || word_end || 00819 dawg_args_->permuter == COMPOUND_PERM || 00820 parent_vse->length < language_model_min_compound_length) return NULL; 00821 00822 int i; 00823 // Check a that the path terminated before the current character is a word. 00824 bool has_word_ending = false; 00825 for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) { 00826 const DawgPosition &pos = (*parent_vse->dawg_info->active_dawgs)[i]; 00827 const Dawg *pdawg = pos.dawg_index < 0 00828 ? NULL : dict_->GetDawg(pos.dawg_index); 00829 if (pdawg == NULL || pos.back_to_punc) continue;; 00830 if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE && 00831 pdawg->end_of_word(pos.dawg_ref)) { 00832 has_word_ending = true; 00833 break; 00834 } 00835 } 00836 if (!has_word_ending) return NULL; 00837 00838 if (language_model_debug_level > 0) tprintf("Compound word found\n"); 00839 return new LanguageModelDawgInfo(beginning_active_dawgs_, COMPOUND_PERM); 00840 } // done dealing with compound words 00841 00842 LanguageModelDawgInfo *dawg_info = NULL; 00843 00844 // Call LetterIsOkay(). 00845 // Use the normalized IDs so that all shapes of ' can be allowed in words 00846 // like don't. 00847 const GenericVector<UNICHAR_ID>& normed_ids = 00848 dict_->getUnicharset().normed_ids(b.unichar_id()); 00849 DawgPositionVector tmp_active_dawgs; 00850 for (int i = 0; i < normed_ids.size(); ++i) { 00851 if (language_model_debug_level > 2) 00852 tprintf("Test Letter OK for unichar %d, normed %d\n", 00853 b.unichar_id(), normed_ids[i]); 00854 dict_->LetterIsOkay(dawg_args_, normed_ids[i], 00855 word_end && i == normed_ids.size() - 1); 00856 if (dawg_args_->permuter == NO_PERM) { 00857 break; 00858 } else if (i < normed_ids.size() - 1) { 00859 tmp_active_dawgs = *dawg_args_->updated_dawgs; 00860 dawg_args_->active_dawgs = &tmp_active_dawgs; 00861 } 00862 if (language_model_debug_level > 2) 00863 tprintf("Letter was OK for unichar %d, normed %d\n", 00864 b.unichar_id(), normed_ids[i]); 00865 } 00866 dawg_args_->active_dawgs = NULL; 00867 if (dawg_args_->permuter != NO_PERM) { 00868 dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_dawgs, 00869 dawg_args_->permuter); 00870 } else if (language_model_debug_level > 3) { 00871 tprintf("Letter %s not OK!\n", 00872 dict_->getUnicharset().id_to_unichar(b.unichar_id())); 00873 } 00874 00875 return dawg_info; 00876 } 00877 00878 LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo( 00879 const char *unichar, float certainty, float denom, 00880 int curr_col, int curr_row, float outline_length, 00881 const ViterbiStateEntry *parent_vse) { 00882 // Initialize parent context. 00883 const char *pcontext_ptr = ""; 00884 int pcontext_unichar_step_len = 0; 00885 if (parent_vse == NULL) { 00886 pcontext_ptr = prev_word_str_.string(); 00887 pcontext_unichar_step_len = prev_word_unichar_step_len_; 00888 } else { 00889 pcontext_ptr = parent_vse->ngram_info->context.string(); 00890 pcontext_unichar_step_len = 00891 parent_vse->ngram_info->context_unichar_step_len; 00892 } 00893 // Compute p(unichar | parent context). 00894 int unichar_step_len = 0; 00895 bool pruned = false; 00896 float ngram_cost; 00897 float ngram_and_classifier_cost = 00898 ComputeNgramCost(unichar, certainty, denom, 00899 pcontext_ptr, &unichar_step_len, 00900 &pruned, &ngram_cost); 00901 // Normalize just the ngram_and_classifier_cost by outline_length. 00902 // The ngram_cost is used by the params_model, so it needs to be left as-is, 00903 // and the params model cost will be normalized by outline_length. 00904 ngram_and_classifier_cost *= 00905 outline_length / language_model_ngram_rating_factor; 00906 // Add the ngram_cost of the parent. 00907 if (parent_vse != NULL) { 00908 ngram_and_classifier_cost += 00909 parent_vse->ngram_info->ngram_and_classifier_cost; 00910 ngram_cost += parent_vse->ngram_info->ngram_cost; 00911 } 00912 00913 // Shorten parent context string by unichar_step_len unichars. 00914 int num_remove = (unichar_step_len + pcontext_unichar_step_len - 00915 language_model_ngram_order); 00916 if (num_remove > 0) pcontext_unichar_step_len -= num_remove; 00917 while (num_remove > 0 && *pcontext_ptr != '\0') { 00918 pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr); 00919 --num_remove; 00920 } 00921 00922 // Decide whether to prune this ngram path and update changed accordingly. 00923 if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true; 00924 00925 // Construct and return the new LanguageModelNgramInfo. 00926 LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo( 00927 pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost, 00928 ngram_and_classifier_cost); 00929 ngram_info->context += unichar; 00930 ngram_info->context_unichar_step_len += unichar_step_len; 00931 assert(ngram_info->context_unichar_step_len <= language_model_ngram_order); 00932 return ngram_info; 00933 } 00934 00935 float LanguageModel::ComputeNgramCost(const char *unichar, 00936 float certainty, 00937 float denom, 00938 const char *context, 00939 int *unichar_step_len, 00940 bool *found_small_prob, 00941 float *ngram_cost) { 00942 const char *context_ptr = context; 00943 char *modified_context = NULL; 00944 char *modified_context_end = NULL; 00945 const char *unichar_ptr = unichar; 00946 const char *unichar_end = unichar_ptr + strlen(unichar_ptr); 00947 float prob = 0.0f; 00948 int step = 0; 00949 while (unichar_ptr < unichar_end && 00950 (step = UNICHAR::utf8_step(unichar_ptr)) > 0) { 00951 if (language_model_debug_level > 1) { 00952 tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr, 00953 dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step)); 00954 } 00955 prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step); 00956 ++(*unichar_step_len); 00957 if (language_model_ngram_use_only_first_uft8_step) break; 00958 unichar_ptr += step; 00959 // If there are multiple UTF8 characters present in unichar, context is 00960 // updated to include the previously examined characters from str, 00961 // unless use_only_first_uft8_step is true. 00962 if (unichar_ptr < unichar_end) { 00963 if (modified_context == NULL) { 00964 int context_len = strlen(context); 00965 modified_context = 00966 new char[context_len + strlen(unichar_ptr) + step + 1]; 00967 strncpy(modified_context, context, context_len); 00968 modified_context_end = modified_context + context_len; 00969 context_ptr = modified_context; 00970 } 00971 strncpy(modified_context_end, unichar_ptr - step, step); 00972 modified_context_end += step; 00973 *modified_context_end = '\0'; 00974 } 00975 } 00976 prob /= static_cast<float>(*unichar_step_len); // normalize 00977 if (prob < language_model_ngram_small_prob) { 00978 if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob); 00979 *found_small_prob = true; 00980 prob = language_model_ngram_small_prob; 00981 } 00982 *ngram_cost = -1.0*log2(prob); 00983 float ngram_and_classifier_cost = 00984 -1.0*log2(CertaintyScore(certainty)/denom) + 00985 *ngram_cost * language_model_ngram_scale_factor; 00986 if (language_model_debug_level > 1) { 00987 tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar, 00988 unichar, context_ptr, CertaintyScore(certainty)/denom, prob, 00989 ngram_and_classifier_cost); 00990 } 00991 if (modified_context != NULL) delete[] modified_context; 00992 return ngram_and_classifier_cost; 00993 } 00994 00995 float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) { 00996 if (curr_list->empty()) return 1.0f; 00997 float denom = 0.0f; 00998 int len = 0; 00999 BLOB_CHOICE_IT c_it(curr_list); 01000 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { 01001 ASSERT_HOST(c_it.data() != NULL); 01002 ++len; 01003 denom += CertaintyScore(c_it.data()->certainty()); 01004 } 01005 assert(len != 0); 01006 // The ideal situation would be to have the classifier scores for 01007 // classifying each position as each of the characters in the unicharset. 01008 // Since we can not do this because of speed, we add a very crude estimate 01009 // of what these scores for the "missing" classifications would sum up to. 01010 denom += (dict_->getUnicharset().size() - len) * 01011 CertaintyScore(language_model_ngram_nonmatch_score); 01012 01013 return denom; 01014 } 01015 01016 void LanguageModel::FillConsistencyInfo( 01017 int curr_col, 01018 bool word_end, 01019 BLOB_CHOICE *b, 01020 ViterbiStateEntry *parent_vse, 01021 WERD_RES *word_res, 01022 LMConsistencyInfo *consistency_info) { 01023 const UNICHARSET &unicharset = dict_->getUnicharset(); 01024 UNICHAR_ID unichar_id = b->unichar_id(); 01025 BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL; 01026 01027 // Check punctuation validity. 01028 if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++; 01029 if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) { 01030 if (dict_->compound_marker(unichar_id) && parent_b != NULL && 01031 (unicharset.get_isalpha(parent_b->unichar_id()) || 01032 unicharset.get_isdigit(parent_b->unichar_id()))) { 01033 // reset punc_ref for compound words 01034 consistency_info->punc_ref = NO_EDGE; 01035 } else { 01036 bool is_apos = dict_->is_apostrophe(unichar_id); 01037 bool prev_is_numalpha = (parent_b != NULL && 01038 (unicharset.get_isalpha(parent_b->unichar_id()) || 01039 unicharset.get_isdigit(parent_b->unichar_id()))); 01040 UNICHAR_ID pattern_unichar_id = 01041 (unicharset.get_isalpha(unichar_id) || 01042 unicharset.get_isdigit(unichar_id) || 01043 (is_apos && prev_is_numalpha)) ? 01044 Dawg::kPatternUnicharID : unichar_id; 01045 if (consistency_info->punc_ref == NO_EDGE || 01046 pattern_unichar_id != Dawg::kPatternUnicharID || 01047 dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) != 01048 Dawg::kPatternUnicharID) { 01049 NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(), 01050 consistency_info->punc_ref); 01051 consistency_info->punc_ref = 01052 (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of( 01053 node, pattern_unichar_id, word_end) : NO_EDGE; 01054 if (consistency_info->punc_ref == NO_EDGE) { 01055 consistency_info->invalid_punc = true; 01056 } 01057 } 01058 } 01059 } 01060 01061 // Update case related counters. 01062 if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) { 01063 // Reset counters if we are dealing with a compound word. 01064 consistency_info->num_lower = 0; 01065 consistency_info->num_non_first_upper = 0; 01066 } 01067 else if (unicharset.get_islower(unichar_id)) { 01068 consistency_info->num_lower++; 01069 } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) { 01070 if (unicharset.get_isupper(parent_b->unichar_id()) || 01071 consistency_info->num_lower > 0 || 01072 consistency_info->num_non_first_upper > 0) { 01073 consistency_info->num_non_first_upper++; 01074 } 01075 } 01076 01077 // Initialize consistency_info->script_id (use script of unichar_id 01078 // if it is not Common, use script id recorded by the parent otherwise). 01079 // Set inconsistent_script to true if the script of the current unichar 01080 // is not consistent with that of the parent. 01081 consistency_info->script_id = unicharset.get_script(unichar_id); 01082 // Hiragana and Katakana can mix with Han. 01083 if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) { 01084 if ((unicharset.hiragana_sid() != unicharset.null_sid() && 01085 consistency_info->script_id == unicharset.hiragana_sid()) || 01086 (unicharset.katakana_sid() != unicharset.null_sid() && 01087 consistency_info->script_id == unicharset.katakana_sid())) { 01088 consistency_info->script_id = dict_->getUnicharset().han_sid(); 01089 } 01090 } 01091 01092 if (parent_vse != NULL && 01093 (parent_vse->consistency_info.script_id != 01094 dict_->getUnicharset().common_sid())) { 01095 int parent_script_id = parent_vse->consistency_info.script_id; 01096 // If script_id is Common, use script id of the parent instead. 01097 if (consistency_info->script_id == dict_->getUnicharset().common_sid()) { 01098 consistency_info->script_id = parent_script_id; 01099 } 01100 if (consistency_info->script_id != parent_script_id) { 01101 consistency_info->inconsistent_script = true; 01102 } 01103 } 01104 01105 // Update chartype related counters. 01106 if (unicharset.get_isalpha(unichar_id)) { 01107 consistency_info->num_alphas++; 01108 } else if (unicharset.get_isdigit(unichar_id)) { 01109 consistency_info->num_digits++; 01110 } else if (!unicharset.get_ispunctuation(unichar_id)) { 01111 consistency_info->num_other++; 01112 } 01113 01114 // Check font and spacing consistency. 01115 if (fontinfo_table_->size() > 0 && parent_b != NULL) { 01116 int fontinfo_id = -1; 01117 if (parent_b->fontinfo_id() == b->fontinfo_id() || 01118 parent_b->fontinfo_id2() == b->fontinfo_id()) { 01119 fontinfo_id = b->fontinfo_id(); 01120 } else if (parent_b->fontinfo_id() == b->fontinfo_id2() || 01121 parent_b->fontinfo_id2() == b->fontinfo_id2()) { 01122 fontinfo_id = b->fontinfo_id2(); 01123 } 01124 if(language_model_debug_level > 1) { 01125 tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n", 01126 (parent_b->fontinfo_id() >= 0) ? 01127 fontinfo_table_->get(parent_b->fontinfo_id()).name : "" , 01128 (parent_b->fontinfo_id2() >= 0) ? 01129 fontinfo_table_->get(parent_b->fontinfo_id2()).name : "", 01130 (b->fontinfo_id() >= 0) ? 01131 fontinfo_table_->get(b->fontinfo_id()).name : "", 01132 (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "", 01133 (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "", 01134 fontinfo_id); 01135 } 01136 if (!word_res->blob_widths.empty()) { // if we have widths/gaps info 01137 bool expected_gap_found = false; 01138 float expected_gap; 01139 int temp_gap; 01140 if (fontinfo_id >= 0) { // found a common font 01141 ASSERT_HOST(fontinfo_id < fontinfo_table_->size()); 01142 if (fontinfo_table_->get(fontinfo_id).get_spacing( 01143 parent_b->unichar_id(), unichar_id, &temp_gap)) { 01144 expected_gap = temp_gap; 01145 expected_gap_found = true; 01146 } 01147 } else { 01148 consistency_info->inconsistent_font = true; 01149 // Get an average of the expected gaps in each font 01150 int num_addends = 0; 01151 expected_gap = 0; 01152 int temp_fid; 01153 for (int i = 0; i < 4; ++i) { 01154 if (i == 0) { 01155 temp_fid = parent_b->fontinfo_id(); 01156 } else if (i == 1) { 01157 temp_fid = parent_b->fontinfo_id2(); 01158 } else if (i == 2) { 01159 temp_fid = b->fontinfo_id(); 01160 } else { 01161 temp_fid = b->fontinfo_id2(); 01162 } 01163 ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size()); 01164 if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing( 01165 parent_b->unichar_id(), unichar_id, &temp_gap)) { 01166 expected_gap += temp_gap; 01167 num_addends++; 01168 } 01169 } 01170 expected_gap_found = (num_addends > 0); 01171 if (num_addends > 0) { 01172 expected_gap /= static_cast<float>(num_addends); 01173 } 01174 } 01175 if (expected_gap_found) { 01176 float actual_gap = 01177 static_cast<float>(word_res->GetBlobsGap(curr_col-1)); 01178 float gap_ratio = expected_gap / actual_gap; 01179 // TODO(rays) The gaps seem to be way off most of the time, saved by 01180 // the error here that the ratio was compared to 1/2, when it should 01181 // have been 0.5f. Find the source of the gaps discrepancy and put 01182 // the 0.5f here in place of 0.0f. 01183 // Test on 2476595.sj, pages 0 to 6. (In French.) 01184 if (gap_ratio < 0.0f || gap_ratio > 2.0f) { 01185 consistency_info->num_inconsistent_spaces++; 01186 } 01187 if (language_model_debug_level > 1) { 01188 tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n", 01189 unicharset.id_to_unichar(parent_b->unichar_id()), 01190 parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id), 01191 unichar_id, curr_col, expected_gap, actual_gap); 01192 } 01193 } 01194 } 01195 } 01196 } 01197 01198 float LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse) { 01199 ASSERT_HOST(vse != NULL); 01200 if (params_model_.Initialized()) { 01201 float features[PTRAIN_NUM_FEATURE_TYPES]; 01202 ExtractFeaturesFromPath(*vse, features); 01203 float cost = params_model_.ComputeCost(features); 01204 if (language_model_debug_level > 3) { 01205 tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost); 01206 if (language_model_debug_level >= 5) { 01207 for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) { 01208 tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]); 01209 } 01210 } 01211 } 01212 return cost * vse->outline_length; 01213 } else { 01214 float adjustment = 1.0f; 01215 if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) { 01216 adjustment += language_model_penalty_non_freq_dict_word; 01217 } 01218 if (vse->dawg_info == NULL) { 01219 adjustment += language_model_penalty_non_dict_word; 01220 if (vse->length > language_model_min_compound_length) { 01221 adjustment += ((vse->length - language_model_min_compound_length) * 01222 language_model_penalty_increment); 01223 } 01224 } 01225 if (vse->associate_stats.shape_cost > 0) { 01226 adjustment += vse->associate_stats.shape_cost / 01227 static_cast<float>(vse->length); 01228 } 01229 if (language_model_ngram_on) { 01230 ASSERT_HOST(vse->ngram_info != NULL); 01231 return vse->ngram_info->ngram_and_classifier_cost * adjustment; 01232 } else { 01233 adjustment += ComputeConsistencyAdjustment(vse->dawg_info, 01234 vse->consistency_info); 01235 return vse->ratings_sum * adjustment; 01236 } 01237 } 01238 } 01239 01240 void LanguageModel::UpdateBestChoice( 01241 ViterbiStateEntry *vse, 01242 LMPainPoints *pain_points, 01243 WERD_RES *word_res, 01244 BestChoiceBundle *best_choice_bundle, 01245 BlamerBundle *blamer_bundle) { 01246 bool truth_path; 01247 WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt, 01248 blamer_bundle, &truth_path); 01249 ASSERT_HOST(word != NULL); 01250 if (dict_->stopper_debug_level >= 1) { 01251 STRING word_str; 01252 word->string_and_lengths(&word_str, NULL); 01253 vse->Print(word_str.string()); 01254 } 01255 if (language_model_debug_level > 0) { 01256 word->print("UpdateBestChoice() constructed word"); 01257 } 01258 // Record features from the current path if necessary. 01259 ParamsTrainingHypothesis curr_hyp; 01260 if (blamer_bundle != NULL) { 01261 if (vse->dawg_info != NULL) vse->dawg_info->permuter = 01262 static_cast<PermuterType>(word->permuter()); 01263 ExtractFeaturesFromPath(*vse, curr_hyp.features); 01264 word->string_and_lengths(&(curr_hyp.str), NULL); 01265 curr_hyp.cost = vse->cost; // record cost for error rate computations 01266 if (language_model_debug_level > 0) { 01267 tprintf("Raw features extracted from %s (cost=%g) [ ", 01268 curr_hyp.str.string(), curr_hyp.cost); 01269 for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) { 01270 tprintf("%g ", curr_hyp.features[deb_i]); 01271 } 01272 tprintf("]\n"); 01273 } 01274 // Record the current hypothesis in params_training_bundle. 01275 blamer_bundle->AddHypothesis(curr_hyp); 01276 if (truth_path) 01277 blamer_bundle->UpdateBestRating(word->rating()); 01278 } 01279 if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) { 01280 // The word was constructed solely for blamer_bundle->AddHypothesis, so 01281 // we no longer need it. 01282 delete word; 01283 return; 01284 } 01285 if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty()) 01286 word->SetScriptPositions(false, word_res->chopped_word); 01287 // Update and log new raw_choice if needed. 01288 if (word_res->raw_choice == NULL || 01289 word->rating() < word_res->raw_choice->rating()) { 01290 if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0) 01291 tprintf("Updated raw choice\n"); 01292 } 01293 // Set the modified rating for best choice to vse->cost and log best choice. 01294 word->set_rating(vse->cost); 01295 // Call LogNewChoice() for best choice from Dict::adjust_word() since it 01296 // computes adjust_factor that is used by the adaption code (e.g. by 01297 // ClassifyAdaptableWord() to compute adaption acceptance thresholds). 01298 // Note: the rating of the word is not adjusted. 01299 dict_->adjust_word(word, vse->dawg_info == NULL, 01300 vse->consistency_info.xht_decision, 0.0, 01301 false, language_model_debug_level > 0); 01302 // Hand ownership of the word over to the word_res. 01303 if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log, 01304 dict_->stopper_debug_level >= 1, word)) { 01305 // The word was so bad that it was deleted. 01306 return; 01307 } 01308 if (word_res->best_choice == word) { 01309 // Word was the new best. 01310 if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) && 01311 AcceptablePath(*vse)) { 01312 acceptable_choice_found_ = true; 01313 } 01314 // Update best_choice_bundle. 01315 best_choice_bundle->updated = true; 01316 best_choice_bundle->best_vse = vse; 01317 if (language_model_debug_level > 0) { 01318 tprintf("Updated best choice\n"); 01319 word->print_state("New state "); 01320 } 01321 // Update hyphen state if we are dealing with a dictionary word. 01322 if (vse->dawg_info != NULL) { 01323 if (dict_->has_hyphen_end(*word)) { 01324 dict_->set_hyphen_word(*word, *(dawg_args_->active_dawgs)); 01325 } else { 01326 dict_->reset_hyphen_vars(true); 01327 } 01328 } 01329 01330 if (blamer_bundle != NULL) { 01331 blamer_bundle->set_best_choice_is_dict_and_top_choice( 01332 vse->dawg_info != NULL && vse->top_choice_flags); 01333 } 01334 } 01335 if (wordrec_display_segmentations && word_res->chopped_word != NULL) { 01336 word->DisplaySegmentation(word_res->chopped_word); 01337 } 01338 } 01339 01340 void LanguageModel::ExtractFeaturesFromPath( 01341 const ViterbiStateEntry &vse, float features[]) { 01342 memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES); 01343 // Record dictionary match info. 01344 int len = vse.length <= kMaxSmallWordUnichars ? 0 : 01345 vse.length <= kMaxMediumWordUnichars ? 1 : 2; 01346 if (vse.dawg_info != NULL) { 01347 int permuter = vse.dawg_info->permuter; 01348 if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) { 01349 if (vse.consistency_info.num_digits == vse.length) { 01350 features[PTRAIN_DIGITS_SHORT+len] = 1.0; 01351 } else { 01352 features[PTRAIN_NUM_SHORT+len] = 1.0; 01353 } 01354 } else if (permuter == DOC_DAWG_PERM) { 01355 features[PTRAIN_DOC_SHORT+len] = 1.0; 01356 } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM || 01357 permuter == COMPOUND_PERM) { 01358 features[PTRAIN_DICT_SHORT+len] = 1.0; 01359 } else if (permuter == FREQ_DAWG_PERM) { 01360 features[PTRAIN_FREQ_SHORT+len] = 1.0; 01361 } 01362 } 01363 // Record shape cost feature (normalized by path length). 01364 features[PTRAIN_SHAPE_COST_PER_CHAR] = 01365 vse.associate_stats.shape_cost / static_cast<float>(vse.length); 01366 // Record ngram cost. (normalized by the path length). 01367 features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0; 01368 if (vse.ngram_info != NULL) { 01369 features[PTRAIN_NGRAM_COST_PER_CHAR] = 01370 vse.ngram_info->ngram_cost / static_cast<float>(vse.length); 01371 } 01372 // Record consistency-related features. 01373 // Disabled this feature for due to its poor performance. 01374 // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc(); 01375 features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase(); 01376 features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision; 01377 features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ? 01378 vse.consistency_info.NumInconsistentChartype() : 0.0; 01379 features[PTRAIN_NUM_BAD_SPACING] = 01380 vse.consistency_info.NumInconsistentSpaces(); 01381 // Disabled this feature for now due to its poor performance. 01382 // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font; 01383 01384 // Classifier-related features. 01385 features[PTRAIN_RATING_PER_CHAR] = 01386 vse.ratings_sum / static_cast<float>(vse.outline_length); 01387 } 01388 01389 WERD_CHOICE *LanguageModel::ConstructWord( 01390 ViterbiStateEntry *vse, 01391 WERD_RES *word_res, 01392 DANGERR *fixpt, 01393 BlamerBundle *blamer_bundle, 01394 bool *truth_path) { 01395 if (truth_path != NULL) { 01396 *truth_path = 01397 (blamer_bundle != NULL && 01398 vse->length == blamer_bundle->correct_segmentation_length()); 01399 } 01400 BLOB_CHOICE *curr_b = vse->curr_b; 01401 ViterbiStateEntry *curr_vse = vse; 01402 01403 int i; 01404 bool compound = dict_->hyphenated(); // treat hyphenated words as compound 01405 01406 // Re-compute the variance of the width-to-height ratios (since we now 01407 // can compute the mean over the whole word). 01408 float full_wh_ratio_mean = 0.0f; 01409 if (vse->associate_stats.full_wh_ratio_var != 0.0f) { 01410 vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var; 01411 full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total / 01412 static_cast<float>(vse->length)); 01413 vse->associate_stats.full_wh_ratio_var = 0.0f; 01414 } 01415 01416 // Construct a WERD_CHOICE by tracing parent pointers. 01417 WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length); 01418 word->set_length(vse->length); 01419 int total_blobs = 0; 01420 for (i = (vse->length-1); i >= 0; --i) { 01421 if (blamer_bundle != NULL && truth_path != NULL && *truth_path && 01422 !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) { 01423 *truth_path = false; 01424 } 01425 // The number of blobs used for this choice is row - col + 1. 01426 int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1; 01427 total_blobs += num_blobs; 01428 word->set_blob_choice(i, num_blobs, curr_b); 01429 // Update the width-to-height ratio variance. Useful non-space delimited 01430 // languages to ensure that the blobs are of uniform width. 01431 // Skip leading and trailing punctuation when computing the variance. 01432 if ((full_wh_ratio_mean != 0.0f && 01433 ((curr_vse != vse && curr_vse->parent_vse != NULL) || 01434 !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) { 01435 vse->associate_stats.full_wh_ratio_var += 01436 pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2); 01437 if (language_model_debug_level > 2) { 01438 tprintf("full_wh_ratio_var += (%g-%g)^2\n", 01439 full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio); 01440 } 01441 } 01442 01443 // Mark the word as compound if compound permuter was set for any of 01444 // the unichars on the path (usually this will happen for unichars 01445 // that are compounding operators, like "-" and "/"). 01446 if (!compound && curr_vse->dawg_info && 01447 curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true; 01448 01449 // Update curr_* pointers. 01450 curr_vse = curr_vse->parent_vse; 01451 if (curr_vse == NULL) break; 01452 curr_b = curr_vse->curr_b; 01453 } 01454 ASSERT_HOST(i == 0); // check that we recorded all the unichar ids. 01455 ASSERT_HOST(total_blobs == word_res->ratings->dimension()); 01456 // Re-adjust shape cost to include the updated width-to-height variance. 01457 if (full_wh_ratio_mean != 0.0f) { 01458 vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var; 01459 } 01460 01461 word->set_rating(vse->ratings_sum); 01462 word->set_certainty(vse->min_certainty); 01463 word->set_x_heights(vse->consistency_info.BodyMinXHeight(), 01464 vse->consistency_info.BodyMaxXHeight()); 01465 if (vse->dawg_info != NULL) { 01466 word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter); 01467 } else if (language_model_ngram_on && !vse->ngram_info->pruned) { 01468 word->set_permuter(NGRAM_PERM); 01469 } else if (vse->top_choice_flags) { 01470 word->set_permuter(TOP_CHOICE_PERM); 01471 } else { 01472 word->set_permuter(NO_PERM); 01473 } 01474 word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true, 01475 word_res->ratings)); 01476 return word; 01477 } 01478 01479 } // namespace tesseract