|
tesseract 3.04.01
|
00001 00002 // File: dict.cpp 00003 // Description: dict class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #include <stdio.h> 00020 00021 #include "dict.h" 00022 #include "unicodes.h" 00023 00024 #ifdef _MSC_VER 00025 #pragma warning(disable:4244) // Conversion warnings 00026 #endif 00027 #include "tprintf.h" 00028 00029 namespace tesseract { 00030 00031 class Image; 00032 00033 Dict::Dict(CCUtil* ccutil) 00034 : letter_is_okay_(&tesseract::Dict::def_letter_is_okay), 00035 probability_in_context_(&tesseract::Dict::def_probability_in_context), 00036 params_model_classify_(NULL), 00037 ccutil_(ccutil), 00038 STRING_MEMBER(user_words_file, "", 00039 "A filename of user-provided words.", 00040 getCCUtil()->params()), 00041 STRING_INIT_MEMBER(user_words_suffix, "", 00042 "A suffix of user-provided words located in tessdata.", 00043 getCCUtil()->params()), 00044 STRING_MEMBER(user_patterns_file, "", 00045 "A filename of user-provided patterns.", 00046 getCCUtil()->params()), 00047 STRING_INIT_MEMBER(user_patterns_suffix, "", 00048 "A suffix of user-provided patterns located in " 00049 "tessdata.", 00050 getCCUtil()->params()), 00051 BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", 00052 getCCUtil()->params()), 00053 BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", 00054 getCCUtil()->params()), 00055 BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.", 00056 getCCUtil()->params()), 00057 BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation" 00058 " patterns.", getCCUtil()->params()), 00059 BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number" 00060 " patterns.", getCCUtil()->params()), 00061 BOOL_INIT_MEMBER(load_bigram_dawg, true, "Load dawg with special word " 00062 "bigrams.", getCCUtil()->params()), 00063 double_MEMBER(xheight_penalty_subscripts, 0.125, 00064 "Score penalty (0.1 = 10%) added if there are subscripts " 00065 "or superscripts in a word, but it is otherwise OK.", 00066 getCCUtil()->params()), 00067 double_MEMBER(xheight_penalty_inconsistent, 0.25, 00068 "Score penalty (0.1 = 10%) added if an xheight is " 00069 "inconsistent.", getCCUtil()->params()), 00070 double_MEMBER(segment_penalty_dict_frequent_word, 1.0, 00071 "Score multiplier for word matches which have good case and" 00072 "are frequent in the given language (lower is better).", 00073 getCCUtil()->params()), 00074 double_MEMBER(segment_penalty_dict_case_ok, 1.1, 00075 "Score multiplier for word matches that have good case " 00076 "(lower is better).", getCCUtil()->params()), 00077 double_MEMBER(segment_penalty_dict_case_bad, 1.3125, 00078 "Default score multiplier for word matches, which may have " 00079 "case issues (lower is better).", 00080 getCCUtil()->params()), 00081 double_MEMBER(segment_penalty_ngram_best_choice, 1.24, 00082 "Multipler to for the best choice from the ngram model.", 00083 getCCUtil()->params()), 00084 double_MEMBER(segment_penalty_dict_nonword, 1.25, 00085 "Score multiplier for glyph fragment segmentations which " 00086 "do not match a dictionary word (lower is better).", 00087 getCCUtil()->params()), 00088 double_MEMBER(segment_penalty_garbage, 1.50, 00089 "Score multiplier for poorly cased strings that are not in" 00090 " the dictionary and generally look like garbage (lower is" 00091 " better).", getCCUtil()->params()), 00092 STRING_MEMBER(output_ambig_words_file, "", 00093 "Output file for ambiguities found in the dictionary", 00094 getCCUtil()->params()), 00095 INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info" 00096 ", to 2 for more details, to 3 to see all the debug messages", 00097 getCCUtil()->params()), 00098 INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", 00099 getCCUtil()->params()), 00100 INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.", 00101 getCCUtil()->params()), 00102 BOOL_MEMBER(use_only_first_uft8_step, false, 00103 "Use only the first UTF8 step of the given string" 00104 " when computing log probabilities.", 00105 getCCUtil()->params()), 00106 double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", 00107 getCCUtil()->params()), 00108 double_MEMBER(stopper_nondict_certainty_base, -2.50, 00109 "Certainty threshold for non-dict words", 00110 getCCUtil()->params()), 00111 double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, 00112 "Reject certainty offset", 00113 getCCUtil()->params()), 00114 INT_MEMBER(stopper_smallword_size, 2, 00115 "Size of dict word to be treated as non-dict word", 00116 getCCUtil()->params()), 00117 double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add" 00118 " for each dict char above small word size.", 00119 getCCUtil()->params()), 00120 double_MEMBER(stopper_allowable_character_badness, 3.0, 00121 "Max certaintly variation allowed in a word (in sigma)", 00122 getCCUtil()->params()), 00123 INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", 00124 getCCUtil()->params()), 00125 BOOL_MEMBER(stopper_no_acceptable_choices, false, 00126 "Make AcceptableChoice() always return false. Useful" 00127 " when there is a need to explore all segmentations", 00128 getCCUtil()->params()), 00129 BOOL_MEMBER(save_raw_choices, false, 00130 "Deprecated- backward compatibility only", 00131 getCCUtil()->params()), 00132 INT_MEMBER(tessedit_truncate_wordchoice_log, 10, 00133 "Max words to keep in list", 00134 getCCUtil()->params()), 00135 STRING_MEMBER(word_to_debug, "", "Word for which stopper debug" 00136 " information should be printed to stdout", 00137 getCCUtil()->params()), 00138 STRING_MEMBER(word_to_debug_lengths, "", 00139 "Lengths of unichars in word_to_debug", 00140 getCCUtil()->params()), 00141 INT_MEMBER(fragments_debug, 0, "Debug character fragments", 00142 getCCUtil()->params()), 00143 BOOL_MEMBER(segment_nonalphabetic_script, false, 00144 "Don't use any alphabetic-specific tricks." 00145 "Set to true in the traineddata config file for" 00146 " scripts that are cursive or inherently fixed-pitch", 00147 getCCUtil()->params()), 00148 BOOL_MEMBER(save_doc_words, 0, "Save Document Words", 00149 getCCUtil()->params()), 00150 double_MEMBER(doc_dict_pending_threshold, 0.0, 00151 "Worst certainty for using pending dictionary", 00152 getCCUtil()->params()), 00153 double_MEMBER(doc_dict_certainty_threshold, -2.25, 00154 "Worst certainty for words that can be inserted into the" 00155 "document dictionary", getCCUtil()->params()), 00156 INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different" 00157 " character choices to consider during permutation." 00158 " This limit is especially useful when user patterns" 00159 " are specified, since overly generic patterns can result in" 00160 " dawg search exploring an overly large number of options.", 00161 getCCUtil()->params()) { 00162 dang_ambigs_table_ = NULL; 00163 replace_ambigs_table_ = NULL; 00164 reject_offset_ = 0.0; 00165 go_deeper_fxn_ = NULL; 00166 hyphen_word_ = NULL; 00167 last_word_on_line_ = false; 00168 hyphen_unichar_id_ = INVALID_UNICHAR_ID; 00169 document_words_ = NULL; 00170 dawg_cache_ = NULL; 00171 dawg_cache_is_ours_ = false; 00172 pending_words_ = NULL; 00173 bigram_dawg_ = NULL; 00174 freq_dawg_ = NULL; 00175 punc_dawg_ = NULL; 00176 unambig_dawg_ = NULL; 00177 wordseg_rating_adjust_factor_ = -1.0f; 00178 output_ambig_words_file_ = NULL; 00179 } 00180 00181 Dict::~Dict() { 00182 if (hyphen_word_ != NULL) delete hyphen_word_; 00183 if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_); 00184 } 00185 00186 DawgCache *Dict::GlobalDawgCache() { 00187 // We dynamically allocate this global cache (a singleton) so it will outlive 00188 // every Tesseract instance (even those that someone else might declare as 00189 // global statics). 00190 static DawgCache *cache = new DawgCache(); // evil global singleton 00191 return cache; 00192 } 00193 00194 void Dict::Load(DawgCache *dawg_cache) { 00195 STRING name; 00196 STRING &lang = getCCUtil()->lang; 00197 00198 if (dawgs_.length() != 0) this->End(); 00199 00200 apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol); 00201 question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol); 00202 slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol); 00203 hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol); 00204 00205 if (dawg_cache != NULL) { 00206 dawg_cache_ = dawg_cache; 00207 dawg_cache_is_ours_ = false; 00208 } else { 00209 dawg_cache_ = new DawgCache(); 00210 dawg_cache_is_ours_ = true; 00211 } 00212 00213 TessdataManager &tessdata_manager = getCCUtil()->tessdata_manager; 00214 const char *data_file_name = tessdata_manager.GetDataFileName().string(); 00215 00216 // Load dawgs_. 00217 if (load_punc_dawg) { 00218 punc_dawg_ = dawg_cache_->GetSquishedDawg( 00219 lang, data_file_name, TESSDATA_PUNC_DAWG, dawg_debug_level); 00220 if (punc_dawg_) dawgs_ += punc_dawg_; 00221 } 00222 if (load_system_dawg) { 00223 Dawg *system_dawg = dawg_cache_->GetSquishedDawg( 00224 lang, data_file_name, TESSDATA_SYSTEM_DAWG, dawg_debug_level); 00225 if (system_dawg) dawgs_ += system_dawg; 00226 } 00227 if (load_number_dawg) { 00228 Dawg *number_dawg = dawg_cache_->GetSquishedDawg( 00229 lang, data_file_name, TESSDATA_NUMBER_DAWG, dawg_debug_level); 00230 if (number_dawg) dawgs_ += number_dawg; 00231 } 00232 if (load_bigram_dawg) { 00233 bigram_dawg_ = dawg_cache_->GetSquishedDawg( 00234 lang, data_file_name, TESSDATA_BIGRAM_DAWG, dawg_debug_level); 00235 } 00236 if (load_freq_dawg) { 00237 freq_dawg_ = dawg_cache_->GetSquishedDawg( 00238 lang, data_file_name, TESSDATA_FREQ_DAWG, dawg_debug_level); 00239 if (freq_dawg_) { dawgs_ += freq_dawg_; } 00240 } 00241 if (load_unambig_dawg) { 00242 unambig_dawg_ = dawg_cache_->GetSquishedDawg( 00243 lang, data_file_name, TESSDATA_UNAMBIG_DAWG, dawg_debug_level); 00244 if (unambig_dawg_) dawgs_ += unambig_dawg_; 00245 } 00246 00247 if (((STRING &)user_words_suffix).length() > 0 || 00248 ((STRING &)user_words_file).length() > 0) { 00249 Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, 00250 getUnicharset().size(), dawg_debug_level); 00251 if (((STRING &)user_words_file).length() > 0) { 00252 name = user_words_file; 00253 } else { 00254 name = getCCUtil()->language_data_path_prefix; 00255 name += user_words_suffix; 00256 } 00257 if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(), 00258 Trie::RRP_REVERSE_IF_HAS_RTL)) { 00259 tprintf("Error: failed to load %s\n", name.string()); 00260 delete trie_ptr; 00261 } else { 00262 dawgs_ += trie_ptr; 00263 } 00264 } 00265 00266 if (((STRING &)user_patterns_suffix).length() > 0 || 00267 ((STRING &)user_patterns_file).length() > 0) { 00268 Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, 00269 getUnicharset().size(), dawg_debug_level); 00270 trie_ptr->initialize_patterns(&(getUnicharset())); 00271 if (((STRING &)user_patterns_file).length() > 0) { 00272 name = user_patterns_file; 00273 } else { 00274 name = getCCUtil()->language_data_path_prefix; 00275 name += user_patterns_suffix; 00276 } 00277 if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) { 00278 tprintf("Error: failed to load %s\n", name.string()); 00279 delete trie_ptr; 00280 } else { 00281 dawgs_ += trie_ptr; 00282 } 00283 } 00284 00285 document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, 00286 getUnicharset().size(), dawg_debug_level); 00287 dawgs_ += document_words_; 00288 00289 // This dawg is temporary and should not be searched by letter_is_ok. 00290 pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM, 00291 getUnicharset().size(), dawg_debug_level); 00292 00293 // Construct a list of corresponding successors for each dawg. Each entry i 00294 // in the successors_ vector is a vector of integers that represent the 00295 // indices into the dawgs_ vector of the successors for dawg i. 00296 successors_.reserve(dawgs_.length()); 00297 for (int i = 0; i < dawgs_.length(); ++i) { 00298 const Dawg *dawg = dawgs_[i]; 00299 SuccessorList *lst = new SuccessorList(); 00300 for (int j = 0; j < dawgs_.length(); ++j) { 00301 const Dawg *other = dawgs_[j]; 00302 if (dawg != NULL && other != NULL && 00303 (dawg->lang() == other->lang()) && 00304 kDawgSuccessors[dawg->type()][other->type()]) *lst += j; 00305 } 00306 successors_ += lst; 00307 } 00308 } 00309 00310 void Dict::End() { 00311 if (dawgs_.length() == 0) 00312 return; // Not safe to call twice. 00313 for (int i = 0; i < dawgs_.size(); i++) { 00314 if (!dawg_cache_->FreeDawg(dawgs_[i])) { 00315 delete dawgs_[i]; 00316 } 00317 } 00318 dawg_cache_->FreeDawg(bigram_dawg_); 00319 if (dawg_cache_is_ours_) { 00320 delete dawg_cache_; 00321 dawg_cache_ = NULL; 00322 } 00323 successors_.delete_data_pointers(); 00324 dawgs_.clear(); 00325 successors_.clear(); 00326 document_words_ = NULL; 00327 if (pending_words_ != NULL) { 00328 delete pending_words_; 00329 pending_words_ = NULL; 00330 } 00331 } 00332 00333 // Returns true if in light of the current state unichar_id is allowed 00334 // according to at least one of the dawgs in the dawgs_ vector. 00335 // See more extensive comments in dict.h where this function is declared. 00336 int Dict::def_letter_is_okay(void* void_dawg_args, 00337 UNICHAR_ID unichar_id, 00338 bool word_end) const { 00339 DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args); 00340 00341 if (dawg_debug_level >= 3) { 00342 tprintf("def_letter_is_okay: current unichar=%s word_end=%d" 00343 " num active dawgs=%d\n", 00344 getUnicharset().debug_str(unichar_id).string(), word_end, 00345 dawg_args->active_dawgs->length()); 00346 } 00347 00348 // Do not accept words that contain kPatternUnicharID. 00349 // (otherwise pattern dawgs would not function correctly). 00350 // Do not accept words containing INVALID_UNICHAR_IDs. 00351 if (unichar_id == Dawg::kPatternUnicharID || 00352 unichar_id == INVALID_UNICHAR_ID) { 00353 dawg_args->permuter = NO_PERM; 00354 return NO_PERM; 00355 } 00356 00357 // Initialization. 00358 PermuterType curr_perm = NO_PERM; 00359 dawg_args->updated_dawgs->clear(); 00360 00361 // Go over the active_dawgs vector and insert DawgPosition records 00362 // with the updated ref (an edge with the corresponding unichar id) into 00363 // dawg_args->updated_pos. 00364 for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) { 00365 const DawgPosition &pos = (*dawg_args->active_dawgs)[a]; 00366 const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : NULL; 00367 const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : NULL; 00368 00369 if (!dawg && !punc_dawg) { 00370 // shouldn't happen. 00371 tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n"); 00372 continue; 00373 } 00374 if (!dawg) { 00375 // We're in the punctuation dawg. A core dawg has not been chosen. 00376 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref); 00377 EDGE_REF punc_transition_edge = punc_dawg->edge_char_of( 00378 punc_node, Dawg::kPatternUnicharID, word_end); 00379 if (punc_transition_edge != NO_EDGE) { 00380 // Find all successors, and see which can transition. 00381 const SuccessorList &slist = *(successors_[pos.punc_index]); 00382 for (int s = 0; s < slist.length(); ++s) { 00383 int sdawg_index = slist[s]; 00384 const Dawg *sdawg = dawgs_[sdawg_index]; 00385 UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg); 00386 EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end); 00387 if (dawg_edge != NO_EDGE) { 00388 if (dawg_debug_level >=3) { 00389 tprintf("Letter found in dawg %d\n", sdawg_index); 00390 } 00391 dawg_args->updated_dawgs->add_unique( 00392 DawgPosition(sdawg_index, dawg_edge, 00393 pos.punc_index, punc_transition_edge, false), 00394 dawg_debug_level > 0, 00395 "Append transition from punc dawg to current dawgs: "); 00396 if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter(); 00397 } 00398 } 00399 } 00400 EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, 00401 word_end); 00402 if (punc_edge != NO_EDGE) { 00403 if (dawg_debug_level >=3) { 00404 tprintf("Letter found in punctuation dawg\n"); 00405 } 00406 dawg_args->updated_dawgs->add_unique( 00407 DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), 00408 dawg_debug_level > 0, 00409 "Extend punctuation dawg: "); 00410 if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM; 00411 } 00412 continue; 00413 } 00414 00415 if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) { 00416 // We can end the main word here. 00417 // If we can continue on the punc ref, add that possibility. 00418 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref); 00419 EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE 00420 : punc_dawg->edge_char_of(punc_node, unichar_id, word_end); 00421 if (punc_edge != NO_EDGE) { 00422 dawg_args->updated_dawgs->add_unique( 00423 DawgPosition(pos.dawg_index, pos.dawg_ref, 00424 pos.punc_index, punc_edge, true), 00425 dawg_debug_level > 0, 00426 "Return to punctuation dawg: "); 00427 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); 00428 } 00429 } 00430 00431 if (pos.back_to_punc) continue; 00432 00433 // If we are dealing with the pattern dawg, look up all the 00434 // possible edges, not only for the exact unichar_id, but also 00435 // for all its character classes (alpha, digit, etc). 00436 if (dawg->type() == DAWG_TYPE_PATTERN) { 00437 ProcessPatternEdges(dawg, pos, unichar_id, word_end, 00438 dawg_args->updated_dawgs, &curr_perm); 00439 // There can't be any successors to dawg that is of type 00440 // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition. 00441 continue; 00442 } 00443 00444 // Find the edge out of the node for the unichar_id. 00445 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); 00446 EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE 00447 : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end); 00448 00449 if (dawg_debug_level >= 3) { 00450 tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", 00451 pos.dawg_index, node, edge); 00452 } 00453 00454 if (edge != NO_EDGE) { // the unichar was found in the current dawg 00455 if (dawg_debug_level >=3) { 00456 tprintf("Letter found in dawg %d\n", pos.dawg_index); 00457 } 00458 if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) { 00459 if (dawg_debug_level >= 3) { 00460 tprintf("Punctuation constraint not satisfied at end of word.\n"); 00461 } 00462 continue; 00463 } 00464 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); 00465 dawg_args->updated_dawgs->add_unique( 00466 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, 00467 false), 00468 dawg_debug_level > 0, 00469 "Append current dawg to updated active dawgs: "); 00470 } 00471 } // end for 00472 // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM 00473 // or if we found the current letter in a non-punctuation dawg. This 00474 // allows preserving information on which dawg the "core" word came from. 00475 // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM. 00476 if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM || 00477 (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) { 00478 dawg_args->permuter = curr_perm; 00479 } 00480 if (dawg_debug_level >= 2) { 00481 tprintf("Returning %d for permuter code for this character.\n"); 00482 } 00483 return dawg_args->permuter; 00484 } 00485 00486 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, 00487 UNICHAR_ID unichar_id, bool word_end, 00488 DawgPositionVector *updated_dawgs, 00489 PermuterType *curr_perm) const { 00490 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); 00491 // Try to find the edge corresponding to the exact unichar_id and to all the 00492 // edges corresponding to the character class of unichar_id. 00493 GenericVector<UNICHAR_ID> unichar_id_patterns; 00494 unichar_id_patterns.push_back(unichar_id); 00495 dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), 00496 &unichar_id_patterns); 00497 for (int i = 0; i < unichar_id_patterns.size(); ++i) { 00498 // On the first iteration check all the outgoing edges. 00499 // On the second iteration check all self-loops. 00500 for (int k = 0; k < 2; ++k) { 00501 EDGE_REF edge = (k == 0) 00502 ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end) 00503 : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end); 00504 if (edge == NO_EDGE) continue; 00505 if (dawg_debug_level >= 3) { 00506 tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", 00507 pos.dawg_index, node, edge); 00508 tprintf("Letter found in pattern dawg %d\n", pos.dawg_index); 00509 } 00510 if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter(); 00511 updated_dawgs->add_unique( 00512 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, 00513 pos.back_to_punc), 00514 dawg_debug_level > 0, 00515 "Append current dawg to updated active dawgs: "); 00516 } 00517 } 00518 } 00519 00520 // Fill the given active_dawgs vector with dawgs that could contain the 00521 // beginning of the word. If hyphenated() returns true, copy the entries 00522 // from hyphen_active_dawgs_ instead. 00523 void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, 00524 bool ambigs_mode) const { 00525 int i; 00526 if (hyphenated()) { 00527 *active_dawgs = hyphen_active_dawgs_; 00528 if (dawg_debug_level >= 3) { 00529 for (i = 0; i < hyphen_active_dawgs_.size(); ++i) { 00530 tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n", 00531 hyphen_active_dawgs_[i].dawg_index, 00532 hyphen_active_dawgs_[i].dawg_ref); 00533 } 00534 } 00535 } else { 00536 default_dawgs(active_dawgs, ambigs_mode); 00537 } 00538 } 00539 00540 void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, 00541 bool suppress_patterns) const { 00542 bool punc_dawg_available = 00543 (punc_dawg_ != NULL) && 00544 punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE; 00545 00546 for (int i = 0; i < dawgs_.length(); i++) { 00547 if (dawgs_[i] != NULL && 00548 !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) { 00549 int dawg_ty = dawgs_[i]->type(); 00550 bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty]; 00551 if (dawg_ty == DAWG_TYPE_PUNCTUATION) { 00552 *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false); 00553 if (dawg_debug_level >= 3) { 00554 tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, 00555 NO_EDGE); 00556 } 00557 } else if (!punc_dawg_available || !subsumed_by_punc) { 00558 *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false); 00559 if (dawg_debug_level >= 3) { 00560 tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); 00561 } 00562 } 00563 } 00564 } 00565 } 00566 00567 void Dict::add_document_word(const WERD_CHOICE &best_choice) { 00568 // Do not add hyphenated word parts to the document dawg. 00569 // hyphen_word_ will be non-NULL after the set_hyphen_word() is 00570 // called when the first part of the hyphenated word is 00571 // discovered and while the second part of the word is recognized. 00572 // hyphen_word_ is cleared in cc_recg() before the next word on 00573 // the line is recognized. 00574 if (hyphen_word_) return; 00575 00576 char filename[CHARS_PER_LINE]; 00577 FILE *doc_word_file; 00578 int stringlen = best_choice.length(); 00579 00580 if (valid_word(best_choice) || stringlen < 2) 00581 return; 00582 00583 // Discard words that contain >= kDocDictMaxRepChars repeating unichars. 00584 if (best_choice.length() >= kDocDictMaxRepChars) { 00585 int num_rep_chars = 1; 00586 UNICHAR_ID uch_id = best_choice.unichar_id(0); 00587 for (int i = 1; i < best_choice.length(); ++i) { 00588 if (best_choice.unichar_id(i) != uch_id) { 00589 num_rep_chars = 1; 00590 uch_id = best_choice.unichar_id(i); 00591 } else { 00592 ++num_rep_chars; 00593 if (num_rep_chars == kDocDictMaxRepChars) return; 00594 } 00595 } 00596 } 00597 00598 if (best_choice.certainty() < doc_dict_certainty_threshold || 00599 stringlen == 2) { 00600 if (best_choice.certainty() < doc_dict_pending_threshold) 00601 return; 00602 00603 if (!pending_words_->word_in_dawg(best_choice)) { 00604 if (stringlen > 2 || 00605 (stringlen == 2 && 00606 getUnicharset().get_isupper(best_choice.unichar_id(0)) && 00607 getUnicharset().get_isupper(best_choice.unichar_id(1)))) { 00608 pending_words_->add_word_to_dawg(best_choice); 00609 } 00610 return; 00611 } 00612 } 00613 00614 if (save_doc_words) { 00615 strcpy(filename, getCCUtil()->imagefile.string()); 00616 strcat(filename, ".doc"); 00617 doc_word_file = open_file (filename, "a"); 00618 fprintf(doc_word_file, "%s\n", 00619 best_choice.debug_string().string()); 00620 fclose(doc_word_file); 00621 } 00622 document_words_->add_word_to_dawg(best_choice); 00623 } 00624 00625 void Dict::adjust_word(WERD_CHOICE *word, 00626 bool nonword, 00627 XHeightConsistencyEnum xheight_consistency, 00628 float additional_adjust, 00629 bool modify_rating, 00630 bool debug) { 00631 bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() && 00632 word->GetTopScriptID() == getUnicharset().han_sid()); 00633 bool case_is_ok = (is_han || case_ok(*word, getUnicharset())); 00634 bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word)); 00635 00636 float adjust_factor = additional_adjust; 00637 float new_rating = word->rating(); 00638 new_rating += kRatingPad; 00639 const char *xheight_triggered = ""; 00640 if (word->length() > 1) { 00641 // Calculate x-height and y-offset consistency penalties. 00642 switch (xheight_consistency) { 00643 case XH_INCONSISTENT: 00644 adjust_factor += xheight_penalty_inconsistent; 00645 xheight_triggered = ", xhtBAD"; 00646 break; 00647 case XH_SUBNORMAL: 00648 adjust_factor += xheight_penalty_subscripts; 00649 xheight_triggered = ", xhtSUB"; 00650 break; 00651 case XH_GOOD: 00652 // leave the factor alone - all good! 00653 break; 00654 } 00655 // TODO(eger): if nonword is true, but there is a "core" thats' a dict 00656 // word, negate nonword status. 00657 } else { 00658 if (debug) { 00659 tprintf("Consistency could not be calculated.\n"); 00660 } 00661 } 00662 if (debug) { 00663 tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", 00664 word->unichar_string().string(), word->rating(), 00665 xheight_triggered); 00666 } 00667 00668 if (nonword) { // non-dictionary word 00669 if (case_is_ok && punc_is_ok) { 00670 adjust_factor += segment_penalty_dict_nonword; 00671 new_rating *= adjust_factor; 00672 if (debug) tprintf(", W"); 00673 } else { 00674 adjust_factor += segment_penalty_garbage; 00675 new_rating *= adjust_factor; 00676 if (debug) { 00677 if (!case_is_ok) tprintf(", C"); 00678 if (!punc_is_ok) tprintf(", P"); 00679 } 00680 } 00681 } else { // dictionary word 00682 if (case_is_ok) { 00683 if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) { 00684 word->set_permuter(FREQ_DAWG_PERM); 00685 adjust_factor += segment_penalty_dict_frequent_word; 00686 new_rating *= adjust_factor; 00687 if (debug) tprintf(", F"); 00688 } else { 00689 adjust_factor += segment_penalty_dict_case_ok; 00690 new_rating *= adjust_factor; 00691 if (debug) tprintf(", "); 00692 } 00693 } else { 00694 adjust_factor += segment_penalty_dict_case_bad; 00695 new_rating *= adjust_factor; 00696 if (debug) tprintf(", C"); 00697 } 00698 } 00699 new_rating -= kRatingPad; 00700 if (modify_rating) word->set_rating(new_rating); 00701 if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating); 00702 word->set_adjust_factor(adjust_factor); 00703 } 00704 00705 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const { 00706 const WERD_CHOICE *word_ptr = &word; 00707 WERD_CHOICE temp_word(word.unicharset()); 00708 if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) { 00709 copy_hyphen_info(&temp_word); 00710 temp_word += word; 00711 word_ptr = &temp_word; 00712 } 00713 if (word_ptr->length() == 0) return NO_PERM; 00714 // Allocate vectors for holding current and updated 00715 // active_dawgs and initialize them. 00716 DawgPositionVector *active_dawgs = new DawgPositionVector[2]; 00717 init_active_dawgs(&(active_dawgs[0]), false); 00718 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); 00719 int last_index = word_ptr->length() - 1; 00720 // Call leter_is_okay for each letter in the word. 00721 for (int i = hyphen_base_size(); i <= last_index; ++i) { 00722 if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i), 00723 i == last_index))) break; 00724 // Swap active_dawgs, constraints with the corresponding updated vector. 00725 if (dawg_args.updated_dawgs == &(active_dawgs[1])) { 00726 dawg_args.updated_dawgs = &(active_dawgs[0]); 00727 ++(dawg_args.active_dawgs); 00728 } else { 00729 ++(dawg_args.updated_dawgs); 00730 dawg_args.active_dawgs = &(active_dawgs[0]); 00731 } 00732 } 00733 delete[] active_dawgs; 00734 return valid_word_permuter(dawg_args.permuter, numbers_ok) ? 00735 dawg_args.permuter : NO_PERM; 00736 } 00737 00738 bool Dict::valid_bigram(const WERD_CHOICE &word1, 00739 const WERD_CHOICE &word2) const { 00740 if (bigram_dawg_ == NULL) return false; 00741 00742 // Extract the core word from the middle of each word with any digits 00743 // replaced with question marks. 00744 int w1start, w1end, w2start, w2end; 00745 word1.punct_stripped(&w1start, &w1end); 00746 word2.punct_stripped(&w2start, &w2end); 00747 00748 // We don't want to penalize a single guillemet, hyphen, etc. 00749 // But our bigram list doesn't have any information about punctuation. 00750 if (w1start >= w1end) return word1.length() < 3; 00751 if (w2start >= w2end) return word2.length() < 3; 00752 00753 const UNICHARSET& uchset = getUnicharset(); 00754 GenericVector<UNICHAR_ID> bigram_string; 00755 bigram_string.reserve(w1end + w2end + 1); 00756 for (int i = w1start; i < w1end; i++) { 00757 const GenericVector<UNICHAR_ID>& normed_ids = 00758 getUnicharset().normed_ids(word1.unichar_id(i)); 00759 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) 00760 bigram_string.push_back(question_unichar_id_); 00761 else 00762 bigram_string += normed_ids; 00763 } 00764 bigram_string.push_back(UNICHAR_SPACE); 00765 for (int i = w2start; i < w2end; i++) { 00766 const GenericVector<UNICHAR_ID>& normed_ids = 00767 getUnicharset().normed_ids(word2.unichar_id(i)); 00768 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) 00769 bigram_string.push_back(question_unichar_id_); 00770 else 00771 bigram_string += normed_ids; 00772 } 00773 WERD_CHOICE normalized_word(&uchset, bigram_string.size()); 00774 for (int i = 0; i < bigram_string.size(); ++i) { 00775 normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 00776 0.0f, 0.0f); 00777 } 00778 return bigram_dawg_->word_in_dawg(normalized_word); 00779 } 00780 00781 bool Dict::valid_punctuation(const WERD_CHOICE &word) { 00782 if (word.length() == 0) return NO_PERM; 00783 int i; 00784 WERD_CHOICE new_word(word.unicharset()); 00785 int last_index = word.length() - 1; 00786 int new_len = 0; 00787 for (i = 0; i <= last_index; ++i) { 00788 UNICHAR_ID unichar_id = (word.unichar_id(i)); 00789 if (getUnicharset().get_ispunctuation(unichar_id)) { 00790 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0); 00791 } else if (!getUnicharset().get_isalpha(unichar_id) && 00792 !getUnicharset().get_isdigit(unichar_id)) { 00793 return false; // neither punc, nor alpha, nor digit 00794 } else if ((new_len = new_word.length()) == 0 || 00795 new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) { 00796 new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0); 00797 } 00798 } 00799 for (i = 0; i < dawgs_.size(); ++i) { 00800 if (dawgs_[i] != NULL && 00801 dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION && 00802 dawgs_[i]->word_in_dawg(new_word)) return true; 00803 } 00804 return false; 00805 } 00806 00807 00808 } // namespace tesseract