tesseract 3.04.01

dict/dict.cpp

Go to the documentation of this file.
00001 
00002 // File:        dict.cpp
00003 // Description: dict class.
00004 // Author:      Samuel Charron
00005 //
00006 // (C) Copyright 2006, Google Inc.
00007 // Licensed under the Apache License, Version 2.0 (the "License");
00008 // you may not use this file except in compliance with the License.
00009 // You may obtain a copy of the License at
00010 // http://www.apache.org/licenses/LICENSE-2.0
00011 // Unless required by applicable law or agreed to in writing, software
00012 // distributed under the License is distributed on an "AS IS" BASIS,
00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 // See the License for the specific language governing permissions and
00015 // limitations under the License.
00016 //
00018 
00019 #include <stdio.h>
00020 
00021 #include "dict.h"
00022 #include "unicodes.h"
00023 
00024 #ifdef _MSC_VER
00025 #pragma warning(disable:4244)  // Conversion warnings
00026 #endif
00027 #include "tprintf.h"
00028 
00029 namespace tesseract {
00030 
00031 class Image;
00032 
00033 Dict::Dict(CCUtil* ccutil)
00034     : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
00035       probability_in_context_(&tesseract::Dict::def_probability_in_context),
00036       params_model_classify_(NULL),
00037       ccutil_(ccutil),
00038       STRING_MEMBER(user_words_file, "",
00039                     "A filename of user-provided words.",
00040                     getCCUtil()->params()),
00041       STRING_INIT_MEMBER(user_words_suffix, "",
00042                          "A suffix of user-provided words located in tessdata.",
00043                          getCCUtil()->params()),
00044       STRING_MEMBER(user_patterns_file, "",
00045                     "A filename of user-provided patterns.",
00046                     getCCUtil()->params()),
00047       STRING_INIT_MEMBER(user_patterns_suffix, "",
00048                          "A suffix of user-provided patterns located in "
00049                          "tessdata.",
00050                          getCCUtil()->params()),
00051       BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
00052                        getCCUtil()->params()),
00053       BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
00054                        getCCUtil()->params()),
00055       BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
00056                        getCCUtil()->params()),
00057       BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
00058                        " patterns.", getCCUtil()->params()),
00059       BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
00060                        " patterns.", getCCUtil()->params()),
00061       BOOL_INIT_MEMBER(load_bigram_dawg, true, "Load dawg with special word "
00062                        "bigrams.", getCCUtil()->params()),
00063       double_MEMBER(xheight_penalty_subscripts, 0.125,
00064                     "Score penalty (0.1 = 10%) added if there are subscripts "
00065                     "or superscripts in a word, but it is otherwise OK.",
00066                     getCCUtil()->params()),
00067       double_MEMBER(xheight_penalty_inconsistent, 0.25,
00068                     "Score penalty (0.1 = 10%) added if an xheight is "
00069                     "inconsistent.", getCCUtil()->params()),
00070       double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
00071                     "Score multiplier for word matches which have good case and"
00072                     "are frequent in the given language (lower is better).",
00073                     getCCUtil()->params()),
00074       double_MEMBER(segment_penalty_dict_case_ok, 1.1,
00075                     "Score multiplier for word matches that have good case "
00076                     "(lower is better).", getCCUtil()->params()),
00077       double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
00078                     "Default score multiplier for word matches, which may have "
00079                     "case issues (lower is better).",
00080                     getCCUtil()->params()),
00081       double_MEMBER(segment_penalty_ngram_best_choice, 1.24,
00082                    "Multipler to for the best choice from the ngram model.",
00083                    getCCUtil()->params()),
00084       double_MEMBER(segment_penalty_dict_nonword, 1.25,
00085                     "Score multiplier for glyph fragment segmentations which "
00086                     "do not match a dictionary word (lower is better).",
00087                     getCCUtil()->params()),
00088       double_MEMBER(segment_penalty_garbage, 1.50,
00089                     "Score multiplier for poorly cased strings that are not in"
00090                     " the dictionary and generally look like garbage (lower is"
00091                     " better).", getCCUtil()->params()),
00092       STRING_MEMBER(output_ambig_words_file, "",
00093                     "Output file for ambiguities found in the dictionary",
00094                     getCCUtil()->params()),
00095       INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
00096                  ", to 2 for more details, to 3 to see all the debug messages",
00097                  getCCUtil()->params()),
00098       INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
00099                  getCCUtil()->params()),
00100       INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
00101                  getCCUtil()->params()),
00102       BOOL_MEMBER(use_only_first_uft8_step, false,
00103                   "Use only the first UTF8 step of the given string"
00104                   " when computing log probabilities.",
00105                   getCCUtil()->params()),
00106       double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
00107                     getCCUtil()->params()),
00108       double_MEMBER(stopper_nondict_certainty_base, -2.50,
00109                     "Certainty threshold for non-dict words",
00110                     getCCUtil()->params()),
00111       double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
00112                     "Reject certainty offset",
00113                     getCCUtil()->params()),
00114       INT_MEMBER(stopper_smallword_size, 2,
00115                  "Size of dict word to be treated as non-dict word",
00116                  getCCUtil()->params()),
00117       double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add"
00118                     " for each dict char above small word size.",
00119                     getCCUtil()->params()),
00120       double_MEMBER(stopper_allowable_character_badness, 3.0,
00121                     "Max certaintly variation allowed in a word (in sigma)",
00122                     getCCUtil()->params()),
00123       INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
00124                  getCCUtil()->params()),
00125       BOOL_MEMBER(stopper_no_acceptable_choices, false,
00126                   "Make AcceptableChoice() always return false. Useful"
00127                   " when there is a need to explore all segmentations",
00128                   getCCUtil()->params()),
00129       BOOL_MEMBER(save_raw_choices, false,
00130                   "Deprecated- backward compatibility only",
00131                   getCCUtil()->params()),
00132       INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
00133                  "Max words to keep in list",
00134                  getCCUtil()->params()),
00135       STRING_MEMBER(word_to_debug, "", "Word for which stopper debug"
00136                     " information should be printed to stdout",
00137                     getCCUtil()->params()),
00138       STRING_MEMBER(word_to_debug_lengths, "",
00139                     "Lengths of unichars in word_to_debug",
00140                     getCCUtil()->params()),
00141       INT_MEMBER(fragments_debug, 0, "Debug character fragments",
00142                  getCCUtil()->params()),
00143       BOOL_MEMBER(segment_nonalphabetic_script, false,
00144                  "Don't use any alphabetic-specific tricks."
00145                  "Set to true in the traineddata config file for"
00146                  " scripts that are cursive or inherently fixed-pitch",
00147                  getCCUtil()->params()),
00148       BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
00149                   getCCUtil()->params()),
00150       double_MEMBER(doc_dict_pending_threshold, 0.0,
00151                     "Worst certainty for using pending dictionary",
00152                     getCCUtil()->params()),
00153       double_MEMBER(doc_dict_certainty_threshold, -2.25,
00154                     "Worst certainty for words that can be inserted into the"
00155                     "document dictionary", getCCUtil()->params()),
00156       INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different"
00157                  " character choices to consider during permutation."
00158                  " This limit is especially useful when user patterns"
00159                  " are specified, since overly generic patterns can result in"
00160                  " dawg search exploring an overly large number of options.",
00161                  getCCUtil()->params()) {
00162   dang_ambigs_table_ = NULL;
00163   replace_ambigs_table_ = NULL;
00164   reject_offset_ = 0.0;
00165   go_deeper_fxn_ = NULL;
00166   hyphen_word_ = NULL;
00167   last_word_on_line_ = false;
00168   hyphen_unichar_id_ = INVALID_UNICHAR_ID;
00169   document_words_ = NULL;
00170   dawg_cache_ = NULL;
00171   dawg_cache_is_ours_ = false;
00172   pending_words_ = NULL;
00173   bigram_dawg_ = NULL;
00174   freq_dawg_ = NULL;
00175   punc_dawg_ = NULL;
00176   unambig_dawg_ = NULL;
00177   wordseg_rating_adjust_factor_ = -1.0f;
00178   output_ambig_words_file_ = NULL;
00179 }
00180 
00181 Dict::~Dict() {
00182   if (hyphen_word_ != NULL) delete hyphen_word_;
00183   if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_);
00184 }
00185 
00186 DawgCache *Dict::GlobalDawgCache() {
00187   // We dynamically allocate this global cache (a singleton) so it will outlive
00188   // every Tesseract instance (even those that someone else might declare as
00189   // global statics).
00190   static DawgCache *cache = new DawgCache();  // evil global singleton
00191   return cache;
00192 }
00193 
00194 void Dict::Load(DawgCache *dawg_cache) {
00195   STRING name;
00196   STRING &lang = getCCUtil()->lang;
00197 
00198   if (dawgs_.length() != 0) this->End();
00199 
00200   apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
00201   question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
00202   slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
00203   hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
00204 
00205   if (dawg_cache != NULL) {
00206     dawg_cache_ = dawg_cache;
00207     dawg_cache_is_ours_ = false;
00208   } else {
00209     dawg_cache_ = new DawgCache();
00210     dawg_cache_is_ours_ = true;
00211   }
00212 
00213   TessdataManager &tessdata_manager = getCCUtil()->tessdata_manager;
00214   const char *data_file_name = tessdata_manager.GetDataFileName().string();
00215 
00216   // Load dawgs_.
00217   if (load_punc_dawg) {
00218     punc_dawg_ = dawg_cache_->GetSquishedDawg(
00219         lang, data_file_name, TESSDATA_PUNC_DAWG, dawg_debug_level);
00220     if (punc_dawg_) dawgs_ += punc_dawg_;
00221   }
00222   if (load_system_dawg) {
00223     Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
00224         lang, data_file_name, TESSDATA_SYSTEM_DAWG, dawg_debug_level);
00225     if (system_dawg) dawgs_ += system_dawg;
00226   }
00227   if (load_number_dawg) {
00228     Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
00229         lang, data_file_name, TESSDATA_NUMBER_DAWG, dawg_debug_level);
00230     if (number_dawg) dawgs_ += number_dawg;
00231   }
00232   if (load_bigram_dawg) {
00233     bigram_dawg_ = dawg_cache_->GetSquishedDawg(
00234         lang, data_file_name, TESSDATA_BIGRAM_DAWG, dawg_debug_level);
00235   }
00236   if (load_freq_dawg) {
00237     freq_dawg_ = dawg_cache_->GetSquishedDawg(
00238         lang, data_file_name, TESSDATA_FREQ_DAWG, dawg_debug_level);
00239     if (freq_dawg_) { dawgs_ += freq_dawg_; }
00240   }
00241   if (load_unambig_dawg) {
00242     unambig_dawg_ = dawg_cache_->GetSquishedDawg(
00243         lang, data_file_name, TESSDATA_UNAMBIG_DAWG, dawg_debug_level);
00244     if (unambig_dawg_) dawgs_ += unambig_dawg_;
00245   }
00246 
00247   if (((STRING &)user_words_suffix).length() > 0 ||
00248       ((STRING &)user_words_file).length() > 0) {
00249     Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
00250                               getUnicharset().size(), dawg_debug_level);
00251     if (((STRING &)user_words_file).length() > 0) {
00252         name = user_words_file;
00253     } else {
00254         name = getCCUtil()->language_data_path_prefix;
00255         name += user_words_suffix;
00256     }
00257     if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
00258                                           Trie::RRP_REVERSE_IF_HAS_RTL)) {
00259       tprintf("Error: failed to load %s\n", name.string());
00260       delete trie_ptr;
00261     } else {
00262       dawgs_ += trie_ptr;
00263     }
00264   }
00265 
00266   if (((STRING &)user_patterns_suffix).length() > 0 ||
00267       ((STRING &)user_patterns_file).length() > 0) {
00268     Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
00269                               getUnicharset().size(), dawg_debug_level);
00270     trie_ptr->initialize_patterns(&(getUnicharset()));
00271     if (((STRING &)user_patterns_file).length() > 0) {
00272         name = user_patterns_file;
00273     } else {
00274         name = getCCUtil()->language_data_path_prefix;
00275         name += user_patterns_suffix;
00276     }
00277     if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
00278       tprintf("Error: failed to load %s\n", name.string());
00279       delete trie_ptr;
00280     } else {
00281       dawgs_ += trie_ptr;
00282     }
00283   }
00284 
00285   document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
00286                              getUnicharset().size(), dawg_debug_level);
00287   dawgs_ += document_words_;
00288 
00289   // This dawg is temporary and should not be searched by letter_is_ok.
00290   pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
00291                             getUnicharset().size(), dawg_debug_level);
00292 
00293   // Construct a list of corresponding successors for each dawg. Each entry i
00294   // in the successors_ vector is a vector of integers that represent the
00295   // indices into the dawgs_ vector of the successors for dawg i.
00296   successors_.reserve(dawgs_.length());
00297   for (int i = 0; i < dawgs_.length(); ++i) {
00298     const Dawg *dawg = dawgs_[i];
00299     SuccessorList *lst = new SuccessorList();
00300     for (int j = 0; j < dawgs_.length(); ++j) {
00301       const Dawg *other = dawgs_[j];
00302       if (dawg != NULL && other != NULL &&
00303           (dawg->lang() == other->lang()) &&
00304           kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
00305     }
00306     successors_ += lst;
00307   }
00308 }
00309 
00310 void Dict::End() {
00311   if (dawgs_.length() == 0)
00312     return;  // Not safe to call twice.
00313   for (int i = 0; i < dawgs_.size(); i++) {
00314     if (!dawg_cache_->FreeDawg(dawgs_[i])) {
00315       delete dawgs_[i];
00316     }
00317   }
00318   dawg_cache_->FreeDawg(bigram_dawg_);
00319   if (dawg_cache_is_ours_) {
00320     delete dawg_cache_;
00321     dawg_cache_ = NULL;
00322   }
00323   successors_.delete_data_pointers();
00324   dawgs_.clear();
00325   successors_.clear();
00326   document_words_ = NULL;
00327   if (pending_words_ != NULL) {
00328     delete pending_words_;
00329     pending_words_ = NULL;
00330   }
00331 }
00332 
00333 // Returns true if in light of the current state unichar_id is allowed
00334 // according to at least one of the dawgs in the dawgs_ vector.
00335 // See more extensive comments in dict.h where this function is declared.
00336 int Dict::def_letter_is_okay(void* void_dawg_args,
00337                              UNICHAR_ID unichar_id,
00338                              bool word_end) const {
00339   DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
00340 
00341   if (dawg_debug_level >= 3) {
00342     tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
00343             " num active dawgs=%d\n",
00344             getUnicharset().debug_str(unichar_id).string(), word_end,
00345             dawg_args->active_dawgs->length());
00346   }
00347 
00348   // Do not accept words that contain kPatternUnicharID.
00349   // (otherwise pattern dawgs would not function correctly).
00350   // Do not accept words containing INVALID_UNICHAR_IDs.
00351   if (unichar_id == Dawg::kPatternUnicharID ||
00352       unichar_id == INVALID_UNICHAR_ID) {
00353     dawg_args->permuter = NO_PERM;
00354     return NO_PERM;
00355   }
00356 
00357   // Initialization.
00358   PermuterType curr_perm = NO_PERM;
00359   dawg_args->updated_dawgs->clear();
00360 
00361   // Go over the active_dawgs vector and insert DawgPosition records
00362   // with the updated ref (an edge with the corresponding unichar id) into
00363   // dawg_args->updated_pos.
00364   for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
00365     const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
00366     const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : NULL;
00367     const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : NULL;
00368 
00369     if (!dawg && !punc_dawg) {
00370       // shouldn't happen.
00371       tprintf("Received DawgPosition with no dawg or punc_dawg.  wth?\n");
00372       continue;
00373     }
00374     if (!dawg) {
00375       // We're in the punctuation dawg.  A core dawg has not been chosen.
00376       NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
00377       EDGE_REF punc_transition_edge = punc_dawg->edge_char_of(
00378           punc_node, Dawg::kPatternUnicharID, word_end);
00379       if (punc_transition_edge != NO_EDGE) {
00380         // Find all successors, and see which can transition.
00381         const SuccessorList &slist = *(successors_[pos.punc_index]);
00382         for (int s = 0; s < slist.length(); ++s) {
00383           int sdawg_index = slist[s];
00384           const Dawg *sdawg = dawgs_[sdawg_index];
00385           UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
00386           EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
00387           if (dawg_edge != NO_EDGE) {
00388             if (dawg_debug_level >=3) {
00389               tprintf("Letter found in dawg %d\n", sdawg_index);
00390             }
00391             dawg_args->updated_dawgs->add_unique(
00392                 DawgPosition(sdawg_index, dawg_edge,
00393                              pos.punc_index, punc_transition_edge, false),
00394                 dawg_debug_level > 0,
00395                 "Append transition from punc dawg to current dawgs: ");
00396             if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
00397           }
00398         }
00399       }
00400       EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id,
00401                                                    word_end);
00402       if (punc_edge != NO_EDGE) {
00403         if (dawg_debug_level >=3) {
00404           tprintf("Letter found in punctuation dawg\n");
00405         }
00406         dawg_args->updated_dawgs->add_unique(
00407             DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
00408             dawg_debug_level > 0,
00409             "Extend punctuation dawg: ");
00410         if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
00411       }
00412       continue;
00413     }
00414 
00415     if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
00416       // We can end the main word here.
00417       //  If we can continue on the punc ref, add that possibility.
00418       NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
00419       EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
00420           : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
00421       if (punc_edge != NO_EDGE) {
00422         dawg_args->updated_dawgs->add_unique(
00423             DawgPosition(pos.dawg_index, pos.dawg_ref,
00424                          pos.punc_index, punc_edge, true),
00425             dawg_debug_level > 0,
00426             "Return to punctuation dawg: ");
00427         if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
00428       }
00429     }
00430 
00431     if (pos.back_to_punc) continue;
00432 
00433     // If we are dealing with the pattern dawg, look up all the
00434     // possible edges, not only for the exact unichar_id, but also
00435     // for all its character classes (alpha, digit, etc).
00436     if (dawg->type() == DAWG_TYPE_PATTERN) {
00437       ProcessPatternEdges(dawg, pos, unichar_id, word_end,
00438                           dawg_args->updated_dawgs, &curr_perm);
00439       // There can't be any successors to dawg that is of type
00440       // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
00441       continue;
00442     }
00443 
00444     // Find the edge out of the node for the unichar_id.
00445     NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
00446     EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
00447         : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);
00448 
00449     if (dawg_debug_level >= 3) {
00450       tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
00451               pos.dawg_index, node, edge);
00452     }
00453 
00454     if (edge != NO_EDGE) {  // the unichar was found in the current dawg
00455       if (dawg_debug_level >=3) {
00456         tprintf("Letter found in dawg %d\n", pos.dawg_index);
00457       }
00458       if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
00459         if (dawg_debug_level >= 3) {
00460           tprintf("Punctuation constraint not satisfied at end of word.\n");
00461         }
00462         continue;
00463       }
00464       if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
00465       dawg_args->updated_dawgs->add_unique(
00466           DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
00467                        false),
00468           dawg_debug_level > 0,
00469           "Append current dawg to updated active dawgs: ");
00470     }
00471   }  // end for
00472   // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
00473   // or if we found the current letter in a non-punctuation dawg. This
00474   // allows preserving information on which dawg the "core" word came from.
00475   // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
00476   if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
00477       (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
00478     dawg_args->permuter = curr_perm;
00479   }
00480   if (dawg_debug_level >= 2) {
00481     tprintf("Returning %d for permuter code for this character.\n");
00482   }
00483   return dawg_args->permuter;
00484 }
00485 
00486 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos,
00487                                UNICHAR_ID unichar_id, bool word_end,
00488                                DawgPositionVector *updated_dawgs,
00489                                PermuterType *curr_perm) const {
00490   NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
00491   // Try to find the edge corresponding to the exact unichar_id and to all the
00492   // edges corresponding to the character class of unichar_id.
00493   GenericVector<UNICHAR_ID> unichar_id_patterns;
00494   unichar_id_patterns.push_back(unichar_id);
00495   dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
00496                                &unichar_id_patterns);
00497   for (int i = 0; i < unichar_id_patterns.size(); ++i) {
00498     // On the first iteration check all the outgoing edges.
00499     // On the second iteration check all self-loops.
00500     for (int k = 0; k < 2; ++k) {
00501       EDGE_REF edge = (k == 0)
00502       ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
00503       : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end);
00504       if (edge == NO_EDGE) continue;
00505       if (dawg_debug_level >= 3) {
00506         tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
00507                 pos.dawg_index, node, edge);
00508         tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
00509       }
00510       if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
00511       updated_dawgs->add_unique(
00512           DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
00513                        pos.back_to_punc),
00514           dawg_debug_level > 0,
00515           "Append current dawg to updated active dawgs: ");
00516     }
00517   }
00518 }
00519 
00520 // Fill the given active_dawgs vector with dawgs that could contain the
00521 // beginning of the word. If hyphenated() returns true, copy the entries
00522 // from hyphen_active_dawgs_ instead.
00523 void Dict::init_active_dawgs(DawgPositionVector *active_dawgs,
00524                              bool ambigs_mode) const {
00525   int i;
00526   if (hyphenated()) {
00527     *active_dawgs = hyphen_active_dawgs_;
00528     if (dawg_debug_level >= 3) {
00529       for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
00530         tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
00531                 hyphen_active_dawgs_[i].dawg_index,
00532                 hyphen_active_dawgs_[i].dawg_ref);
00533       }
00534     }
00535   } else {
00536     default_dawgs(active_dawgs, ambigs_mode);
00537   }
00538 }
00539 
00540 void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec,
00541                          bool suppress_patterns) const {
00542   bool punc_dawg_available =
00543     (punc_dawg_ != NULL) &&
00544     punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
00545 
00546   for (int i = 0; i < dawgs_.length(); i++) {
00547     if (dawgs_[i] != NULL &&
00548         !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
00549       int dawg_ty = dawgs_[i]->type();
00550       bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
00551       if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
00552         *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
00553         if (dawg_debug_level >= 3) {
00554           tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
00555                   NO_EDGE);
00556         }
00557       } else if (!punc_dawg_available || !subsumed_by_punc) {
00558         *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
00559         if (dawg_debug_level >= 3) {
00560           tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
00561         }
00562       }
00563     }
00564   }
00565 }
00566 
00567 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
00568   // Do not add hyphenated word parts to the document dawg.
00569   // hyphen_word_ will be non-NULL after the set_hyphen_word() is
00570   // called when the first part of the hyphenated word is
00571   // discovered and while the second part of the word is recognized.
00572   // hyphen_word_ is cleared in cc_recg() before the next word on
00573   // the line is recognized.
00574   if (hyphen_word_) return;
00575 
00576   char filename[CHARS_PER_LINE];
00577   FILE *doc_word_file;
00578   int stringlen = best_choice.length();
00579 
00580   if (valid_word(best_choice) || stringlen < 2)
00581     return;
00582 
00583   // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
00584   if (best_choice.length() >= kDocDictMaxRepChars) {
00585     int num_rep_chars = 1;
00586     UNICHAR_ID uch_id = best_choice.unichar_id(0);
00587     for (int i = 1; i < best_choice.length(); ++i) {
00588       if (best_choice.unichar_id(i) != uch_id) {
00589         num_rep_chars = 1;
00590         uch_id = best_choice.unichar_id(i);
00591       } else {
00592         ++num_rep_chars;
00593         if (num_rep_chars == kDocDictMaxRepChars) return;
00594       }
00595     }
00596   }
00597 
00598   if (best_choice.certainty() < doc_dict_certainty_threshold ||
00599       stringlen == 2) {
00600     if (best_choice.certainty() < doc_dict_pending_threshold)
00601       return;
00602 
00603     if (!pending_words_->word_in_dawg(best_choice)) {
00604       if (stringlen > 2 ||
00605           (stringlen == 2 &&
00606            getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
00607            getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
00608         pending_words_->add_word_to_dawg(best_choice);
00609       }
00610       return;
00611     }
00612   }
00613 
00614   if (save_doc_words) {
00615     strcpy(filename, getCCUtil()->imagefile.string());
00616     strcat(filename, ".doc");
00617     doc_word_file = open_file (filename, "a");
00618     fprintf(doc_word_file, "%s\n",
00619             best_choice.debug_string().string());
00620     fclose(doc_word_file);
00621   }
00622   document_words_->add_word_to_dawg(best_choice);
00623 }
00624 
00625 void Dict::adjust_word(WERD_CHOICE *word,
00626                        bool nonword,
00627                        XHeightConsistencyEnum xheight_consistency,
00628                        float additional_adjust,
00629                        bool modify_rating,
00630                        bool debug) {
00631   bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
00632                  word->GetTopScriptID() == getUnicharset().han_sid());
00633   bool case_is_ok = (is_han || case_ok(*word, getUnicharset()));
00634   bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
00635 
00636   float adjust_factor = additional_adjust;
00637   float new_rating = word->rating();
00638   new_rating += kRatingPad;
00639   const char *xheight_triggered = "";
00640   if (word->length() > 1) {
00641     // Calculate x-height and y-offset consistency penalties.
00642     switch (xheight_consistency) {
00643       case XH_INCONSISTENT:
00644         adjust_factor += xheight_penalty_inconsistent;
00645         xheight_triggered = ", xhtBAD";
00646         break;
00647       case XH_SUBNORMAL:
00648         adjust_factor += xheight_penalty_subscripts;
00649         xheight_triggered = ", xhtSUB";
00650         break;
00651       case XH_GOOD:
00652         // leave the factor alone - all good!
00653         break;
00654     }
00655     // TODO(eger): if nonword is true, but there is a "core" thats' a dict
00656     // word, negate nonword status.
00657   } else {
00658     if (debug) {
00659       tprintf("Consistency could not be calculated.\n");
00660     }
00661   }
00662   if (debug) {
00663     tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
00664             word->unichar_string().string(), word->rating(),
00665             xheight_triggered);
00666   }
00667 
00668   if (nonword) {  // non-dictionary word
00669     if (case_is_ok && punc_is_ok) {
00670       adjust_factor += segment_penalty_dict_nonword;
00671       new_rating *= adjust_factor;
00672       if (debug) tprintf(", W");
00673     } else {
00674       adjust_factor += segment_penalty_garbage;
00675       new_rating *= adjust_factor;
00676       if (debug) {
00677         if (!case_is_ok) tprintf(", C");
00678         if (!punc_is_ok) tprintf(", P");
00679       }
00680     }
00681   } else {  // dictionary word
00682     if (case_is_ok) {
00683       if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
00684         word->set_permuter(FREQ_DAWG_PERM);
00685         adjust_factor += segment_penalty_dict_frequent_word;
00686         new_rating *= adjust_factor;
00687         if (debug) tprintf(", F");
00688       } else {
00689         adjust_factor += segment_penalty_dict_case_ok;
00690         new_rating *= adjust_factor;
00691         if (debug) tprintf(", ");
00692       }
00693     } else {
00694       adjust_factor += segment_penalty_dict_case_bad;
00695       new_rating *= adjust_factor;
00696       if (debug) tprintf(", C");
00697     }
00698   }
00699   new_rating -= kRatingPad;
00700   if (modify_rating) word->set_rating(new_rating);
00701   if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
00702   word->set_adjust_factor(adjust_factor);
00703 }
00704 
00705 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
00706   const WERD_CHOICE *word_ptr = &word;
00707   WERD_CHOICE temp_word(word.unicharset());
00708   if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
00709     copy_hyphen_info(&temp_word);
00710     temp_word += word;
00711     word_ptr = &temp_word;
00712   }
00713   if (word_ptr->length() == 0) return NO_PERM;
00714   // Allocate vectors for holding current and updated
00715   // active_dawgs and initialize them.
00716   DawgPositionVector *active_dawgs = new DawgPositionVector[2];
00717   init_active_dawgs(&(active_dawgs[0]), false);
00718   DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
00719   int last_index = word_ptr->length() - 1;
00720   // Call leter_is_okay for each letter in the word.
00721   for (int i = hyphen_base_size(); i <= last_index; ++i) {
00722     if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
00723                                    i == last_index))) break;
00724     // Swap active_dawgs, constraints with the corresponding updated vector.
00725     if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
00726       dawg_args.updated_dawgs = &(active_dawgs[0]);
00727       ++(dawg_args.active_dawgs);
00728     } else {
00729       ++(dawg_args.updated_dawgs);
00730       dawg_args.active_dawgs = &(active_dawgs[0]);
00731     }
00732   }
00733   delete[] active_dawgs;
00734   return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
00735     dawg_args.permuter : NO_PERM;
00736 }
00737 
00738 bool Dict::valid_bigram(const WERD_CHOICE &word1,
00739                         const WERD_CHOICE &word2) const {
00740   if (bigram_dawg_ == NULL) return false;
00741 
00742   // Extract the core word from the middle of each word with any digits
00743   //         replaced with question marks.
00744   int w1start, w1end, w2start, w2end;
00745   word1.punct_stripped(&w1start, &w1end);
00746   word2.punct_stripped(&w2start, &w2end);
00747 
00748   // We don't want to penalize a single guillemet, hyphen, etc.
00749   // But our bigram list doesn't have any information about punctuation.
00750   if (w1start >= w1end) return word1.length() < 3;
00751   if (w2start >= w2end) return word2.length() < 3;
00752 
00753   const UNICHARSET& uchset = getUnicharset();
00754   GenericVector<UNICHAR_ID> bigram_string;
00755   bigram_string.reserve(w1end + w2end + 1);
00756   for (int i = w1start; i < w1end; i++) {
00757     const GenericVector<UNICHAR_ID>& normed_ids =
00758         getUnicharset().normed_ids(word1.unichar_id(i));
00759     if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
00760       bigram_string.push_back(question_unichar_id_);
00761     else
00762       bigram_string += normed_ids;
00763   }
00764   bigram_string.push_back(UNICHAR_SPACE);
00765   for (int i = w2start; i < w2end; i++) {
00766     const GenericVector<UNICHAR_ID>& normed_ids =
00767         getUnicharset().normed_ids(word2.unichar_id(i));
00768     if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
00769       bigram_string.push_back(question_unichar_id_);
00770     else
00771       bigram_string += normed_ids;
00772   }
00773   WERD_CHOICE normalized_word(&uchset, bigram_string.size());
00774   for (int i = 0; i < bigram_string.size(); ++i) {
00775     normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1,
00776                                                       0.0f, 0.0f);
00777   }
00778   return bigram_dawg_->word_in_dawg(normalized_word);
00779 }
00780 
00781 bool Dict::valid_punctuation(const WERD_CHOICE &word) {
00782   if (word.length() == 0) return NO_PERM;
00783   int i;
00784   WERD_CHOICE new_word(word.unicharset());
00785   int last_index = word.length() - 1;
00786   int new_len = 0;
00787   for (i = 0; i <= last_index; ++i) {
00788     UNICHAR_ID unichar_id = (word.unichar_id(i));
00789     if (getUnicharset().get_ispunctuation(unichar_id)) {
00790       new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
00791     } else if (!getUnicharset().get_isalpha(unichar_id) &&
00792                !getUnicharset().get_isdigit(unichar_id)) {
00793       return false;  // neither punc, nor alpha, nor digit
00794     } else if ((new_len = new_word.length()) == 0 ||
00795                new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
00796       new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
00797     }
00798   }
00799   for (i = 0; i < dawgs_.size(); ++i) {
00800     if (dawgs_[i] != NULL &&
00801         dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
00802         dawgs_[i]->word_in_dawg(new_word)) return true;
00803   }
00804   return false;
00805 }
00806 
00807 
00808 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines