|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: word_list_lang_model.cpp 00003 * Description: Implementation of the Word List Language Model Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include <string> 00021 #include <vector> 00022 #include "word_list_lang_model.h" 00023 #include "cube_utils.h" 00024 00025 #include "ratngs.h" 00026 #include "trie.h" 00027 00028 namespace tesseract { 00029 WordListLangModel::WordListLangModel(CubeRecoContext *cntxt) { 00030 cntxt_ = cntxt; 00031 dawg_ = NULL; 00032 init_ = false; 00033 } 00034 00035 WordListLangModel::~WordListLangModel() { 00036 Cleanup(); 00037 } 00038 00039 // Cleanup 00040 void WordListLangModel::Cleanup() { 00041 if (dawg_ != NULL) { 00042 delete dawg_; 00043 dawg_ = NULL; 00044 } 00045 init_ = false; 00046 } 00047 00048 // Initialize the language model 00049 bool WordListLangModel::Init() { 00050 if (init_ == true) { 00051 return true; 00052 } 00053 // The last parameter to the Trie constructor (the debug level) is set to 00054 // false for now, until Cube has a way to express its preferred debug level. 00055 dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM, 00056 cntxt_->CharacterSet()->ClassCount(), false); 00057 if (dawg_ == NULL) { 00058 return false; 00059 } 00060 init_ = true; 00061 return true; 00062 } 00063 00064 // return a pointer to the root 00065 LangModEdge * WordListLangModel::Root() { 00066 return NULL; 00067 } 00068 00069 // return the edges emerging from the current state 00070 LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list, 00071 LangModEdge *edge, 00072 int *edge_cnt) { 00073 // initialize if necessary 00074 if (init_ == false) { 00075 if (Init() == false) { 00076 return NULL; 00077 } 00078 } 00079 00080 (*edge_cnt) = 0; 00081 00082 EDGE_REF edge_ref; 00083 00084 TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge); 00085 00086 if (tess_lm_edge == NULL) { 00087 edge_ref = 0; 00088 } else { 00089 edge_ref = tess_lm_edge->EndEdge(); 00090 00091 // advance node 00092 edge_ref = dawg_->next_node(edge_ref); 00093 if (edge_ref == 0) { 00094 return NULL; 00095 } 00096 } 00097 00098 // allocate memory for edges 00099 LangModEdge **edge_array = new LangModEdge *[kMaxEdge]; 00100 if (edge_array == NULL) { 00101 return NULL; 00102 } 00103 00104 // now get all the emerging edges 00105 (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref, 00106 edge_array + (*edge_cnt)); 00107 00108 return edge_array; 00109 } 00110 00111 // returns true if the char_32 is supported by the language model 00112 // TODO(ahmadab) currently not implemented 00113 bool WordListLangModel::IsValidSequence(const char_32 *sequence, 00114 bool terminal, LangModEdge **edges) { 00115 return false; 00116 } 00117 00118 // Recursive helper function for WordVariants(). 00119 void WordListLangModel::WordVariants(const CharSet &char_set, 00120 string_32 prefix_str32, 00121 WERD_CHOICE *word_so_far, 00122 string_32 str32, 00123 vector<WERD_CHOICE *> *word_variants) { 00124 int str_len = str32.length(); 00125 if (str_len == 0) { 00126 if (word_so_far->length() > 0) { 00127 word_variants->push_back(new WERD_CHOICE(*word_so_far)); 00128 } 00129 } else { 00130 // Try out all the possible prefixes of the str32. 00131 for (int len = 1; len <= str_len; len++) { 00132 // Check if prefix is supported in character set. 00133 string_32 str_pref32 = str32.substr(0, len); 00134 int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>( 00135 str_pref32.c_str())); 00136 if (class_id <= 0) { 00137 continue; 00138 } else { 00139 string_32 new_prefix_str32 = prefix_str32 + str_pref32; 00140 string_32 new_str32 = str32.substr(len); 00141 word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0); 00142 WordVariants(char_set, new_prefix_str32, word_so_far, new_str32, 00143 word_variants); 00144 word_so_far->remove_last_unichar_id(); 00145 } 00146 } 00147 } 00148 } 00149 00150 // Compute all the variants of a 32-bit string in terms of the class-ids 00151 // This is needed for languages that have ligatures. A word can then have more 00152 // than one spelling in terms of the class-ids 00153 void WordListLangModel::WordVariants(const CharSet &char_set, 00154 const UNICHARSET *uchset, string_32 str32, 00155 vector<WERD_CHOICE *> *word_variants) { 00156 for (int i = 0; i < word_variants->size(); i++) { 00157 delete (*word_variants)[i]; 00158 } 00159 word_variants->clear(); 00160 string_32 prefix_str32; 00161 WERD_CHOICE word_so_far(uchset); 00162 WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants); 00163 } 00164 00165 // add a new UTF-8 string to the lang model 00166 bool WordListLangModel::AddString(const char *char_ptr) { 00167 if (!init_ && !Init()) { // initialize if necessary 00168 return false; 00169 } 00170 00171 string_32 str32; 00172 CubeUtils::UTF8ToUTF32(char_ptr, &str32); 00173 if (str32.length() < 1) { 00174 return false; 00175 } 00176 return AddString32(str32.c_str()); 00177 } 00178 00179 // add a new UTF-32 string to the lang model 00180 bool WordListLangModel::AddString32(const char_32 *char_32_ptr) { 00181 if (char_32_ptr == NULL) { 00182 return false; 00183 } 00184 // get all the word variants 00185 vector<WERD_CHOICE *> word_variants; 00186 WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(), 00187 char_32_ptr, &word_variants); 00188 00189 if (word_variants.size() > 0) { 00190 // find the shortest variant 00191 int shortest_word = 0; 00192 for (int word = 1; word < word_variants.size(); word++) { 00193 if (word_variants[shortest_word]->length() > 00194 word_variants[word]->length()) { 00195 shortest_word = word; 00196 } 00197 } 00198 // only add the shortest grapheme interpretation of string to the word list 00199 dawg_->add_word_to_dawg(*word_variants[shortest_word]); 00200 } 00201 for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; } 00202 return true; 00203 } 00204 00205 }