tesseract 3.04.01

cube/word_list_lang_model.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        word_list_lang_model.cpp
00003  * Description: Implementation of the Word List Language Model Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include <string>
00021 #include <vector>
00022 #include "word_list_lang_model.h"
00023 #include "cube_utils.h"
00024 
00025 #include "ratngs.h"
00026 #include "trie.h"
00027 
00028 namespace tesseract {
00029 WordListLangModel::WordListLangModel(CubeRecoContext *cntxt) {
00030   cntxt_ = cntxt;
00031   dawg_ = NULL;
00032   init_ = false;
00033 }
00034 
00035 WordListLangModel::~WordListLangModel() {
00036   Cleanup();
00037 }
00038 
00039 // Cleanup
00040 void WordListLangModel::Cleanup() {
00041   if (dawg_ != NULL) {
00042     delete dawg_;
00043     dawg_ = NULL;
00044   }
00045   init_ = false;
00046 }
00047 
00048 // Initialize the language model
00049 bool WordListLangModel::Init() {
00050   if (init_ == true) {
00051     return true;
00052   }
00053   // The last parameter to the Trie constructor (the debug level) is set to
00054   // false for now, until Cube has a way to express its preferred debug level.
00055   dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
00056                    cntxt_->CharacterSet()->ClassCount(), false);
00057   if (dawg_ == NULL) {
00058     return false;
00059   }
00060   init_ = true;
00061   return true;
00062 }
00063 
00064 // return a pointer to the root
00065 LangModEdge * WordListLangModel::Root() {
00066   return NULL;
00067 }
00068 
00069 // return the edges emerging from the current state
00070 LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
00071                                           LangModEdge *edge,
00072                                           int *edge_cnt) {
00073   // initialize if necessary
00074   if (init_ == false) {
00075     if (Init() == false) {
00076       return NULL;
00077     }
00078   }
00079 
00080   (*edge_cnt) = 0;
00081 
00082   EDGE_REF edge_ref;
00083 
00084   TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
00085 
00086   if (tess_lm_edge == NULL) {
00087     edge_ref = 0;
00088   } else {
00089     edge_ref = tess_lm_edge->EndEdge();
00090 
00091     // advance node
00092     edge_ref = dawg_->next_node(edge_ref);
00093     if (edge_ref == 0) {
00094       return NULL;
00095     }
00096   }
00097 
00098   // allocate memory for edges
00099   LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
00100   if (edge_array == NULL) {
00101     return NULL;
00102   }
00103 
00104   // now get all the emerging edges
00105   (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
00106                                                  edge_array + (*edge_cnt));
00107 
00108   return edge_array;
00109 }
00110 
00111 // returns true if the char_32 is supported by the language model
00112 // TODO(ahmadab) currently not implemented
00113 bool WordListLangModel::IsValidSequence(const char_32 *sequence,
00114                                         bool terminal, LangModEdge **edges) {
00115   return false;
00116 }
00117 
00118 // Recursive helper function for WordVariants().
00119 void WordListLangModel::WordVariants(const CharSet &char_set,
00120                                      string_32 prefix_str32,
00121                                      WERD_CHOICE *word_so_far,
00122                                      string_32 str32,
00123                                      vector<WERD_CHOICE *> *word_variants) {
00124   int str_len = str32.length();
00125   if (str_len == 0) {
00126     if (word_so_far->length() > 0) {
00127       word_variants->push_back(new WERD_CHOICE(*word_so_far));
00128     }
00129   } else {
00130     // Try out all the possible prefixes of the str32.
00131     for (int len = 1; len <= str_len; len++) {
00132       // Check if prefix is supported in character set.
00133       string_32 str_pref32 = str32.substr(0, len);
00134       int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>(
00135           str_pref32.c_str()));
00136       if (class_id <= 0) {
00137         continue;
00138       } else {
00139         string_32 new_prefix_str32 = prefix_str32 + str_pref32;
00140         string_32 new_str32 = str32.substr(len);
00141         word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0);
00142         WordVariants(char_set, new_prefix_str32, word_so_far, new_str32,
00143                      word_variants);
00144         word_so_far->remove_last_unichar_id();
00145       }
00146     }
00147   }
00148 }
00149 
00150 // Compute all the variants of a 32-bit string in terms of the class-ids
00151 // This is needed for languages that have ligatures. A word can then have more
00152 // than one spelling in terms of the class-ids
00153 void WordListLangModel::WordVariants(const CharSet &char_set,
00154                                      const UNICHARSET *uchset, string_32 str32,
00155                                      vector<WERD_CHOICE *> *word_variants) {
00156   for (int i = 0; i < word_variants->size(); i++) {
00157     delete (*word_variants)[i];
00158   }
00159   word_variants->clear();
00160   string_32 prefix_str32;
00161   WERD_CHOICE word_so_far(uchset);
00162   WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
00163 }
00164 
00165 // add a new UTF-8 string to the lang model
00166 bool WordListLangModel::AddString(const char *char_ptr) {
00167   if (!init_ && !Init()) {  // initialize if necessary
00168     return false;
00169   }
00170 
00171   string_32 str32;
00172   CubeUtils::UTF8ToUTF32(char_ptr, &str32);
00173   if (str32.length() < 1) {
00174     return false;
00175   }
00176   return AddString32(str32.c_str());
00177 }
00178 
00179 // add a new UTF-32 string to the lang model
00180 bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
00181   if (char_32_ptr == NULL) {
00182     return false;
00183   }
00184   // get all the word variants
00185   vector<WERD_CHOICE *> word_variants;
00186   WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
00187                char_32_ptr, &word_variants);
00188 
00189   if (word_variants.size() > 0) {
00190     // find the shortest variant
00191     int shortest_word = 0;
00192     for (int word = 1; word < word_variants.size(); word++) {
00193       if (word_variants[shortest_word]->length() >
00194           word_variants[word]->length()) {
00195         shortest_word = word;
00196       }
00197     }
00198     // only add the shortest grapheme interpretation of string to the word list
00199     dawg_->add_word_to_dawg(*word_variants[shortest_word]);
00200   }
00201   for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
00202   return true;
00203 }
00204 
00205 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines