|
tesseract 3.04.01
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: dawg.c (Formerly dawg.c) 00005 * Description: Use a Directed Accyclic Word Graph 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I n c l u d e s 00027 ----------------------------------------------------------------------*/ 00028 00029 #ifdef _MSC_VER 00030 #pragma warning(disable:4244) // Conversion warnings 00031 #pragma warning(disable:4800) // int/bool warnings 00032 #endif 00033 #include "dawg.h" 00034 00035 #include "cutil.h" 00036 #include "dict.h" 00037 #include "emalloc.h" 00038 #include "freelist.h" 00039 #include "helpers.h" 00040 #include "strngs.h" 00041 #include "tesscallback.h" 00042 #include "tprintf.h" 00043 00044 /*---------------------------------------------------------------------- 00045 F u n c t i o n s f o r D a w g 00046 ----------------------------------------------------------------------*/ 00047 namespace tesseract { 00048 00049 bool Dawg::prefix_in_dawg(const WERD_CHOICE &word, 00050 bool requires_complete) const { 00051 if (word.length() == 0) return !requires_complete; 00052 NODE_REF node = 0; 00053 int end_index = word.length() - 1; 00054 for (int i = 0; i < end_index; i++) { 00055 EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false); 00056 if (edge == NO_EDGE) { 00057 return false; 00058 } 00059 if ((node = next_node(edge)) == 0) { 00060 // This only happens if all words following this edge terminate -- 00061 // there are no larger words. See Trie::add_word_to_dawg() 00062 return false; 00063 } 00064 } 00065 // Now check the last character. 00066 return edge_char_of(node, word.unichar_id(end_index), requires_complete) != 00067 NO_EDGE; 00068 } 00069 00070 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const { 00071 return prefix_in_dawg(word, true); 00072 } 00073 00074 int Dawg::check_for_words(const char *filename, 00075 const UNICHARSET &unicharset, 00076 bool enable_wildcard) const { 00077 if (filename == NULL) return 0; 00078 00079 FILE *word_file; 00080 char string [CHARS_PER_LINE]; 00081 int misses = 0; 00082 UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard); 00083 00084 word_file = open_file (filename, "r"); 00085 00086 while (fgets (string, CHARS_PER_LINE, word_file) != NULL) { 00087 chomp_string(string); // remove newline 00088 WERD_CHOICE word(string, unicharset); 00089 if (word.length() > 0 && 00090 !word.contains_unichar_id(INVALID_UNICHAR_ID)) { 00091 if (!match_words(&word, 0, 0, 00092 enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) { 00093 tprintf("Missing word: %s\n", string); 00094 ++misses; 00095 } 00096 } else { 00097 tprintf("Failed to create a valid word from %s\n", string); 00098 } 00099 } 00100 fclose (word_file); 00101 // Make sure the user sees this with fprintf instead of tprintf. 00102 if (debug_level_) tprintf("Number of lost words=%d\n", misses); 00103 return misses; 00104 } 00105 00106 void Dawg::iterate_words(const UNICHARSET &unicharset, 00107 TessCallback1<const WERD_CHOICE *> *cb) const { 00108 WERD_CHOICE word(&unicharset); 00109 iterate_words_rec(word, 0, cb); 00110 } 00111 00112 void CallWithUTF8(TessCallback1<const char *> *cb, const WERD_CHOICE *wc) { 00113 STRING s; 00114 wc->string_and_lengths(&s, NULL); 00115 cb->Run(s.string()); 00116 } 00117 00118 void Dawg::iterate_words(const UNICHARSET &unicharset, 00119 TessCallback1<const char *> *cb) const { 00120 TessCallback1<const WERD_CHOICE *> *shim = 00121 NewPermanentTessCallback(CallWithUTF8, cb); 00122 WERD_CHOICE word(&unicharset); 00123 iterate_words_rec(word, 0, shim); 00124 delete shim; 00125 } 00126 00127 void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far, 00128 NODE_REF to_explore, 00129 TessCallback1<const WERD_CHOICE *> *cb) const { 00130 NodeChildVector children; 00131 this->unichar_ids_of(to_explore, &children, false); 00132 for (int i = 0; i < children.size(); i++) { 00133 WERD_CHOICE next_word(word_so_far); 00134 next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0); 00135 if (this->end_of_word(children[i].edge_ref)) { 00136 cb->Run(&next_word); 00137 } 00138 NODE_REF next = next_node(children[i].edge_ref); 00139 if (next != 0) { 00140 iterate_words_rec(next_word, next, cb); 00141 } 00142 } 00143 } 00144 00145 bool Dawg::match_words(WERD_CHOICE *word, inT32 index, 00146 NODE_REF node, UNICHAR_ID wildcard) const { 00147 EDGE_REF edge; 00148 inT32 word_end; 00149 00150 if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) { 00151 bool any_matched = false; 00152 NodeChildVector vec; 00153 this->unichar_ids_of(node, &vec, false); 00154 for (int i = 0; i < vec.size(); ++i) { 00155 word->set_unichar_id(vec[i].unichar_id, index); 00156 if (match_words(word, index, node, wildcard)) 00157 any_matched = true; 00158 } 00159 word->set_unichar_id(wildcard, index); 00160 return any_matched; 00161 } else { 00162 word_end = index == word->length() - 1; 00163 edge = edge_char_of(node, word->unichar_id(index), word_end); 00164 if (edge != NO_EDGE) { // normal edge in DAWG 00165 node = next_node(edge); 00166 if (word_end) { 00167 if (debug_level_ > 1) word->print("match_words() found: "); 00168 return true; 00169 } else if (node != 0) { 00170 return match_words(word, index+1, node, wildcard); 00171 } 00172 } 00173 } 00174 return false; 00175 } 00176 00177 void Dawg::init(DawgType type, const STRING &lang, 00178 PermuterType perm, int unicharset_size, int debug_level) { 00179 type_ = type; 00180 lang_ = lang; 00181 perm_ = perm; 00182 ASSERT_HOST(unicharset_size > 0); 00183 unicharset_size_ = unicharset_size; 00184 // Set bit masks. We will use the value unicharset_size_ as a null char, so 00185 // the actual number of unichars is unicharset_size_ + 1. 00186 flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0)); 00187 next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS; 00188 letter_mask_ = ~(~0ull << flag_start_bit_); 00189 next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS); 00190 flags_mask_ = ~(letter_mask_ | next_node_mask_); 00191 00192 debug_level_ = debug_level; 00193 } 00194 00195 00196 /*---------------------------------------------------------------------- 00197 F u n c t i o n s f o r S q u i s h e d D a w g 00198 ----------------------------------------------------------------------*/ 00199 00200 SquishedDawg::~SquishedDawg() { memfree(edges_); } 00201 00202 EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, 00203 UNICHAR_ID unichar_id, 00204 bool word_end) const { 00205 EDGE_REF edge = node; 00206 if (node == 0) { // binary search 00207 EDGE_REF start = 0; 00208 EDGE_REF end = num_forward_edges_in_node0 - 1; 00209 int compare; 00210 while (start <= end) { 00211 edge = (start + end) >> 1; // (start + end) / 2 00212 compare = given_greater_than_edge_rec(NO_EDGE, word_end, 00213 unichar_id, edges_[edge]); 00214 if (compare == 0) { // given == vec[k] 00215 return edge; 00216 } else if (compare == 1) { // given > vec[k] 00217 start = edge + 1; 00218 } else { // given < vec[k] 00219 end = edge - 1; 00220 } 00221 } 00222 } else { // linear search 00223 if (edge != NO_EDGE && edge_occupied(edge)) { 00224 do { 00225 if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) && 00226 (!word_end || end_of_word_from_edge_rec(edges_[edge]))) 00227 return (edge); 00228 } while (!last_edge(edge++)); 00229 } 00230 } 00231 return (NO_EDGE); // not found 00232 } 00233 00234 inT32 SquishedDawg::num_forward_edges(NODE_REF node) const { 00235 EDGE_REF edge = node; 00236 inT32 num = 0; 00237 00238 if (forward_edge (edge)) { 00239 do { 00240 num++; 00241 } while (!last_edge(edge++)); 00242 } 00243 00244 return (num); 00245 } 00246 00247 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const { 00248 if (node == NO_EDGE) return; // nothing to print 00249 00250 EDGE_REF edge = node; 00251 const char *forward_string = "FORWARD"; 00252 const char *backward_string = " "; 00253 00254 const char *last_string = "LAST"; 00255 const char *not_last_string = " "; 00256 00257 const char *eow_string = "EOW"; 00258 const char *not_eow_string = " "; 00259 00260 const char *direction; 00261 const char *is_last; 00262 const char *eow; 00263 00264 UNICHAR_ID unichar_id; 00265 00266 if (edge_occupied(edge)) { 00267 do { 00268 direction = 00269 forward_edge(edge) ? forward_string : backward_string; 00270 is_last = last_edge(edge) ? last_string : not_last_string; 00271 eow = end_of_word(edge) ? eow_string : not_eow_string; 00272 00273 unichar_id = edge_letter(edge); 00274 tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n", 00275 edge, next_node(edge), unichar_id, 00276 direction, is_last, eow); 00277 00278 if (edge - node > max_num_edges) return; 00279 } while (!last_edge(edge++)); 00280 00281 if (edge < num_edges_ && 00282 edge_occupied(edge) && backward_edge(edge)) { 00283 do { 00284 direction = 00285 forward_edge(edge) ? forward_string : backward_string; 00286 is_last = last_edge(edge) ? last_string : not_last_string; 00287 eow = end_of_word(edge) ? eow_string : not_eow_string; 00288 00289 unichar_id = edge_letter(edge); 00290 tprintf(REFFORMAT " : next = " REFFORMAT 00291 ", unichar_id = %d, %s %s %s\n", 00292 edge, next_node(edge), unichar_id, 00293 direction, is_last, eow); 00294 00295 if (edge - node > MAX_NODE_EDGES_DISPLAY) return; 00296 } while (!last_edge(edge++)); 00297 } 00298 } 00299 else { 00300 tprintf(REFFORMAT " : no edges in this node\n", node); 00301 } 00302 tprintf("\n"); 00303 } 00304 00305 void SquishedDawg::print_edge(EDGE_REF edge) const { 00306 if (edge == NO_EDGE) { 00307 tprintf("NO_EDGE\n"); 00308 } else { 00309 tprintf(REFFORMAT " : next = " REFFORMAT 00310 ", unichar_id = '%d', %s %s %s\n", edge, 00311 next_node(edge), edge_letter(edge), 00312 (forward_edge(edge) ? "FORWARD" : " "), 00313 (last_edge(edge) ? "LAST" : " "), 00314 (end_of_word(edge) ? "EOW" : "")); 00315 } 00316 } 00317 00318 void SquishedDawg::read_squished_dawg(FILE *file, 00319 DawgType type, 00320 const STRING &lang, 00321 PermuterType perm, 00322 int debug_level) { 00323 if (debug_level) tprintf("Reading squished dawg\n"); 00324 00325 // Read the magic number and if it does not match kDawgMagicNumber 00326 // set swap to true to indicate that we need to switch endianness. 00327 inT16 magic; 00328 fread(&magic, sizeof(inT16), 1, file); 00329 bool swap = (magic != kDawgMagicNumber); 00330 00331 int unicharset_size; 00332 fread(&unicharset_size, sizeof(inT32), 1, file); 00333 fread(&num_edges_, sizeof(inT32), 1, file); 00334 00335 if (swap) { 00336 ReverseN(&unicharset_size, sizeof(unicharset_size)); 00337 ReverseN(&num_edges_, sizeof(num_edges_)); 00338 } 00339 ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty 00340 Dawg::init(type, lang, perm, unicharset_size, debug_level); 00341 00342 edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_); 00343 fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file); 00344 EDGE_REF edge; 00345 if (swap) { 00346 for (edge = 0; edge < num_edges_; ++edge) { 00347 ReverseN(&edges_[edge], sizeof(edges_[edge])); 00348 } 00349 } 00350 if (debug_level > 2) { 00351 tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n", 00352 type_, lang_.string(), perm_, unicharset_size_, num_edges_); 00353 for (edge = 0; edge < num_edges_; ++edge) 00354 print_edge(edge); 00355 } 00356 } 00357 00358 NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const { 00359 EDGE_REF edge; 00360 NODE_MAP node_map; 00361 inT32 node_counter; 00362 inT32 num_edges; 00363 00364 node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_); 00365 00366 for (edge = 0; edge < num_edges_; edge++) // init all slots 00367 node_map [edge] = -1; 00368 00369 node_counter = num_forward_edges(0); 00370 00371 *num_nodes = 0; 00372 for (edge = 0; edge < num_edges_; edge++) { // search all slots 00373 00374 if (forward_edge(edge)) { 00375 (*num_nodes)++; // count nodes links 00376 node_map[edge] = (edge ? node_counter : 0); 00377 num_edges = num_forward_edges(edge); 00378 if (edge != 0) node_counter += num_edges; 00379 edge += num_edges; 00380 if (edge >= num_edges_) break; 00381 if (backward_edge(edge)) while (!last_edge(edge++)); 00382 edge--; 00383 } 00384 } 00385 return (node_map); 00386 } 00387 00388 void SquishedDawg::write_squished_dawg(FILE *file) { 00389 EDGE_REF edge; 00390 inT32 num_edges; 00391 inT32 node_count = 0; 00392 NODE_MAP node_map; 00393 EDGE_REF old_index; 00394 EDGE_RECORD temp_record; 00395 00396 if (debug_level_) tprintf("write_squished_dawg\n"); 00397 00398 node_map = build_node_map(&node_count); 00399 00400 // Write the magic number to help detecting a change in endianness. 00401 inT16 magic = kDawgMagicNumber; 00402 fwrite(&magic, sizeof(inT16), 1, file); 00403 fwrite(&unicharset_size_, sizeof(inT32), 1, file); 00404 00405 // Count the number of edges in this Dawg. 00406 num_edges = 0; 00407 for (edge=0; edge < num_edges_; edge++) 00408 if (forward_edge(edge)) 00409 num_edges++; 00410 00411 fwrite(&num_edges, sizeof(inT32), 1, file); // write edge count to file 00412 00413 if (debug_level_) { 00414 tprintf("%d nodes in DAWG\n", node_count); 00415 tprintf("%d edges in DAWG\n", num_edges); 00416 } 00417 00418 for (edge = 0; edge < num_edges_; edge++) { 00419 if (forward_edge(edge)) { // write forward edges 00420 do { 00421 old_index = next_node_from_edge_rec(edges_[edge]); 00422 set_next_node(edge, node_map[old_index]); 00423 temp_record = edges_[edge]; 00424 fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file); 00425 set_next_node(edge, old_index); 00426 } while (!last_edge(edge++)); 00427 00428 if (edge >= num_edges_) break; 00429 if (backward_edge(edge)) // skip back links 00430 while (!last_edge(edge++)); 00431 00432 edge--; 00433 } 00434 } 00435 free(node_map); 00436 } 00437 00438 } // namespace tesseract