|
tesseract 3.04.01
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: trie.c (Formerly trie.c) 00005 * Description: Functions to build a trie data structure. 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Fri Jul 26 12:18:10 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I n c l u d e s 00027 ----------------------------------------------------------------------*/ 00028 #ifdef _MSC_VER 00029 #pragma warning(disable:4244) // Conversion warnings 00030 #pragma warning(disable:4800) // int/bool warnings 00031 #endif 00032 #include "trie.h" 00033 00034 #include "callcpp.h" 00035 #include "dawg.h" 00036 #include "dict.h" 00037 #include "freelist.h" 00038 #include "genericvector.h" 00039 #include "helpers.h" 00040 #include "kdpair.h" 00041 00042 namespace tesseract { 00043 00044 const char kDoNotReverse[] = "RRP_DO_NO_REVERSE"; 00045 const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL"; 00046 const char kForceReverse[] = "RRP_FORCE_REVERSE"; 00047 00048 const char * const RTLReversePolicyNames[] = { 00049 kDoNotReverse, 00050 kReverseIfHasRTL, 00051 kForceReverse 00052 }; 00053 00054 const char Trie::kAlphaPatternUnicode[] = "\u2000"; 00055 const char Trie::kDigitPatternUnicode[] = "\u2001"; 00056 const char Trie::kAlphanumPatternUnicode[] = "\u2002"; 00057 const char Trie::kPuncPatternUnicode[] = "\u2003"; 00058 const char Trie::kLowerPatternUnicode[] = "\u2004"; 00059 const char Trie::kUpperPatternUnicode[] = "\u2005"; 00060 00061 const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) { 00062 return RTLReversePolicyNames[reverse_policy]; 00063 } 00064 00065 // Reset the Trie to empty. 00066 void Trie::clear() { 00067 nodes_.delete_data_pointers(); 00068 nodes_.clear(); 00069 root_back_freelist_.clear(); 00070 num_edges_ = 0; 00071 new_dawg_node(); // Need to allocate node 0. 00072 } 00073 00074 bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, 00075 int direction, bool word_end, UNICHAR_ID unichar_id, 00076 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const { 00077 if (debug_level_ == 3) { 00078 tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT 00079 " direction %d word_end %d unichar_id %d, exploring node:\n", 00080 node_ref, next_node, direction, word_end, unichar_id); 00081 if (node_ref != NO_EDGE) { 00082 print_node(node_ref, nodes_[node_ref]->forward_edges.size()); 00083 } 00084 } 00085 if (node_ref == NO_EDGE) return false; 00086 assert(node_ref < nodes_.size()); 00087 EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? 00088 nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges; 00089 int vec_size = vec.size(); 00090 if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search 00091 EDGE_INDEX start = 0; 00092 EDGE_INDEX end = vec_size - 1; 00093 EDGE_INDEX k; 00094 int compare; 00095 while (start <= end) { 00096 k = (start + end) >> 1; // (start + end) / 2 00097 compare = given_greater_than_edge_rec(next_node, word_end, 00098 unichar_id, vec[k]); 00099 if (compare == 0) { // given == vec[k] 00100 *edge_ptr = &(vec[k]); 00101 *edge_index = k; 00102 return true; 00103 } else if (compare == 1) { // given > vec[k] 00104 start = k + 1; 00105 } else { // given < vec[k] 00106 end = k - 1; 00107 } 00108 } 00109 } else { // linear search 00110 for (int i = 0; i < vec_size; ++i) { 00111 EDGE_RECORD &edge_rec = vec[i]; 00112 if (edge_rec_match(next_node, word_end, unichar_id, 00113 next_node_from_edge_rec(edge_rec), 00114 end_of_word_from_edge_rec(edge_rec), 00115 unichar_id_from_edge_rec(edge_rec))) { 00116 *edge_ptr = &(edge_rec); 00117 *edge_index = i; 00118 return true; 00119 } 00120 } 00121 } 00122 return false; // not found 00123 } 00124 00125 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, 00126 int direction, bool word_end, 00127 UNICHAR_ID unichar_id) { 00128 EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? 00129 &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges); 00130 int search_index; 00131 if (node1 == 0 && direction == FORWARD_EDGE) { 00132 search_index = 0; // find the index to make the add sorted 00133 while (search_index < vec->size() && 00134 given_greater_than_edge_rec(node2, word_end, unichar_id, 00135 (*vec)[search_index]) == 1) { 00136 search_index++; 00137 } 00138 } else { 00139 search_index = vec->size(); // add is unsorted, so index does not matter 00140 } 00141 EDGE_RECORD edge_rec; 00142 link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id); 00143 if (node1 == 0 && direction == BACKWARD_EDGE && 00144 !root_back_freelist_.empty()) { 00145 EDGE_INDEX edge_index = root_back_freelist_.pop_back(); 00146 (*vec)[edge_index] = edge_rec; 00147 } else if (search_index < vec->size()) { 00148 vec->insert(edge_rec, search_index); 00149 } else { 00150 vec->push_back(edge_rec); 00151 } 00152 if (debug_level_ > 1) { 00153 tprintf("new edge in nodes_[" REFFORMAT "]: ", node1); 00154 print_edge_rec(edge_rec); 00155 tprintf("\n"); 00156 } 00157 num_edges_++; 00158 return true; 00159 } 00160 00161 void Trie::add_word_ending(EDGE_RECORD *edge_ptr, 00162 NODE_REF the_next_node, 00163 bool marker_flag, 00164 UNICHAR_ID unichar_id) { 00165 EDGE_RECORD *back_edge_ptr; 00166 EDGE_INDEX back_edge_index; 00167 ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false, 00168 unichar_id, &back_edge_ptr, &back_edge_index)); 00169 if (marker_flag) { 00170 *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_); 00171 *edge_ptr |= (MARKER_FLAG << flag_start_bit_); 00172 } 00173 // Mark both directions as end of word. 00174 *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_); 00175 *edge_ptr |= (WERD_END_FLAG << flag_start_bit_); 00176 } 00177 00178 bool Trie::add_word_to_dawg(const WERD_CHOICE &word, 00179 const GenericVector<bool> *repetitions) { 00180 if (word.length() <= 0) return false; // can't add empty words 00181 if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length()); 00182 // Make sure the word does not contain invalid unchar ids. 00183 for (int i = 0; i < word.length(); ++i) { 00184 if (word.unichar_id(i) < 0 || 00185 word.unichar_id(i) >= unicharset_size_) return false; 00186 } 00187 00188 EDGE_RECORD *edge_ptr; 00189 NODE_REF last_node = 0; 00190 NODE_REF the_next_node; 00191 bool marker_flag = false; 00192 EDGE_INDEX edge_index; 00193 int i; 00194 inT32 still_finding_chars = true; 00195 inT32 word_end = false; 00196 bool add_failed = false; 00197 bool found; 00198 00199 if (debug_level_ > 1) word.print("\nAdding word: "); 00200 00201 UNICHAR_ID unichar_id; 00202 for (i = 0; i < word.length() - 1; ++i) { 00203 unichar_id = word.unichar_id(i); 00204 marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false; 00205 if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id); 00206 if (still_finding_chars) { 00207 found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, 00208 unichar_id, &edge_ptr, &edge_index); 00209 if (found && debug_level_ > 1) { 00210 tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n", 00211 edge_index, last_node); 00212 } 00213 if (!found) { 00214 still_finding_chars = false; 00215 } else if (next_node_from_edge_rec(*edge_ptr) == 0) { 00216 // We hit the end of an existing word, but the new word is longer. 00217 // In this case we have to disconnect the existing word from the 00218 // backwards root node, mark the current position as end-of-word 00219 // and add new nodes for the increased length. Disconnecting the 00220 // existing word from the backwards root node requires a linear 00221 // search, so it is much faster to add the longest words first, 00222 // to avoid having to come here. 00223 word_end = true; 00224 still_finding_chars = false; 00225 remove_edge(last_node, 0, word_end, unichar_id); 00226 } else { 00227 // We have to add a new branch here for the new word. 00228 if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr); 00229 last_node = next_node_from_edge_rec(*edge_ptr); 00230 } 00231 } 00232 if (!still_finding_chars) { 00233 the_next_node = new_dawg_node(); 00234 if (debug_level_ > 1) 00235 tprintf("adding node " REFFORMAT "\n", the_next_node); 00236 if (the_next_node == 0) { 00237 add_failed = true; 00238 break; 00239 } 00240 if (!add_new_edge(last_node, the_next_node, 00241 marker_flag, word_end, unichar_id)) { 00242 add_failed = true; 00243 break; 00244 } 00245 word_end = false; 00246 last_node = the_next_node; 00247 } 00248 } 00249 the_next_node = 0; 00250 unichar_id = word.unichar_id(i); 00251 marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false; 00252 if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id); 00253 if (still_finding_chars && 00254 edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, 00255 unichar_id, &edge_ptr, &edge_index)) { 00256 // An extension of this word already exists in the trie, so we 00257 // only have to add the ending flags in both directions. 00258 add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), 00259 marker_flag, unichar_id); 00260 } else { 00261 // Add a link to node 0. All leaves connect to node 0 so the back links can 00262 // be used in reduction to a dawg. This root backward node has one edge 00263 // entry for every word, (except prefixes of longer words) so it is huge. 00264 if (!add_failed && 00265 !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id)) 00266 add_failed = true; 00267 } 00268 if (add_failed) { 00269 tprintf("Re-initializing document dictionary...\n"); 00270 clear(); 00271 return false; 00272 } else { 00273 return true; 00274 } 00275 } 00276 00277 NODE_REF Trie::new_dawg_node() { 00278 TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD(); 00279 if (node == NULL) return 0; // failed to create new node 00280 nodes_.push_back(node); 00281 return nodes_.length() - 1; 00282 } 00283 00284 // Sort function to sort words by decreasing order of length. 00285 static int sort_strings_by_dec_length(const void* v1, const void* v2) { 00286 const STRING* s1 = reinterpret_cast<const STRING*>(v1); 00287 const STRING* s2 = reinterpret_cast<const STRING*>(v2); 00288 return s2->length() - s1->length(); 00289 } 00290 00291 bool Trie::read_and_add_word_list(const char *filename, 00292 const UNICHARSET &unicharset, 00293 Trie::RTLReversePolicy reverse_policy) { 00294 GenericVector<STRING> word_list; 00295 if (!read_word_list(filename, unicharset, reverse_policy, &word_list)) 00296 return false; 00297 word_list.sort(sort_strings_by_dec_length); 00298 return add_word_list(word_list, unicharset); 00299 } 00300 00301 bool Trie::read_word_list(const char *filename, 00302 const UNICHARSET &unicharset, 00303 Trie::RTLReversePolicy reverse_policy, 00304 GenericVector<STRING>* words) { 00305 FILE *word_file; 00306 char string[CHARS_PER_LINE]; 00307 int word_count = 0; 00308 00309 word_file = fopen(filename, "rb"); 00310 if (word_file == NULL) return false; 00311 00312 while (fgets(string, CHARS_PER_LINE, word_file) != NULL) { 00313 chomp_string(string); // remove newline 00314 WERD_CHOICE word(string, unicharset); 00315 if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && 00316 word.has_rtl_unichar_id()) || 00317 reverse_policy == RRP_FORCE_REVERSE) { 00318 word.reverse_and_mirror_unichar_ids(); 00319 } 00320 ++word_count; 00321 if (debug_level_ && word_count % 10000 == 0) 00322 tprintf("Read %d words so far\n", word_count); 00323 if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) { 00324 words->push_back(word.unichar_string()); 00325 } else if (debug_level_) { 00326 tprintf("Skipping invalid word %s\n", string); 00327 if (debug_level_ >= 3) word.print(); 00328 } 00329 } 00330 if (debug_level_) 00331 tprintf("Read %d words total.\n", word_count); 00332 fclose(word_file); 00333 return true; 00334 } 00335 00336 bool Trie::add_word_list(const GenericVector<STRING>& words, 00337 const UNICHARSET &unicharset) { 00338 for (int i = 0; i < words.size(); ++i) { 00339 WERD_CHOICE word(words[i].string(), unicharset); 00340 if (!word_in_dawg(word)) { 00341 add_word_to_dawg(word); 00342 if (!word_in_dawg(word)) { 00343 tprintf("Error: word '%s' not in DAWG after adding it\n", 00344 words[i].string()); 00345 return false; 00346 } 00347 } 00348 } 00349 return true; 00350 } 00351 00352 void Trie::initialize_patterns(UNICHARSET *unicharset) { 00353 unicharset->unichar_insert(kAlphaPatternUnicode); 00354 alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode); 00355 unicharset->unichar_insert(kDigitPatternUnicode); 00356 digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode); 00357 unicharset->unichar_insert(kAlphanumPatternUnicode); 00358 alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode); 00359 unicharset->unichar_insert(kPuncPatternUnicode); 00360 punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode); 00361 unicharset->unichar_insert(kLowerPatternUnicode); 00362 lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode); 00363 unicharset->unichar_insert(kUpperPatternUnicode); 00364 upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode); 00365 initialized_patterns_ = true; 00366 unicharset_size_ = unicharset->size(); 00367 } 00368 00369 void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, 00370 const UNICHARSET &unicharset, 00371 GenericVector<UNICHAR_ID> *vec) const { 00372 bool is_alpha = unicharset.get_isalpha(unichar_id); 00373 if (is_alpha) { 00374 vec->push_back(alpha_pattern_); 00375 vec->push_back(alphanum_pattern_); 00376 if (unicharset.get_islower(unichar_id)) { 00377 vec->push_back(lower_pattern_); 00378 } else if (unicharset.get_isupper(unichar_id)) { 00379 vec->push_back(upper_pattern_); 00380 } 00381 } 00382 if (unicharset.get_isdigit(unichar_id)) { 00383 vec->push_back(digit_pattern_); 00384 if (!is_alpha) vec->push_back(alphanum_pattern_); 00385 } 00386 if (unicharset.get_ispunctuation(unichar_id)) { 00387 vec->push_back(punc_pattern_); 00388 } 00389 } 00390 00391 UNICHAR_ID Trie::character_class_to_pattern(char ch) { 00392 if (ch == 'c') { 00393 return alpha_pattern_; 00394 } else if (ch == 'd') { 00395 return digit_pattern_; 00396 } else if (ch == 'n') { 00397 return alphanum_pattern_; 00398 } else if (ch == 'p') { 00399 return punc_pattern_; 00400 } else if (ch == 'a') { 00401 return lower_pattern_; 00402 } else if (ch == 'A') { 00403 return upper_pattern_; 00404 } else { 00405 return INVALID_UNICHAR_ID; 00406 } 00407 } 00408 00409 bool Trie::read_pattern_list(const char *filename, 00410 const UNICHARSET &unicharset) { 00411 if (!initialized_patterns_) { 00412 tprintf("please call initialize_patterns() before read_pattern_list()\n"); 00413 return false; 00414 } 00415 00416 FILE *pattern_file = fopen(filename, "rb"); 00417 if (pattern_file == NULL) { 00418 tprintf("Error opening pattern file %s\n", filename); 00419 return false; 00420 } 00421 00422 int pattern_count = 0; 00423 char string[CHARS_PER_LINE]; 00424 while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) { 00425 chomp_string(string); // remove newline 00426 // Parse the pattern and construct a unichar id vector. 00427 // Record the number of repetitions of each unichar in the parallel vector. 00428 WERD_CHOICE word(&unicharset); 00429 GenericVector<bool> repetitions_vec; 00430 const char *str_ptr = string; 00431 int step = unicharset.step(str_ptr); 00432 bool failed = false; 00433 while (step > 0) { 00434 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID; 00435 if (step == 1 && *str_ptr == '\\') { 00436 ++str_ptr; 00437 if (*str_ptr == '\\') { // regular '\' unichar that was escaped 00438 curr_unichar_id = unicharset.unichar_to_id(str_ptr, step); 00439 } else { 00440 if (word.length() < kSaneNumConcreteChars) { 00441 tprintf("Please provide at least %d concrete characters at the" 00442 " beginning of the pattern\n", kSaneNumConcreteChars); 00443 failed = true; 00444 break; 00445 } 00446 // Parse character class from expression. 00447 curr_unichar_id = character_class_to_pattern(*str_ptr); 00448 } 00449 } else { 00450 curr_unichar_id = unicharset.unichar_to_id(str_ptr, step); 00451 } 00452 if (curr_unichar_id == INVALID_UNICHAR_ID) { 00453 failed = true; 00454 break; // failed to parse this pattern 00455 } 00456 word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0); 00457 repetitions_vec.push_back(false); 00458 str_ptr += step; 00459 step = unicharset.step(str_ptr); 00460 // Check if there is a repetition pattern specified after this unichar. 00461 if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') { 00462 repetitions_vec[repetitions_vec.size()-1] = true; 00463 str_ptr += 2; 00464 step = unicharset.step(str_ptr); 00465 } 00466 } 00467 if (failed) { 00468 tprintf("Invalid user pattern %s\n", string); 00469 continue; 00470 } 00471 // Insert the pattern into the trie. 00472 if (debug_level_ > 2) { 00473 tprintf("Inserting expanded user pattern %s\n", 00474 word.debug_string().string()); 00475 } 00476 if (!this->word_in_dawg(word)) { 00477 this->add_word_to_dawg(word, &repetitions_vec); 00478 if (!this->word_in_dawg(word)) { 00479 tprintf("Error: failed to insert pattern '%s'\n", string); 00480 } 00481 } 00482 ++pattern_count; 00483 } 00484 if (debug_level_) { 00485 tprintf("Read %d valid patterns from %s\n", pattern_count, filename); 00486 } 00487 fclose(pattern_file); 00488 return true; 00489 } 00490 00491 void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, 00492 bool word_end, UNICHAR_ID unichar_id) { 00493 EDGE_RECORD *edge_ptr = NULL; 00494 EDGE_INDEX edge_index = 0; 00495 ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, 00496 unichar_id, &edge_ptr, &edge_index)); 00497 if (debug_level_ > 1) { 00498 tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1); 00499 print_edge_rec(*edge_ptr); 00500 tprintf("\n"); 00501 } 00502 if (direction == FORWARD_EDGE) { 00503 nodes_[node1]->forward_edges.remove(edge_index); 00504 } else if (node1 == 0) { 00505 KillEdge(&nodes_[node1]->backward_edges[edge_index]); 00506 root_back_freelist_.push_back(edge_index); 00507 } else { 00508 nodes_[node1]->backward_edges.remove(edge_index); 00509 } 00510 --num_edges_; 00511 } 00512 00513 // Some optimizations employed in add_word_to_dawg and trie_to_dawg: 00514 // 1 Avoid insertion sorting or bubble sorting the tail root node 00515 // (back links on node 0, a list of all the leaves.). The node is 00516 // huge, and sorting it with n^2 time is terrible. 00517 // 2 Avoid using GenericVector::remove on the tail root node. 00518 // (a) During add of words to the trie, zero-out the unichars and 00519 // keep a freelist of spaces to re-use. 00520 // (b) During reduction, just zero-out the unichars of deleted back 00521 // links, skipping zero entries while searching. 00522 // 3 Avoid linear search of the tail root node. This has to be done when 00523 // a suffix is added to an existing word. Adding words by decreasing 00524 // length avoids this problem entirely. Words can still be added in 00525 // any order, but it is faster to add the longest first. 00526 SquishedDawg *Trie::trie_to_dawg() { 00527 root_back_freelist_.clear(); // Will be invalided by trie_to_dawg. 00528 if (debug_level_ > 2) { 00529 print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY); 00530 } 00531 NODE_MARKER reduced_nodes = new bool[nodes_.size()]; 00532 for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0; 00533 this->reduce_node_input(0, reduced_nodes); 00534 delete[] reduced_nodes; 00535 00536 if (debug_level_ > 2) { 00537 print_all("After reduction:", MAX_NODE_EDGES_DISPLAY); 00538 } 00539 // Build a translation map from node indices in nodes_ vector to 00540 // their target indices in EDGE_ARRAY. 00541 NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1]; 00542 int i, j; 00543 node_ref_map[0] = 0; 00544 for (i = 0; i < nodes_.size(); ++i) { 00545 node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size(); 00546 } 00547 int num_forward_edges = node_ref_map[i]; 00548 00549 // Convert nodes_ vector into EDGE_ARRAY translating the next node references 00550 // in edges using node_ref_map. Empty nodes and backward edges are dropped. 00551 EDGE_ARRAY edge_array = 00552 (EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD)); 00553 EDGE_ARRAY edge_array_ptr = edge_array; 00554 for (i = 0; i < nodes_.size(); ++i) { 00555 TRIE_NODE_RECORD *node_ptr = nodes_[i]; 00556 int end = node_ptr->forward_edges.size(); 00557 for (j = 0; j < end; ++j) { 00558 EDGE_RECORD &edge_rec = node_ptr->forward_edges[j]; 00559 NODE_REF node_ref = next_node_from_edge_rec(edge_rec); 00560 ASSERT_HOST(node_ref < nodes_.size()); 00561 UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec); 00562 link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE, 00563 end_of_word_from_edge_rec(edge_rec), unichar_id); 00564 if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr); 00565 ++edge_array_ptr; 00566 } 00567 } 00568 delete[] node_ref_map; 00569 00570 return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, 00571 perm_, unicharset_size_, debug_level_); 00572 } 00573 00574 bool Trie::eliminate_redundant_edges(NODE_REF node, 00575 const EDGE_RECORD &edge1, 00576 const EDGE_RECORD &edge2) { 00577 if (debug_level_ > 1) { 00578 tprintf("\nCollapsing node %d:\n", node); 00579 print_node(node, MAX_NODE_EDGES_DISPLAY); 00580 tprintf("Candidate edges: "); 00581 print_edge_rec(edge1); 00582 tprintf(", "); 00583 print_edge_rec(edge2); 00584 tprintf("\n\n"); 00585 } 00586 NODE_REF next_node1 = next_node_from_edge_rec(edge1); 00587 NODE_REF next_node2 = next_node_from_edge_rec(edge2); 00588 TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2]; 00589 // Translate all edges going to/from next_node2 to go to/from next_node1. 00590 EDGE_RECORD *edge_ptr = NULL; 00591 EDGE_INDEX edge_index; 00592 int i; 00593 // The backward link in node to next_node2 will be zeroed out by the caller. 00594 // Copy all the backward links in next_node2 to node next_node1 00595 for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) { 00596 const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i]; 00597 NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge); 00598 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge); 00599 int curr_word_end = end_of_word_from_edge_rec(bkw_edge); 00600 bool marker_flag = marker_flag_from_edge_rec(bkw_edge); 00601 add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, 00602 curr_word_end, curr_unichar_id); 00603 // Relocate the corresponding forward edge in curr_next_node 00604 ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, 00605 curr_word_end, curr_unichar_id, 00606 &edge_ptr, &edge_index)); 00607 set_next_node_in_edge_rec(edge_ptr, next_node1); 00608 } 00609 int next_node2_num_edges = (next_node2_ptr->forward_edges.size() + 00610 next_node2_ptr->backward_edges.size()); 00611 if (debug_level_ > 1) { 00612 tprintf("removed %d edges from node " REFFORMAT "\n", 00613 next_node2_num_edges, next_node2); 00614 } 00615 next_node2_ptr->forward_edges.clear(); 00616 next_node2_ptr->backward_edges.clear(); 00617 num_edges_ -= next_node2_num_edges; 00618 return true; 00619 } 00620 00621 bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, 00622 UNICHAR_ID unichar_id, 00623 NODE_REF node, 00624 EDGE_VECTOR* backward_edges, 00625 NODE_MARKER reduced_nodes) { 00626 if (debug_level_ > 1) 00627 tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index); 00628 // Compare each of the edge pairs with the given unichar_id. 00629 bool did_something = false; 00630 for (int i = edge_index; i < backward_edges->size() - 1; ++i) { 00631 // Find the first edge that can be eliminated. 00632 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID; 00633 while (i < backward_edges->size()) { 00634 if (!DeadEdge((*backward_edges)[i])) { 00635 curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]); 00636 if (curr_unichar_id != unichar_id) return did_something; 00637 if (can_be_eliminated((*backward_edges)[i])) break; 00638 } 00639 ++i; 00640 } 00641 if (i == backward_edges->size()) break; 00642 const EDGE_RECORD &edge_rec = (*backward_edges)[i]; 00643 // Compare it to the rest of the edges with the given unichar_id. 00644 for (int j = i + 1; j < backward_edges->size(); ++j) { 00645 const EDGE_RECORD &next_edge_rec = (*backward_edges)[j]; 00646 if (DeadEdge(next_edge_rec)) continue; 00647 UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec); 00648 if (next_id != unichar_id) break; 00649 if (end_of_word_from_edge_rec(next_edge_rec) == 00650 end_of_word_from_edge_rec(edge_rec) && 00651 can_be_eliminated(next_edge_rec) && 00652 eliminate_redundant_edges(node, edge_rec, next_edge_rec)) { 00653 reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0; 00654 did_something = true; 00655 KillEdge(&(*backward_edges)[j]); 00656 } 00657 } 00658 } 00659 return did_something; 00660 } 00661 00662 void Trie::sort_edges(EDGE_VECTOR *edges) { 00663 int num_edges = edges->size(); 00664 if (num_edges <= 1) return; 00665 GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD> > sort_vec; 00666 sort_vec.reserve(num_edges); 00667 for (int i = 0; i < num_edges; ++i) { 00668 sort_vec.push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>( 00669 unichar_id_from_edge_rec((*edges)[i]), (*edges)[i])); 00670 } 00671 sort_vec.sort(); 00672 for (int i = 0; i < num_edges; ++i) 00673 (*edges)[i] = sort_vec[i].data; 00674 } 00675 00676 void Trie::reduce_node_input(NODE_REF node, 00677 NODE_MARKER reduced_nodes) { 00678 EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges; 00679 sort_edges(&backward_edges); 00680 if (debug_level_ > 1) { 00681 tprintf("reduce_node_input(node=" REFFORMAT ")\n", node); 00682 print_node(node, MAX_NODE_EDGES_DISPLAY); 00683 } 00684 00685 EDGE_INDEX edge_index = 0; 00686 while (edge_index < backward_edges.size()) { 00687 if (DeadEdge(backward_edges[edge_index])) continue; 00688 UNICHAR_ID unichar_id = 00689 unichar_id_from_edge_rec(backward_edges[edge_index]); 00690 while (reduce_lettered_edges(edge_index, unichar_id, node, 00691 &backward_edges, reduced_nodes)); 00692 while (++edge_index < backward_edges.size()) { 00693 UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]); 00694 if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break; 00695 } 00696 } 00697 reduced_nodes[node] = true; // mark as reduced 00698 00699 if (debug_level_ > 1) { 00700 tprintf("Node " REFFORMAT " after reduction:\n", node); 00701 print_node(node, MAX_NODE_EDGES_DISPLAY); 00702 } 00703 00704 for (int i = 0; i < backward_edges.size(); ++i) { 00705 if (DeadEdge(backward_edges[i])) continue; 00706 NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]); 00707 if (next_node != 0 && !reduced_nodes[next_node]) { 00708 reduce_node_input(next_node, reduced_nodes); 00709 } 00710 } 00711 } 00712 00713 void Trie::print_node(NODE_REF node, int max_num_edges) const { 00714 if (node == NO_EDGE) return; // nothing to print 00715 TRIE_NODE_RECORD *node_ptr = nodes_[node]; 00716 int num_fwd = node_ptr->forward_edges.size(); 00717 int num_bkw = node_ptr->backward_edges.size(); 00718 EDGE_VECTOR *vec; 00719 for (int dir = 0; dir < 2; ++dir) { 00720 if (dir == 0) { 00721 vec = &(node_ptr->forward_edges); 00722 tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw); 00723 } else { 00724 vec = &(node_ptr->backward_edges); 00725 tprintf("\t"); 00726 } 00727 int i; 00728 for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && 00729 i < max_num_edges; ++i) { 00730 if (DeadEdge((*vec)[i])) continue; 00731 print_edge_rec((*vec)[i]); 00732 tprintf(" "); 00733 } 00734 if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("..."); 00735 tprintf("\n"); 00736 } 00737 } 00738 00739 } // namespace tesseract