tesseract 3.04.01

dict/trie.cpp

Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        trie.c  (Formerly trie.c)
00005  * Description:  Functions to build a trie data structure.
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Fri Oct 16 14:37:00 1987
00008  * Modified:     Fri Jul 26 12:18:10 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Reusable Software Component
00012  *
00013  * (c) Copyright 1987, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  *********************************************************************************/
00025 /*----------------------------------------------------------------------
00026               I n c l u d e s
00027 ----------------------------------------------------------------------*/
00028 #ifdef _MSC_VER
00029 #pragma warning(disable:4244)  // Conversion warnings
00030 #pragma warning(disable:4800)  // int/bool warnings
00031 #endif
00032 #include "trie.h"
00033 
00034 #include "callcpp.h"
00035 #include "dawg.h"
00036 #include "dict.h"
00037 #include "freelist.h"
00038 #include "genericvector.h"
00039 #include "helpers.h"
00040 #include "kdpair.h"
00041 
00042 namespace tesseract {
00043 
00044 const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
00045 const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
00046 const char kForceReverse[] = "RRP_FORCE_REVERSE";
00047 
00048 const char * const RTLReversePolicyNames[] = {
00049   kDoNotReverse,
00050   kReverseIfHasRTL,
00051   kForceReverse
00052 };
00053 
00054 const char Trie::kAlphaPatternUnicode[] = "\u2000";
00055 const char Trie::kDigitPatternUnicode[] = "\u2001";
00056 const char Trie::kAlphanumPatternUnicode[] = "\u2002";
00057 const char Trie::kPuncPatternUnicode[] = "\u2003";
00058 const char Trie::kLowerPatternUnicode[] = "\u2004";
00059 const char Trie::kUpperPatternUnicode[] = "\u2005";
00060 
00061 const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
00062   return RTLReversePolicyNames[reverse_policy];
00063 }
00064 
00065 // Reset the Trie to empty.
00066 void Trie::clear() {
00067   nodes_.delete_data_pointers();
00068   nodes_.clear();
00069   root_back_freelist_.clear();
00070   num_edges_ = 0;
00071   new_dawg_node();  // Need to allocate node 0.
00072 }
00073 
00074 bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
00075                         int direction, bool word_end, UNICHAR_ID unichar_id,
00076                         EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const {
00077   if (debug_level_ == 3) {
00078     tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
00079             " direction %d word_end %d unichar_id %d, exploring node:\n",
00080             node_ref, next_node, direction, word_end, unichar_id);
00081     if (node_ref != NO_EDGE) {
00082       print_node(node_ref, nodes_[node_ref]->forward_edges.size());
00083     }
00084   }
00085   if (node_ref == NO_EDGE) return false;
00086   assert(node_ref < nodes_.size());
00087   EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
00088     nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
00089   int vec_size = vec.size();
00090   if (node_ref == 0 && direction == FORWARD_EDGE) {  // binary search
00091     EDGE_INDEX start = 0;
00092     EDGE_INDEX end = vec_size - 1;
00093     EDGE_INDEX k;
00094     int compare;
00095     while (start <= end) {
00096       k = (start + end) >> 1;  // (start + end) / 2
00097       compare = given_greater_than_edge_rec(next_node, word_end,
00098                                             unichar_id, vec[k]);
00099       if (compare == 0) {  // given == vec[k]
00100         *edge_ptr = &(vec[k]);
00101         *edge_index = k;
00102         return true;
00103       } else if (compare == 1) {  // given > vec[k]
00104         start = k + 1;
00105       } else {  // given < vec[k]
00106         end = k - 1;
00107       }
00108     }
00109   } else {  // linear search
00110     for (int i = 0; i < vec_size; ++i) {
00111       EDGE_RECORD &edge_rec = vec[i];
00112       if (edge_rec_match(next_node, word_end, unichar_id,
00113                          next_node_from_edge_rec(edge_rec),
00114                          end_of_word_from_edge_rec(edge_rec),
00115                          unichar_id_from_edge_rec(edge_rec))) {
00116         *edge_ptr = &(edge_rec);
00117         *edge_index = i;
00118         return true;
00119       }
00120     }
00121   }
00122   return false;  // not found
00123 }
00124 
00125 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
00126                             int direction, bool word_end,
00127                             UNICHAR_ID unichar_id) {
00128   EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
00129     &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
00130   int search_index;
00131   if (node1 == 0 && direction == FORWARD_EDGE) {
00132     search_index = 0;  // find the index to make the add sorted
00133     while (search_index < vec->size() &&
00134            given_greater_than_edge_rec(node2, word_end, unichar_id,
00135                                        (*vec)[search_index]) == 1) {
00136       search_index++;
00137     }
00138   } else {
00139     search_index = vec->size();  // add is unsorted, so index does not matter
00140   }
00141   EDGE_RECORD edge_rec;
00142   link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
00143   if (node1 == 0 && direction == BACKWARD_EDGE &&
00144       !root_back_freelist_.empty()) {
00145     EDGE_INDEX edge_index = root_back_freelist_.pop_back();
00146     (*vec)[edge_index] = edge_rec;
00147   } else if (search_index < vec->size()) {
00148     vec->insert(edge_rec, search_index);
00149   } else {
00150     vec->push_back(edge_rec);
00151   }
00152   if (debug_level_ > 1) {
00153     tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
00154     print_edge_rec(edge_rec);
00155     tprintf("\n");
00156   }
00157   num_edges_++;
00158   return true;
00159 }
00160 
00161 void Trie::add_word_ending(EDGE_RECORD *edge_ptr,
00162                            NODE_REF the_next_node,
00163                            bool marker_flag,
00164                            UNICHAR_ID unichar_id) {
00165   EDGE_RECORD *back_edge_ptr;
00166   EDGE_INDEX back_edge_index;
00167   ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
00168                            unichar_id, &back_edge_ptr, &back_edge_index));
00169   if (marker_flag) {
00170     *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
00171     *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
00172   }
00173   // Mark both directions as end of word.
00174   *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
00175   *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
00176 }
00177 
00178 bool Trie::add_word_to_dawg(const WERD_CHOICE &word,
00179                             const GenericVector<bool> *repetitions) {
00180   if (word.length() <= 0) return false;  // can't add empty words
00181   if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
00182   // Make sure the word does not contain invalid unchar ids.
00183   for (int i = 0; i < word.length(); ++i) {
00184     if (word.unichar_id(i) < 0 ||
00185         word.unichar_id(i) >= unicharset_size_) return false;
00186   }
00187 
00188   EDGE_RECORD *edge_ptr;
00189   NODE_REF last_node = 0;
00190   NODE_REF the_next_node;
00191   bool marker_flag = false;
00192   EDGE_INDEX edge_index;
00193   int i;
00194   inT32 still_finding_chars = true;
00195   inT32 word_end = false;
00196   bool  add_failed = false;
00197   bool found;
00198 
00199   if (debug_level_ > 1) word.print("\nAdding word: ");
00200 
00201   UNICHAR_ID unichar_id;
00202   for (i = 0; i < word.length() - 1; ++i) {
00203     unichar_id = word.unichar_id(i);
00204     marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
00205     if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
00206     if (still_finding_chars) {
00207       found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
00208                            unichar_id, &edge_ptr, &edge_index);
00209       if (found && debug_level_ > 1) {
00210         tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
00211                 edge_index, last_node);
00212       }
00213       if (!found) {
00214         still_finding_chars = false;
00215       } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
00216         // We hit the end of an existing word, but the new word is longer.
00217         // In this case we have to disconnect the existing word from the
00218         // backwards root node, mark the current position as end-of-word
00219         // and add new nodes for the increased length. Disconnecting the
00220         // existing word from the backwards root node requires a linear
00221         // search, so it is much faster to add the longest words first,
00222         // to avoid having to come here.
00223         word_end = true;
00224         still_finding_chars = false;
00225         remove_edge(last_node, 0, word_end, unichar_id);
00226       } else {
00227         // We have to add a new branch here for the new word.
00228         if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
00229         last_node = next_node_from_edge_rec(*edge_ptr);
00230       }
00231     }
00232     if (!still_finding_chars) {
00233       the_next_node = new_dawg_node();
00234       if (debug_level_ > 1)
00235         tprintf("adding node " REFFORMAT "\n", the_next_node);
00236       if (the_next_node == 0) {
00237         add_failed = true;
00238         break;
00239       }
00240       if (!add_new_edge(last_node, the_next_node,
00241                         marker_flag, word_end, unichar_id)) {
00242         add_failed = true;
00243         break;
00244       }
00245       word_end = false;
00246       last_node = the_next_node;
00247     }
00248   }
00249   the_next_node = 0;
00250   unichar_id = word.unichar_id(i);
00251   marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
00252   if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
00253   if (still_finding_chars &&
00254       edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
00255                    unichar_id, &edge_ptr, &edge_index)) {
00256     // An extension of this word already exists in the trie, so we
00257     // only have to add the ending flags in both directions.
00258     add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
00259                     marker_flag, unichar_id);
00260   } else {
00261     // Add a link to node 0. All leaves connect to node 0 so the back links can
00262     // be used in reduction to a dawg. This root backward node has one edge
00263     // entry for every word, (except prefixes of longer words) so it is huge.
00264     if (!add_failed &&
00265         !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
00266       add_failed = true;
00267   }
00268   if (add_failed) {
00269     tprintf("Re-initializing document dictionary...\n");
00270     clear();
00271     return false;
00272   } else {
00273     return true;
00274   }
00275 }
00276 
00277 NODE_REF Trie::new_dawg_node() {
00278   TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD();
00279   if (node == NULL) return 0;  // failed to create new node
00280   nodes_.push_back(node);
00281   return nodes_.length() - 1;
00282 }
00283 
00284 // Sort function to sort words by decreasing order of length.
00285 static int sort_strings_by_dec_length(const void* v1, const void* v2) {
00286   const STRING* s1 = reinterpret_cast<const STRING*>(v1);
00287   const STRING* s2 = reinterpret_cast<const STRING*>(v2);
00288   return s2->length() - s1->length();
00289 }
00290 
00291 bool Trie::read_and_add_word_list(const char *filename,
00292                                   const UNICHARSET &unicharset,
00293                                   Trie::RTLReversePolicy reverse_policy) {
00294   GenericVector<STRING> word_list;
00295   if (!read_word_list(filename, unicharset, reverse_policy, &word_list))
00296     return false;
00297   word_list.sort(sort_strings_by_dec_length);
00298   return add_word_list(word_list, unicharset);
00299 }
00300 
00301 bool Trie::read_word_list(const char *filename,
00302                           const UNICHARSET &unicharset,
00303                           Trie::RTLReversePolicy reverse_policy,
00304                           GenericVector<STRING>* words) {
00305   FILE *word_file;
00306   char string[CHARS_PER_LINE];
00307   int  word_count = 0;
00308 
00309   word_file = fopen(filename, "rb");
00310   if (word_file == NULL) return false;
00311 
00312   while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
00313     chomp_string(string);  // remove newline
00314     WERD_CHOICE word(string, unicharset);
00315     if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
00316         word.has_rtl_unichar_id()) ||
00317         reverse_policy == RRP_FORCE_REVERSE) {
00318       word.reverse_and_mirror_unichar_ids();
00319     }
00320     ++word_count;
00321     if (debug_level_ && word_count % 10000 == 0)
00322       tprintf("Read %d words so far\n", word_count);
00323     if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
00324       words->push_back(word.unichar_string());
00325     } else if (debug_level_) {
00326       tprintf("Skipping invalid word %s\n", string);
00327       if (debug_level_ >= 3) word.print();
00328     }
00329   }
00330   if (debug_level_)
00331     tprintf("Read %d words total.\n", word_count);
00332   fclose(word_file);
00333   return true;
00334 }
00335 
00336 bool Trie::add_word_list(const GenericVector<STRING>& words,
00337                    const UNICHARSET &unicharset) {
00338   for (int i = 0; i < words.size(); ++i) {
00339     WERD_CHOICE word(words[i].string(), unicharset);
00340     if (!word_in_dawg(word)) {
00341       add_word_to_dawg(word);
00342       if (!word_in_dawg(word)) {
00343         tprintf("Error: word '%s' not in DAWG after adding it\n",
00344                 words[i].string());
00345         return false;
00346       }
00347     }
00348   }
00349   return true;
00350 }
00351 
00352 void Trie::initialize_patterns(UNICHARSET *unicharset) {
00353   unicharset->unichar_insert(kAlphaPatternUnicode);
00354   alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode);
00355   unicharset->unichar_insert(kDigitPatternUnicode);
00356   digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode);
00357   unicharset->unichar_insert(kAlphanumPatternUnicode);
00358   alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode);
00359   unicharset->unichar_insert(kPuncPatternUnicode);
00360   punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode);
00361   unicharset->unichar_insert(kLowerPatternUnicode);
00362   lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode);
00363   unicharset->unichar_insert(kUpperPatternUnicode);
00364   upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
00365   initialized_patterns_ = true;
00366   unicharset_size_ = unicharset->size();
00367 }
00368 
00369 void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id,
00370                                   const UNICHARSET &unicharset,
00371                                   GenericVector<UNICHAR_ID> *vec) const {
00372   bool is_alpha = unicharset.get_isalpha(unichar_id);
00373   if (is_alpha) {
00374     vec->push_back(alpha_pattern_);
00375     vec->push_back(alphanum_pattern_);
00376     if (unicharset.get_islower(unichar_id)) {
00377       vec->push_back(lower_pattern_);
00378     } else if (unicharset.get_isupper(unichar_id)) {
00379       vec->push_back(upper_pattern_);
00380     }
00381   }
00382   if (unicharset.get_isdigit(unichar_id)) {
00383     vec->push_back(digit_pattern_);
00384     if (!is_alpha) vec->push_back(alphanum_pattern_);
00385   }
00386   if (unicharset.get_ispunctuation(unichar_id)) {
00387     vec->push_back(punc_pattern_);
00388   }
00389 }
00390 
00391 UNICHAR_ID Trie::character_class_to_pattern(char ch) {
00392   if (ch == 'c') {
00393     return alpha_pattern_;
00394   } else if (ch == 'd') {
00395     return digit_pattern_;
00396   } else if (ch == 'n') {
00397     return alphanum_pattern_;
00398   } else if (ch == 'p') {
00399     return punc_pattern_;
00400   } else if (ch == 'a') {
00401     return lower_pattern_;
00402   } else if (ch == 'A') {
00403     return upper_pattern_;
00404   } else {
00405     return INVALID_UNICHAR_ID;
00406   }
00407 }
00408 
00409 bool Trie::read_pattern_list(const char *filename,
00410                              const UNICHARSET &unicharset) {
00411   if (!initialized_patterns_) {
00412     tprintf("please call initialize_patterns() before read_pattern_list()\n");
00413     return false;
00414   }
00415 
00416   FILE *pattern_file = fopen(filename, "rb");
00417   if (pattern_file == NULL) {
00418     tprintf("Error opening pattern file %s\n", filename);
00419     return false;
00420   }
00421 
00422   int pattern_count = 0;
00423   char string[CHARS_PER_LINE];
00424   while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) {
00425     chomp_string(string);  // remove newline
00426     // Parse the pattern and construct a unichar id vector.
00427     // Record the number of repetitions of each unichar in the parallel vector.
00428     WERD_CHOICE word(&unicharset);
00429     GenericVector<bool> repetitions_vec;
00430     const char *str_ptr = string;
00431     int step = unicharset.step(str_ptr);
00432     bool failed = false;
00433     while (step > 0) {
00434       UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
00435       if (step == 1 && *str_ptr == '\\') {
00436         ++str_ptr;
00437         if (*str_ptr == '\\') {  // regular '\' unichar that was escaped
00438           curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
00439         } else {
00440           if (word.length() < kSaneNumConcreteChars) {
00441             tprintf("Please provide at least %d concrete characters at the"
00442                     " beginning of the pattern\n", kSaneNumConcreteChars);
00443             failed = true;
00444             break;
00445           }
00446           // Parse character class from expression.
00447           curr_unichar_id = character_class_to_pattern(*str_ptr);
00448         }
00449       } else {
00450         curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
00451       }
00452       if (curr_unichar_id ==  INVALID_UNICHAR_ID) {
00453         failed = true;
00454         break;  // failed to parse this pattern
00455       }
00456       word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
00457       repetitions_vec.push_back(false);
00458       str_ptr += step;
00459       step = unicharset.step(str_ptr);
00460       // Check if there is a repetition pattern specified after this unichar.
00461       if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
00462         repetitions_vec[repetitions_vec.size()-1] = true;
00463         str_ptr += 2;
00464         step = unicharset.step(str_ptr);
00465       }
00466     }
00467     if (failed) {
00468       tprintf("Invalid user pattern %s\n", string);
00469       continue;
00470     }
00471     // Insert the pattern into the trie.
00472     if (debug_level_ > 2) {
00473       tprintf("Inserting expanded user pattern %s\n",
00474               word.debug_string().string());
00475     }
00476     if (!this->word_in_dawg(word)) {
00477       this->add_word_to_dawg(word, &repetitions_vec);
00478       if (!this->word_in_dawg(word)) {
00479         tprintf("Error: failed to insert pattern '%s'\n", string);
00480       }
00481     }
00482     ++pattern_count;
00483   }
00484   if (debug_level_) {
00485     tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
00486   }
00487   fclose(pattern_file);
00488   return true;
00489 }
00490 
00491 void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
00492                                bool word_end, UNICHAR_ID unichar_id) {
00493   EDGE_RECORD *edge_ptr = NULL;
00494   EDGE_INDEX edge_index = 0;
00495   ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
00496                            unichar_id, &edge_ptr, &edge_index));
00497   if (debug_level_ > 1) {
00498     tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
00499     print_edge_rec(*edge_ptr);
00500     tprintf("\n");
00501   }
00502   if (direction == FORWARD_EDGE) {
00503     nodes_[node1]->forward_edges.remove(edge_index);
00504   } else if (node1 == 0) {
00505     KillEdge(&nodes_[node1]->backward_edges[edge_index]);
00506     root_back_freelist_.push_back(edge_index);
00507   } else {
00508     nodes_[node1]->backward_edges.remove(edge_index);
00509   }
00510   --num_edges_;
00511 }
00512 
00513 // Some optimizations employed in add_word_to_dawg and trie_to_dawg:
00514 // 1 Avoid insertion sorting or bubble sorting the tail root node
00515 //   (back links on node 0, a list of all the leaves.). The node is
00516 //   huge, and sorting it with n^2 time is terrible.
00517 // 2 Avoid using GenericVector::remove on the tail root node.
00518 //   (a) During add of words to the trie, zero-out the unichars and
00519 //       keep a freelist of spaces to re-use.
00520 //   (b) During reduction, just zero-out the unichars of deleted back
00521 //       links, skipping zero entries while searching.
00522 // 3 Avoid linear search of the tail root node. This has to be done when
00523 //   a suffix is added to an existing word. Adding words by decreasing
00524 //   length avoids this problem entirely. Words can still be added in
00525 //   any order, but it is faster to add the longest first.
00526 SquishedDawg *Trie::trie_to_dawg() {
00527   root_back_freelist_.clear();  // Will be invalided by trie_to_dawg.
00528   if (debug_level_ > 2) {
00529     print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
00530   }
00531   NODE_MARKER reduced_nodes = new bool[nodes_.size()];
00532   for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0;
00533   this->reduce_node_input(0, reduced_nodes);
00534   delete[] reduced_nodes;
00535 
00536   if (debug_level_ > 2) {
00537     print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
00538   }
00539   // Build a translation map from node indices in nodes_ vector to
00540   // their target indices in EDGE_ARRAY.
00541   NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1];
00542   int i, j;
00543   node_ref_map[0] = 0;
00544   for (i = 0; i < nodes_.size(); ++i) {
00545     node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
00546   }
00547   int num_forward_edges = node_ref_map[i];
00548 
00549   // Convert nodes_ vector into EDGE_ARRAY translating the next node references
00550   // in edges using node_ref_map. Empty nodes and backward edges are dropped.
00551   EDGE_ARRAY edge_array =
00552     (EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD));
00553   EDGE_ARRAY edge_array_ptr = edge_array;
00554   for (i = 0; i < nodes_.size(); ++i) {
00555     TRIE_NODE_RECORD *node_ptr = nodes_[i];
00556     int end = node_ptr->forward_edges.size();
00557     for (j = 0; j < end; ++j) {
00558       EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
00559       NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
00560       ASSERT_HOST(node_ref < nodes_.size());
00561       UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
00562       link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
00563                 end_of_word_from_edge_rec(edge_rec), unichar_id);
00564       if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
00565       ++edge_array_ptr;
00566     }
00567   }
00568   delete[] node_ref_map;
00569 
00570   return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
00571                           perm_, unicharset_size_, debug_level_);
00572 }
00573 
00574 bool Trie::eliminate_redundant_edges(NODE_REF node,
00575                                      const EDGE_RECORD &edge1,
00576                                      const EDGE_RECORD &edge2) {
00577   if (debug_level_ > 1) {
00578     tprintf("\nCollapsing node %d:\n", node);
00579     print_node(node, MAX_NODE_EDGES_DISPLAY);
00580     tprintf("Candidate edges: ");
00581     print_edge_rec(edge1);
00582     tprintf(", ");
00583     print_edge_rec(edge2);
00584     tprintf("\n\n");
00585   }
00586   NODE_REF next_node1 = next_node_from_edge_rec(edge1);
00587   NODE_REF next_node2 = next_node_from_edge_rec(edge2);
00588   TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
00589   // Translate all edges going to/from next_node2 to go to/from next_node1.
00590   EDGE_RECORD *edge_ptr = NULL;
00591   EDGE_INDEX edge_index;
00592   int i;
00593   // The backward link in node to next_node2 will be zeroed out by the caller.
00594   // Copy all the backward links in next_node2 to node next_node1
00595   for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
00596     const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
00597     NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
00598     UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
00599     int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
00600     bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
00601     add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
00602                      curr_word_end, curr_unichar_id);
00603     // Relocate the corresponding forward edge in curr_next_node
00604     ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
00605                              curr_word_end, curr_unichar_id,
00606                              &edge_ptr, &edge_index));
00607     set_next_node_in_edge_rec(edge_ptr, next_node1);
00608   }
00609   int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
00610                               next_node2_ptr->backward_edges.size());
00611   if (debug_level_ > 1) {
00612     tprintf("removed %d edges from node " REFFORMAT "\n",
00613             next_node2_num_edges, next_node2);
00614   }
00615   next_node2_ptr->forward_edges.clear();
00616   next_node2_ptr->backward_edges.clear();
00617   num_edges_ -= next_node2_num_edges;
00618   return true;
00619 }
00620 
00621 bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index,
00622                                  UNICHAR_ID unichar_id,
00623                                  NODE_REF node,
00624                                  EDGE_VECTOR* backward_edges,
00625                                  NODE_MARKER reduced_nodes) {
00626   if (debug_level_ > 1)
00627     tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
00628   // Compare each of the edge pairs with the given unichar_id.
00629   bool did_something = false;
00630   for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
00631     // Find the first edge that can be eliminated.
00632     UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
00633     while (i < backward_edges->size()) {
00634       if (!DeadEdge((*backward_edges)[i])) {
00635         curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
00636         if (curr_unichar_id != unichar_id) return did_something;
00637         if (can_be_eliminated((*backward_edges)[i])) break;
00638       }
00639       ++i;
00640     }
00641     if (i == backward_edges->size()) break;
00642     const EDGE_RECORD &edge_rec = (*backward_edges)[i];
00643     // Compare it to the rest of the edges with the given unichar_id.
00644     for (int j = i + 1; j < backward_edges->size(); ++j) {
00645       const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
00646       if (DeadEdge(next_edge_rec)) continue;
00647       UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
00648       if (next_id != unichar_id) break;
00649       if (end_of_word_from_edge_rec(next_edge_rec) ==
00650           end_of_word_from_edge_rec(edge_rec) &&
00651           can_be_eliminated(next_edge_rec) &&
00652           eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
00653         reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0;
00654         did_something = true;
00655         KillEdge(&(*backward_edges)[j]);
00656       }
00657     }
00658   }
00659   return did_something;
00660 }
00661 
00662 void Trie::sort_edges(EDGE_VECTOR *edges) {
00663   int num_edges = edges->size();
00664   if (num_edges <= 1) return;
00665   GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD> > sort_vec;
00666   sort_vec.reserve(num_edges);
00667   for (int i = 0; i < num_edges; ++i) {
00668     sort_vec.push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>(
00669         unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
00670   }
00671   sort_vec.sort();
00672   for (int i = 0; i < num_edges; ++i)
00673     (*edges)[i] = sort_vec[i].data;
00674 }
00675 
00676 void Trie::reduce_node_input(NODE_REF node,
00677                              NODE_MARKER reduced_nodes) {
00678   EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
00679   sort_edges(&backward_edges);
00680   if (debug_level_ > 1) {
00681     tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
00682     print_node(node, MAX_NODE_EDGES_DISPLAY);
00683   }
00684 
00685   EDGE_INDEX edge_index = 0;
00686   while (edge_index < backward_edges.size()) {
00687     if (DeadEdge(backward_edges[edge_index])) continue;
00688     UNICHAR_ID unichar_id =
00689       unichar_id_from_edge_rec(backward_edges[edge_index]);
00690     while (reduce_lettered_edges(edge_index, unichar_id, node,
00691                                  &backward_edges, reduced_nodes));
00692     while (++edge_index < backward_edges.size()) {
00693       UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
00694       if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break;
00695     }
00696   }
00697   reduced_nodes[node] = true;  // mark as reduced
00698 
00699   if (debug_level_ > 1) {
00700     tprintf("Node " REFFORMAT " after reduction:\n", node);
00701     print_node(node, MAX_NODE_EDGES_DISPLAY);
00702   }
00703 
00704   for (int i = 0; i < backward_edges.size(); ++i) {
00705     if (DeadEdge(backward_edges[i])) continue;
00706     NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
00707     if (next_node != 0 && !reduced_nodes[next_node]) {
00708       reduce_node_input(next_node, reduced_nodes);
00709     }
00710   }
00711 }
00712 
00713 void Trie::print_node(NODE_REF node, int max_num_edges) const {
00714   if (node == NO_EDGE) return;  // nothing to print
00715   TRIE_NODE_RECORD *node_ptr = nodes_[node];
00716   int num_fwd = node_ptr->forward_edges.size();
00717   int num_bkw = node_ptr->backward_edges.size();
00718   EDGE_VECTOR *vec;
00719   for (int dir = 0; dir < 2; ++dir) {
00720     if (dir == 0) {
00721       vec = &(node_ptr->forward_edges);
00722       tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
00723     } else {
00724       vec = &(node_ptr->backward_edges);
00725       tprintf("\t");
00726     }
00727     int i;
00728     for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
00729          i < max_num_edges; ++i) {
00730       if (DeadEdge((*vec)[i])) continue;
00731       print_edge_rec((*vec)[i]);
00732       tprintf(" ");
00733     }
00734     if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
00735     tprintf("\n");
00736   }
00737 }
00738 
00739 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines