tesseract 3.04.01

dict/trie.h

Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        trie.h  (Formerly trie.h)
00005  * Description:  Functions to build a trie data structure.
00006  * Author:       Mark Seaman, SW Productivity
00007  * Created:      Fri Oct 16 14:37:00 1987
00008  * Modified:     Fri Jul 26 11:26:34 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Reusable Software Component
00012  *
00013  * (c) Copyright 1987, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  *********************************************************************************/
00025 #ifndef TRIE_H
00026 #define TRIE_H
00027 
00028 #include "dawg.h"
00029 #include "cutil.h"
00030 #include "genericvector.h"
00031 
00032 class UNICHARSET;
00033 
00034 // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
00035 // max int32, we will need to change GenericVector to use int64 for size
00036 // and address indices. This does not seem to be needed immediately,
00037 // since currently the largest number of edges limit used by tesseract
00038 // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
00039 // There are also int casts below to satisfy the WIN32 compiler that would
00040 // need to be changed.
00041 // It might be cleanest to change the types of most of the Trie/Dawg related
00042 // typedefs to int and restrict the casts to extracting these values from
00043 // the 64 bit EDGE_RECORD.
00044 typedef inT64 EDGE_INDEX;  // index of an edge in a given node
00045 typedef bool *NODE_MARKER;
00046 typedef GenericVector<EDGE_RECORD> EDGE_VECTOR;
00047 
00048 struct TRIE_NODE_RECORD {
00049   EDGE_VECTOR forward_edges;
00050   EDGE_VECTOR backward_edges;
00051 };
00052 typedef GenericVector<TRIE_NODE_RECORD *> TRIE_NODES;
00053 
00054 namespace tesseract {
00055 
00062 class Trie : public Dawg {
00063  public:
00064   enum RTLReversePolicy {
00065     RRP_DO_NO_REVERSE,
00066     RRP_REVERSE_IF_HAS_RTL,
00067     RRP_FORCE_REVERSE,
00068   };
00069 
00070   // Minimum number of concrete characters at the beginning of user patterns.
00071   static const int kSaneNumConcreteChars = 0;
00072   // Various unicode whitespace characters are used to denote unichar patterns,
00073   // (character classifier would never produce these whitespace characters as a
00074   // valid classification).
00075   static const char kAlphaPatternUnicode[];
00076   static const char kDigitPatternUnicode[];
00077   static const char kAlphanumPatternUnicode[];
00078   static const char kPuncPatternUnicode[];
00079   static const char kLowerPatternUnicode[];
00080   static const char kUpperPatternUnicode[];
00081 
00082   static const char *get_reverse_policy_name(
00083       RTLReversePolicy reverse_policy);
00084 
00085   // max_num_edges argument allows limiting the amount of memory this
00086   // Trie can consume (if a new word insert would cause the Trie to
00087   // contain more edges than max_num_edges, all the edges are cleared
00088   // so that new inserts can proceed).
00089   Trie(DawgType type, const STRING &lang, PermuterType perm,
00090        int unicharset_size, int debug_level) {
00091     init(type, lang, perm, unicharset_size, debug_level);
00092     num_edges_ = 0;
00093     deref_node_index_mask_ = ~letter_mask_;
00094     new_dawg_node();  // need to allocate node 0
00095     initialized_patterns_ = false;
00096   }
00097   virtual ~Trie() { nodes_.delete_data_pointers(); }
00098 
00099   // Reset the Trie to empty.
00100   void clear();
00101 
00103   EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id,
00104                         bool word_end) const {
00105     EDGE_RECORD *edge_ptr;
00106     EDGE_INDEX edge_index;
00107     if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
00108                       &edge_ptr, &edge_index)) return NO_EDGE;
00109     return make_edge_ref(node_ref, edge_index);
00110   }
00111 
00116   void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
00117                       bool word_end) const {
00118     const EDGE_VECTOR &forward_edges =
00119       nodes_[static_cast<int>(node)]->forward_edges;
00120     for (int i = 0; i < forward_edges.size(); ++i) {
00121       if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
00122         vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
00123                                  make_edge_ref(node, i)));
00124       }
00125     }
00126   }
00127 
00132   NODE_REF next_node(EDGE_REF edge_ref) const {
00133     if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
00134     return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
00135   }
00136 
00141   bool end_of_word(EDGE_REF edge_ref) const {
00142     if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
00143     return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
00144   }
00145 
00147   UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
00148     if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
00149     return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
00150   }
00151   // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
00152   // the edge dead.
00153   void KillEdge(EDGE_RECORD* edge_rec) const {
00154     *edge_rec &= ~letter_mask_;
00155     *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
00156   }
00157   bool DeadEdge(const EDGE_RECORD& edge_rec) const {
00158     return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
00159   }
00160 
00161   // Prints the contents of the node indicated by the given NODE_REF.
00162   // At most max_num_edges will be printed.
00163   void print_node(NODE_REF node, int max_num_edges) const;
00164 
00165   // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
00166   // Eliminates redundant edges and returns the pointer to the SquishedDawg.
00167   // Note: the caller is responsible for deallocating memory associated
00168   // with the returned SquishedDawg pointer.
00169   SquishedDawg *trie_to_dawg();
00170 
00171   // Reads a list of words from the given file and adds into the Trie.
00172   // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
00173   // policy and information in the unicharset.
00174   // Returns false on error.
00175   bool read_and_add_word_list(const char *filename,
00176                               const UNICHARSET &unicharset,
00177                               Trie::RTLReversePolicy reverse);
00178 
00179   // Reads a list of words from the given file, applying the reverse_policy,
00180   // according to information in the unicharset.
00181   // Returns false on error.
00182   bool read_word_list(const char *filename,
00183                       const UNICHARSET &unicharset,
00184                       Trie::RTLReversePolicy reverse_policy,
00185                       GenericVector<STRING>* words);
00186   // Adds a list of words previously read using read_word_list to the trie
00187   // using the given unicharset to convert to unichar-ids.
00188   // Returns false on error.
00189   bool add_word_list(const GenericVector<STRING>& words,
00190                      const UNICHARSET &unicharset);
00191 
00192   // Inserts the list of patterns from the given file into the Trie.
00193   // The pattern list file should contain one pattern per line in UTF-8 format.
00194   //
00195   // Each pattern can contain any non-whitespace characters, however only the
00196   // patterns that contain characters from the unicharset of the corresponding
00197   // language will be useful.
00198   // The only meta character is '\'. To be used in a pattern as an ordinary
00199   // string it should be escaped with '\' (e.g. string "C:\Documents" should
00200   // be written in the patterns file as "C:\\Documents").
00201   // This function supports a very limited regular expression syntax. One can
00202   // express a character, a certain character class and a number of times the
00203   // entity should be repeated in the pattern.
00204   //
00205   // To denote a character class use one of:
00206   // \c - unichar for which UNICHARSET::get_isalpha() is true (character)
00207   // \d - unichar for which UNICHARSET::get_isdigit() is true
00208   // \n - unichar for which UNICHARSET::get_isdigit() and
00209   //      UNICHARSET::isalpha() are true
00210   // \p - unichar for which UNICHARSET::get_ispunct() is true
00211   // \a - unichar for which UNICHARSET::get_islower() is true
00212   // \A - unichar for which UNICHARSET::get_isupper() is true
00213   //
00214   // \* could be specified after each character or pattern to indicate that
00215   // the character/pattern can be repeated any number of times before the next
00216   // character/pattern occurs.
00217   //
00218   // Examples:
00219   // 1-8\d\d-GOOG-411 will be expanded to strings:
00220   // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
00221   //
00222   // http://www.\n\*.com will be expanded to strings like:
00223   // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
00224   //
00225   // Note: In choosing which patterns to include please be aware of the fact
00226   // providing very generic patterns will make tesseract run slower.
00227   // For example \n\* at the beginning of the pattern will make Tesseract
00228   // consider all the combinations of proposed character choices for each
00229   // of the segmentations, which will be unacceptably slow.
00230   // Because of potential problems with speed that could be difficult to
00231   // identify, each user pattern has to have at least kSaneNumConcreteChars
00232   // concrete characters from the unicharset at the beginning.
00233   bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
00234 
00235   // Initializes the values of *_pattern_ unichar ids.
00236   // This function should be called before calling read_pattern_list().
00237   void initialize_patterns(UNICHARSET *unicharset);
00238 
00239   // Fills in the given unichar id vector with the unichar ids that represent
00240   // the patterns of the character classes of the given unichar_id.
00241   void unichar_id_to_patterns(UNICHAR_ID unichar_id,
00242                               const UNICHARSET &unicharset,
00243                               GenericVector<UNICHAR_ID> *vec) const;
00244 
00245   // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
00246   // a self loop and the given unichar_id matches the unichar_id stored in the
00247   // EDGE_RECORD, returns NO_EDGE otherwise.
00248   virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref,
00249                                      UNICHAR_ID unichar_id,
00250                                      bool word_end) const {
00251     if (edge_ref == NO_EDGE) return NO_EDGE;
00252     EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
00253     return (marker_flag_from_edge_rec(*edge_rec) &&
00254             unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
00255             word_end == end_of_word_from_edge_rec(*edge_rec)) ?
00256             edge_ref : NO_EDGE;
00257   }
00258 
00259   // Adds a word to the Trie (creates the necessary nodes and edges).
00260   //
00261   // If repetitions vector is not NULL, each entry in the vector indicates
00262   // whether the unichar id with the corresponding index in the word is allowed
00263   // to repeat an unlimited number of times. For each entry that is true, MARKER
00264   // flag of the corresponding edge created for this unichar id is set to true).
00265   //
00266   // Return true if add succeeded, false otherwise (e.g. when a word contained
00267   // an invalid unichar id or the trie was getting too large and was cleared).
00268   bool add_word_to_dawg(const WERD_CHOICE &word,
00269                         const GenericVector<bool> *repetitions);
00270   bool add_word_to_dawg(const WERD_CHOICE &word) {
00271     return add_word_to_dawg(word, NULL);
00272   }
00273 
00274  protected:
00275   // The structure of an EDGE_REF for Trie edges is as follows:
00276   // [LETTER_START_BIT, flag_start_bit_):
00277   //                             edge index in *_edges in a TRIE_NODE_RECORD
00278   // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)
00279   //
00280   // With this arrangement there are enough bits to represent edge indices
00281   // (each node can have at most unicharset_size_ forward edges and
00282   // the position of flag_start_bit is set to be log2(unicharset_size_)).
00283   // It is also possible to accommodate a maximum number of nodes that is at
00284   // least as large as that of the SquishedDawg representation (in SquishedDawg
00285   // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
00286   // the next node index).
00287   //
00288 
00289   // Returns the pointer to EDGE_RECORD after decoding the location
00290   // of the edge from the information in the given EDGE_REF.
00291   // This function assumes that EDGE_REF holds valid node/edge indices.
00292   inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
00293     int edge_index = static_cast<int>(
00294       (edge_ref & letter_mask_) >> LETTER_START_BIT);
00295     int node_index = static_cast<int>(
00296       (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
00297     TRIE_NODE_RECORD *node_rec = nodes_[node_index];
00298     return &(node_rec->forward_edges[edge_index]);
00299   }
00301   inline EDGE_REF make_edge_ref(NODE_REF node_index,
00302                                 EDGE_INDEX edge_index) const {
00303     return ((node_index << flag_start_bit_) |
00304             (edge_index << LETTER_START_BIT));
00305   }
00307   inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats,
00308                         int direction, bool word_end, UNICHAR_ID unichar_id) {
00309     EDGE_RECORD flags = 0;
00310     if (repeats) flags |= MARKER_FLAG;
00311     if (word_end) flags |= WERD_END_FLAG;
00312     if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
00313     *edge = ((nxt << next_node_start_bit_) |
00314              (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
00315              (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
00316   }
00318   inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
00319     tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
00320             marker_flag_from_edge_rec(edge_rec) ? "R," : "",
00321             (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
00322             end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
00323             unichar_id_from_edge_rec(edge_rec));
00324   }
00325   // Returns true if the next node in recorded the given EDGE_RECORD
00326   // has exactly one forward edge.
00327   inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
00328     NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
00329     return (node_ref != NO_EDGE &&
00330             nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
00331   }
00332 
00333   // Prints the contents of the Trie.
00334   // At most max_num_edges will be printed for each node.
00335   void print_all(const char* msg, int max_num_edges) {
00336     tprintf("\n__________________________\n%s\n", msg);
00337     for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
00338     tprintf("__________________________\n");
00339   }
00340 
00341   // Finds the edge with the given direction, word_end and unichar_id
00342   // in the node indicated by node_ref. Fills in the pointer to the
00343   // EDGE_RECORD and the index of the edge with the the values
00344   // corresponding to the edge found. Returns true if an edge was found.
00345   bool edge_char_of(NODE_REF node_ref, NODE_REF next_node,
00346                     int direction, bool word_end, UNICHAR_ID unichar_id,
00347                     EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
00348 
00349   // Adds an single edge linkage between node1 and node2 in the direction
00350   // indicated by direction argument.
00351   bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats,
00352                         int direction, bool word_end,
00353                         UNICHAR_ID unichar_id);
00354 
00355   // Adds forward edge linkage from node1 to node2 and the corresponding
00356   // backward edge linkage in the other direction.
00357   bool add_new_edge(NODE_REF node1, NODE_REF node2,
00358                     bool repeats, bool word_end, UNICHAR_ID unichar_id) {
00359     return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
00360                              word_end, unichar_id) &&
00361             add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
00362                              word_end, unichar_id));
00363   }
00364 
00365   // Sets the word ending flags in an already existing edge pair.
00366   // Returns true on success.
00367   void add_word_ending(EDGE_RECORD *edge,
00368                        NODE_REF the_next_node,
00369                        bool repeats,
00370                        UNICHAR_ID unichar_id);
00371 
00372   // Allocates space for a new node in the Trie.
00373   NODE_REF new_dawg_node();
00374 
00375   // Removes a single edge linkage to between node1 and node2 in the
00376   // direction indicated by direction argument.
00377   void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
00378                            bool word_end, UNICHAR_ID unichar_id);
00379 
00380   // Removes forward edge linkage from node1 to node2 and the corresponding
00381   // backward edge linkage in the other direction.
00382   void remove_edge(NODE_REF node1, NODE_REF node2,
00383                    bool word_end, UNICHAR_ID unichar_id) {
00384     remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
00385     remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
00386   }
00387 
00388   // Compares edge1 and edge2 in the given node to see if they point to two
00389   // next nodes that could be collapsed. If they do, performs the reduction
00390   // and returns true.
00391   bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
00392                                  const EDGE_RECORD &edge2);
00393 
00394   // Assuming that edge_index indicates the first edge in a group of edges
00395   // in this node with a particular letter value, looks through these edges
00396   // to see if any of them can be collapsed. If so does it. Returns to the
00397   // caller when all edges with this letter have been reduced.
00398   // Returns true if further reduction is possible with this same letter.
00399   bool reduce_lettered_edges(EDGE_INDEX edge_index,
00400                              UNICHAR_ID unichar_id,
00401                              NODE_REF node,
00402                              EDGE_VECTOR* backward_edges,
00403                              NODE_MARKER reduced_nodes);
00404 
00411   void sort_edges(EDGE_VECTOR *edges);
00412 
00414   void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes);
00415 
00416   // Returns the pattern unichar id for the given character class code.
00417   UNICHAR_ID character_class_to_pattern(char ch);
00418 
00419   // Member variables
00420   TRIE_NODES nodes_;              // vector of nodes in the Trie
00421   uinT64 num_edges_;              // sum of all edges (forward and backward)
00422   uinT64 deref_direction_mask_;   // mask for EDGE_REF to extract direction
00423   uinT64 deref_node_index_mask_;  // mask for EDGE_REF to extract node index
00424   // Freelist of edges in the root backwards node that were previously zeroed.
00425   GenericVector<EDGE_INDEX> root_back_freelist_;
00426   // Variables for translating character class codes denoted in user patterns
00427   // file to the unichar ids used to represent them in a Trie.
00428   bool initialized_patterns_;
00429   UNICHAR_ID alpha_pattern_;
00430   UNICHAR_ID digit_pattern_;
00431   UNICHAR_ID alphanum_pattern_;
00432   UNICHAR_ID punc_pattern_;
00433   UNICHAR_ID lower_pattern_;
00434   UNICHAR_ID upper_pattern_;
00435 };
00436 }  // namespace tesseract
00437 
00438 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines