tesseract 3.04.01

dict/dawg.cpp

Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        dawg.c  (Formerly dawg.c)
00005  * Description:  Use a Directed Accyclic Word Graph
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Fri Oct 16 14:37:00 1987
00008  * Modified:     Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Reusable Software Component
00012  *
00013  * (c) Copyright 1987, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  *********************************************************************************/
00025 /*----------------------------------------------------------------------
00026               I n c l u d e s
00027 ----------------------------------------------------------------------*/
00028 
00029 #ifdef _MSC_VER
00030 #pragma warning(disable:4244)  // Conversion warnings
00031 #pragma warning(disable:4800)  // int/bool warnings
00032 #endif
00033 #include "dawg.h"
00034 
00035 #include "cutil.h"
00036 #include "dict.h"
00037 #include "emalloc.h"
00038 #include "freelist.h"
00039 #include "helpers.h"
00040 #include "strngs.h"
00041 #include "tesscallback.h"
00042 #include "tprintf.h"
00043 
00044 /*----------------------------------------------------------------------
00045               F u n c t i o n s   f o r   D a w g
00046 ----------------------------------------------------------------------*/
00047 namespace tesseract {
00048 
00049 bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
00050                           bool requires_complete) const {
00051   if (word.length() == 0) return !requires_complete;
00052   NODE_REF node = 0;
00053   int end_index = word.length() - 1;
00054   for (int i = 0; i < end_index; i++) {
00055     EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
00056     if (edge == NO_EDGE) {
00057       return false;
00058     }
00059     if ((node = next_node(edge)) == 0) {
00060       // This only happens if all words following this edge terminate --
00061       // there are no larger words.  See Trie::add_word_to_dawg()
00062       return false;
00063     }
00064   }
00065   // Now check the last character.
00066   return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
00067       NO_EDGE;
00068 }
00069 
00070 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
00071   return prefix_in_dawg(word, true);
00072 }
00073 
00074 int Dawg::check_for_words(const char *filename,
00075                           const UNICHARSET &unicharset,
00076                           bool enable_wildcard) const {
00077   if (filename == NULL) return 0;
00078 
00079   FILE       *word_file;
00080   char       string [CHARS_PER_LINE];
00081   int misses = 0;
00082   UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
00083 
00084   word_file = open_file (filename, "r");
00085 
00086   while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
00087     chomp_string(string);  // remove newline
00088     WERD_CHOICE word(string, unicharset);
00089     if (word.length() > 0 &&
00090         !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
00091       if (!match_words(&word, 0, 0,
00092                        enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
00093         tprintf("Missing word: %s\n", string);
00094         ++misses;
00095       }
00096     } else {
00097       tprintf("Failed to create a valid word from %s\n", string);
00098     }
00099   }
00100   fclose (word_file);
00101   // Make sure the user sees this with fprintf instead of tprintf.
00102   if (debug_level_) tprintf("Number of lost words=%d\n", misses);
00103   return misses;
00104 }
00105 
00106 void Dawg::iterate_words(const UNICHARSET &unicharset,
00107                          TessCallback1<const WERD_CHOICE *> *cb) const {
00108   WERD_CHOICE word(&unicharset);
00109   iterate_words_rec(word, 0, cb);
00110 }
00111 
00112 void CallWithUTF8(TessCallback1<const char *> *cb, const WERD_CHOICE *wc) {
00113   STRING s;
00114   wc->string_and_lengths(&s, NULL);
00115   cb->Run(s.string());
00116 }
00117 
00118 void Dawg::iterate_words(const UNICHARSET &unicharset,
00119                          TessCallback1<const char *> *cb) const {
00120   TessCallback1<const WERD_CHOICE *> *shim =
00121       NewPermanentTessCallback(CallWithUTF8, cb);
00122   WERD_CHOICE word(&unicharset);
00123   iterate_words_rec(word, 0, shim);
00124   delete shim;
00125 }
00126 
00127 void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
00128                              NODE_REF to_explore,
00129                              TessCallback1<const WERD_CHOICE *> *cb) const {
00130   NodeChildVector children;
00131   this->unichar_ids_of(to_explore, &children, false);
00132   for (int i = 0; i < children.size(); i++) {
00133     WERD_CHOICE next_word(word_so_far);
00134     next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
00135     if (this->end_of_word(children[i].edge_ref)) {
00136       cb->Run(&next_word);
00137     }
00138     NODE_REF next = next_node(children[i].edge_ref);
00139     if (next != 0) {
00140       iterate_words_rec(next_word, next, cb);
00141     }
00142   }
00143 }
00144 
00145 bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
00146                        NODE_REF node, UNICHAR_ID wildcard) const {
00147   EDGE_REF edge;
00148   inT32 word_end;
00149 
00150   if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
00151     bool any_matched = false;
00152     NodeChildVector vec;
00153     this->unichar_ids_of(node, &vec, false);
00154     for (int i = 0; i < vec.size(); ++i) {
00155       word->set_unichar_id(vec[i].unichar_id, index);
00156       if (match_words(word, index, node, wildcard))
00157         any_matched = true;
00158     }
00159     word->set_unichar_id(wildcard, index);
00160     return any_matched;
00161   } else {
00162     word_end = index == word->length() - 1;
00163     edge = edge_char_of(node, word->unichar_id(index), word_end);
00164     if (edge != NO_EDGE) {  // normal edge in DAWG
00165       node = next_node(edge);
00166       if (word_end) {
00167         if (debug_level_ > 1) word->print("match_words() found: ");
00168         return true;
00169       } else if (node != 0) {
00170         return match_words(word, index+1, node, wildcard);
00171       }
00172     }
00173   }
00174   return false;
00175 }
00176 
00177 void Dawg::init(DawgType type, const STRING &lang,
00178                 PermuterType perm, int unicharset_size, int debug_level) {
00179   type_ = type;
00180   lang_ = lang;
00181   perm_ = perm;
00182   ASSERT_HOST(unicharset_size > 0);
00183   unicharset_size_ = unicharset_size;
00184   // Set bit masks. We will use the value unicharset_size_ as a null char, so
00185   // the actual number of unichars is unicharset_size_ + 1.
00186   flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
00187   next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS;
00188   letter_mask_ = ~(~0ull << flag_start_bit_);
00189   next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS);
00190   flags_mask_ = ~(letter_mask_ | next_node_mask_);
00191 
00192   debug_level_ = debug_level;
00193 }
00194 
00195 
00196 /*----------------------------------------------------------------------
00197          F u n c t i o n s   f o r   S q u i s h e d    D a w g
00198 ----------------------------------------------------------------------*/
00199 
00200 SquishedDawg::~SquishedDawg() { memfree(edges_); }
00201 
00202 EDGE_REF SquishedDawg::edge_char_of(NODE_REF node,
00203                                     UNICHAR_ID unichar_id,
00204                                     bool word_end) const {
00205   EDGE_REF edge = node;
00206   if (node == 0) {  // binary search
00207     EDGE_REF start = 0;
00208     EDGE_REF end = num_forward_edges_in_node0 - 1;
00209     int compare;
00210     while (start <= end) {
00211       edge = (start + end) >> 1;  // (start + end) / 2
00212       compare = given_greater_than_edge_rec(NO_EDGE, word_end,
00213                                             unichar_id, edges_[edge]);
00214       if (compare == 0) {  // given == vec[k]
00215         return edge;
00216       } else if (compare == 1) {  // given > vec[k]
00217         start = edge + 1;
00218       } else {  // given < vec[k]
00219         end = edge - 1;
00220       }
00221     }
00222   } else {  // linear search
00223     if (edge != NO_EDGE && edge_occupied(edge)) {
00224       do {
00225         if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
00226             (!word_end || end_of_word_from_edge_rec(edges_[edge])))
00227           return (edge);
00228       } while (!last_edge(edge++));
00229     }
00230   }
00231   return (NO_EDGE);  // not found
00232 }
00233 
00234 inT32 SquishedDawg::num_forward_edges(NODE_REF node) const {
00235   EDGE_REF   edge = node;
00236   inT32        num  = 0;
00237 
00238   if (forward_edge (edge)) {
00239     do {
00240       num++;
00241     } while (!last_edge(edge++));
00242   }
00243 
00244   return (num);
00245 }
00246 
00247 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
00248   if (node == NO_EDGE) return;  // nothing to print
00249 
00250   EDGE_REF   edge = node;
00251   const char       *forward_string  = "FORWARD";
00252   const char       *backward_string = "       ";
00253 
00254   const char       *last_string     = "LAST";
00255   const char       *not_last_string = "    ";
00256 
00257   const char       *eow_string      = "EOW";
00258   const char       *not_eow_string  = "   ";
00259 
00260   const char       *direction;
00261   const char       *is_last;
00262   const char       *eow;
00263 
00264   UNICHAR_ID unichar_id;
00265 
00266   if (edge_occupied(edge)) {
00267     do {
00268       direction =
00269         forward_edge(edge) ? forward_string : backward_string;
00270       is_last = last_edge(edge) ? last_string : not_last_string;
00271       eow = end_of_word(edge) ? eow_string : not_eow_string;
00272 
00273       unichar_id = edge_letter(edge);
00274       tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
00275               edge, next_node(edge), unichar_id,
00276               direction, is_last, eow);
00277 
00278       if (edge - node > max_num_edges) return;
00279     } while (!last_edge(edge++));
00280 
00281     if (edge < num_edges_ &&
00282         edge_occupied(edge) && backward_edge(edge)) {
00283       do {
00284         direction =
00285           forward_edge(edge) ? forward_string : backward_string;
00286         is_last = last_edge(edge) ? last_string : not_last_string;
00287         eow = end_of_word(edge) ? eow_string : not_eow_string;
00288 
00289         unichar_id = edge_letter(edge);
00290         tprintf(REFFORMAT " : next = " REFFORMAT
00291                 ", unichar_id = %d, %s %s %s\n",
00292                 edge, next_node(edge), unichar_id,
00293                 direction, is_last, eow);
00294 
00295         if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
00296       } while (!last_edge(edge++));
00297     }
00298   }
00299   else {
00300     tprintf(REFFORMAT " : no edges in this node\n", node);
00301   }
00302   tprintf("\n");
00303 }
00304 
00305 void SquishedDawg::print_edge(EDGE_REF edge) const {
00306   if (edge == NO_EDGE) {
00307     tprintf("NO_EDGE\n");
00308   } else {
00309     tprintf(REFFORMAT " : next = " REFFORMAT
00310             ", unichar_id = '%d', %s %s %s\n", edge,
00311             next_node(edge), edge_letter(edge),
00312             (forward_edge(edge) ? "FORWARD" : "       "),
00313             (last_edge(edge) ? "LAST"    : "    "),
00314             (end_of_word(edge) ? "EOW"     : ""));
00315   }
00316 }
00317 
00318 void SquishedDawg::read_squished_dawg(FILE *file,
00319                                       DawgType type,
00320                                       const STRING &lang,
00321                                       PermuterType perm,
00322                                       int debug_level) {
00323   if (debug_level) tprintf("Reading squished dawg\n");
00324 
00325   // Read the magic number and if it does not match kDawgMagicNumber
00326   // set swap to true to indicate that we need to switch endianness.
00327   inT16 magic;
00328   fread(&magic, sizeof(inT16), 1, file);
00329   bool swap = (magic != kDawgMagicNumber);
00330 
00331   int unicharset_size;
00332   fread(&unicharset_size, sizeof(inT32), 1, file);
00333   fread(&num_edges_, sizeof(inT32), 1, file);
00334 
00335   if (swap) {
00336     ReverseN(&unicharset_size, sizeof(unicharset_size));
00337     ReverseN(&num_edges_, sizeof(num_edges_));
00338   }
00339   ASSERT_HOST(num_edges_ > 0);  // DAWG should not be empty
00340   Dawg::init(type, lang, perm, unicharset_size, debug_level);
00341 
00342   edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
00343   fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file);
00344   EDGE_REF edge;
00345   if (swap) {
00346     for (edge = 0; edge < num_edges_; ++edge) {
00347       ReverseN(&edges_[edge], sizeof(edges_[edge]));
00348     }
00349   }
00350   if (debug_level > 2) {
00351     tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
00352             type_, lang_.string(), perm_, unicharset_size_, num_edges_);
00353     for (edge = 0; edge < num_edges_; ++edge)
00354       print_edge(edge);
00355   }
00356 }
00357 
00358 NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
00359   EDGE_REF   edge;
00360   NODE_MAP   node_map;
00361   inT32       node_counter;
00362   inT32       num_edges;
00363 
00364   node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_);
00365 
00366   for (edge = 0; edge < num_edges_; edge++)       // init all slots
00367     node_map [edge] = -1;
00368 
00369   node_counter = num_forward_edges(0);
00370 
00371   *num_nodes   = 0;
00372   for (edge = 0; edge < num_edges_; edge++) {     // search all slots
00373 
00374     if (forward_edge(edge)) {
00375       (*num_nodes)++;                          // count nodes links
00376       node_map[edge] = (edge ? node_counter : 0);
00377       num_edges = num_forward_edges(edge);
00378       if (edge != 0) node_counter += num_edges;
00379       edge += num_edges;
00380       if (edge >= num_edges_) break;
00381       if (backward_edge(edge)) while (!last_edge(edge++));
00382       edge--;
00383     }
00384   }
00385   return (node_map);
00386 }
00387 
00388 void SquishedDawg::write_squished_dawg(FILE *file) {
00389   EDGE_REF    edge;
00390   inT32       num_edges;
00391   inT32       node_count = 0;
00392   NODE_MAP    node_map;
00393   EDGE_REF    old_index;
00394   EDGE_RECORD temp_record;
00395 
00396   if (debug_level_) tprintf("write_squished_dawg\n");
00397 
00398   node_map = build_node_map(&node_count);
00399 
00400   // Write the magic number to help detecting a change in endianness.
00401   inT16 magic = kDawgMagicNumber;
00402   fwrite(&magic, sizeof(inT16), 1, file);
00403   fwrite(&unicharset_size_, sizeof(inT32), 1, file);
00404 
00405   // Count the number of edges in this Dawg.
00406   num_edges = 0;
00407   for (edge=0; edge < num_edges_; edge++)
00408     if (forward_edge(edge))
00409       num_edges++;
00410 
00411   fwrite(&num_edges, sizeof(inT32), 1, file);  // write edge count to file
00412 
00413   if (debug_level_) {
00414     tprintf("%d nodes in DAWG\n", node_count);
00415     tprintf("%d edges in DAWG\n", num_edges);
00416   }
00417 
00418   for (edge = 0; edge < num_edges_; edge++) {
00419     if (forward_edge(edge)) {  // write forward edges
00420       do {
00421         old_index = next_node_from_edge_rec(edges_[edge]);
00422         set_next_node(edge, node_map[old_index]);
00423         temp_record = edges_[edge];
00424         fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file);
00425         set_next_node(edge, old_index);
00426       } while (!last_edge(edge++));
00427 
00428       if (edge >= num_edges_) break;
00429       if (backward_edge(edge))  // skip back links
00430         while (!last_edge(edge++));
00431 
00432       edge--;
00433     }
00434   }
00435   free(node_map);
00436 }
00437 
00438 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines