|
tesseract 3.04.01
|
00001 // Copyright 2010 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00004 // File: shapetable.h 00005 // Description: Class to map a classifier shape index to unicharset 00006 // indices and font indices. 00007 // Author: Ray Smith 00008 // Created: Thu Oct 28 17:46:32 PDT 2010 00009 // 00010 // (C) Copyright 2010, Google Inc. 00011 // Licensed under the Apache License, Version 2.0 (the "License"); 00012 // you may not use this file except in compliance with the License. 00013 // You may obtain a copy of the License at 00014 // http://www.apache.org/licenses/LICENSE-2.0 00015 // Unless required by applicable law or agreed to in writing, software 00016 // distributed under the License is distributed on an "AS IS" BASIS, 00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00018 // See the License for the specific language governing permissions and 00019 // limitations under the License. 00020 // 00022 00023 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_ 00024 #define TESSERACT_CLASSIFY_SHAPETABLE_H_ 00025 00026 #include "bitvector.h" 00027 #include "fontinfo.h" 00028 #include "genericheap.h" 00029 #include "genericvector.h" 00030 #include "intmatcher.h" 00031 00032 class STRING; 00033 class UNICHARSET; 00034 00035 namespace tesseract { 00036 00037 class ShapeTable; 00038 00039 // Simple struct to hold a single classifier unichar selection, a corresponding 00040 // rating, and a list of appropriate fonts. 00041 struct UnicharRating { 00042 UnicharRating() 00043 : unichar_id(0), rating(0.0f), adapted(false), config(0), 00044 feature_misses(0) {} 00045 UnicharRating(int u, float r) 00046 : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {} 00047 00048 // Print debug info. 00049 void Print() const { 00050 tprintf("Unichar-id=%d, rating=%g, adapted=%d, config=%d, misses=%d," 00051 " %d fonts\n", unichar_id, rating, adapted, config, feature_misses, 00052 fonts.size()); 00053 } 00054 00055 // Sort function to sort ratings appropriately by descending rating. 00056 static int SortDescendingRating(const void* t1, const void* t2) { 00057 const UnicharRating* a = reinterpret_cast<const UnicharRating *>(t1); 00058 const UnicharRating* b = reinterpret_cast<const UnicharRating *>(t2); 00059 if (a->rating > b->rating) { 00060 return -1; 00061 } else if (a->rating < b->rating) { 00062 return 1; 00063 } else { 00064 return a->unichar_id - b->unichar_id; 00065 } 00066 } 00067 // Helper function to get the index of the first result with the required 00068 // unichar_id. If the results are sorted by rating, this will also be the 00069 // best result with the required unichar_id. 00070 // Returns -1 if the unichar_id is not found 00071 static int FirstResultWithUnichar(const GenericVector<UnicharRating>& results, 00072 UNICHAR_ID unichar_id); 00073 00074 // Index into some UNICHARSET table indicates the class of the answer. 00075 UNICHAR_ID unichar_id; 00076 // Rating from classifier with 1.0 perfect and 0.0 impossible. 00077 // Call it a probability if you must. 00078 float rating; 00079 // True if this result is from the adaptive classifier. 00080 bool adapted; 00081 // Index of best matching font configuration of result. 00082 uinT8 config; 00083 // Number of features that were total misses - were liked by no classes. 00084 uinT16 feature_misses; 00085 // Unsorted collection of fontinfo ids and scores. Note that a raw result 00086 // from the IntegerMatch will contain config ids, that require transforming 00087 // to fontinfo ids via fontsets and (possibly) shapetable. 00088 GenericVector<ScoredFont> fonts; 00089 }; 00090 00091 // Classifier result from a low-level classification is an index into some 00092 // ShapeTable and a rating. 00093 struct ShapeRating { 00094 ShapeRating() 00095 : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f), 00096 joined(false), broken(false) {} 00097 ShapeRating(int s, float r) 00098 : shape_id(s), rating(r), raw(1.0f), font(0.0f), 00099 joined(false), broken(false) {} 00100 00101 // Sort function to sort ratings appropriately by descending rating. 00102 static int SortDescendingRating(const void* t1, const void* t2) { 00103 const ShapeRating* a = reinterpret_cast<const ShapeRating *>(t1); 00104 const ShapeRating* b = reinterpret_cast<const ShapeRating *>(t2); 00105 if (a->rating > b->rating) { 00106 return -1; 00107 } else if (a->rating < b->rating) { 00108 return 1; 00109 } else { 00110 return a->shape_id - b->shape_id; 00111 } 00112 } 00113 // Helper function to get the index of the first result with the required 00114 // unichar_id. If the results are sorted by rating, this will also be the 00115 // best result with the required unichar_id. 00116 // Returns -1 if the unichar_id is not found 00117 static int FirstResultWithUnichar(const GenericVector<ShapeRating>& results, 00118 const ShapeTable& shape_table, 00119 UNICHAR_ID unichar_id); 00120 00121 // Index into some shape table indicates the class of the answer. 00122 int shape_id; 00123 // Rating from classifier with 1.0 perfect and 0.0 impossible. 00124 // Call it a probability if you must. 00125 float rating; 00126 // Subsidiary rating that a classifier may use internally. 00127 float raw; 00128 // Subsidiary rating that a classifier may use internally. 00129 float font; 00130 // Flag indicating that the input may be joined. 00131 bool joined; 00132 // Flag indicating that the input may be broken (a fragment). 00133 bool broken; 00134 }; 00135 00136 // Simple struct to hold an entry for a heap-based priority queue of 00137 // ShapeRating. 00138 struct ShapeQueueEntry { 00139 ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {} 00140 ShapeQueueEntry(const ShapeRating& rating, int level0) 00141 : result(rating), level(level0) {} 00142 00143 // Sort by decreasing rating and decreasing level for equal rating. 00144 bool operator<(const ShapeQueueEntry& other) const { 00145 if (result.rating > other.result.rating) return true; 00146 if (result.rating == other.result.rating) 00147 return level > other.level; 00148 return false; 00149 } 00150 00151 // Output from classifier. 00152 ShapeRating result; 00153 // Which level in the tree did this come from? 00154 int level; 00155 }; 00156 typedef GenericHeap<ShapeQueueEntry> ShapeQueue; 00157 00158 // Simple struct to hold a set of fonts associated with a single unichar-id. 00159 // A vector of UnicharAndFonts makes a shape. 00160 struct UnicharAndFonts { 00161 UnicharAndFonts() : unichar_id(0) { 00162 } 00163 UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) { 00164 font_ids.push_back(font_id); 00165 } 00166 00167 // Writes to the given file. Returns false in case of error. 00168 bool Serialize(FILE* fp) const; 00169 // Reads from the given file. Returns false in case of error. 00170 // If swap is true, assumes a big/little-endian swap is needed. 00171 bool DeSerialize(bool swap, FILE* fp); 00172 00173 // Sort function to sort a pair of UnicharAndFonts by unichar_id. 00174 static int SortByUnicharId(const void* v1, const void* v2); 00175 00176 GenericVector<inT32> font_ids; 00177 inT32 unichar_id; 00178 }; 00179 00180 // A Shape is a collection of unichar-ids and a list of fonts associated with 00181 // each, organized as a vector of UnicharAndFonts. Conceptually a Shape is 00182 // a classifiable unit, and represents a group of characters or parts of 00183 // characters that have a similar or identical shape. Shapes/ShapeTables may 00184 // be organized hierarchically from identical shapes at the leaves to vaguely 00185 // similar shapes near the root. 00186 class Shape { 00187 public: 00188 Shape() : destination_index_(-1) {} 00189 00190 // Writes to the given file. Returns false in case of error. 00191 bool Serialize(FILE* fp) const; 00192 // Reads from the given file. Returns false in case of error. 00193 // If swap is true, assumes a big/little-endian swap is needed. 00194 bool DeSerialize(bool swap, FILE* fp); 00195 00196 int destination_index() const { 00197 return destination_index_; 00198 } 00199 void set_destination_index(int index) { 00200 destination_index_ = index; 00201 } 00202 int size() const { 00203 return unichars_.size(); 00204 } 00205 // Returns a UnicharAndFonts entry for the given index, which must be 00206 // in the range [0, size()). 00207 const UnicharAndFonts& operator[](int index) const { 00208 return unichars_[index]; 00209 } 00210 // Sets the unichar_id of the given index to the new unichar_id. 00211 void SetUnicharId(int index, int unichar_id) { 00212 unichars_[index].unichar_id = unichar_id; 00213 } 00214 // Adds a font_id for the given unichar_id. If the unichar_id is not 00215 // in the shape, it is added. 00216 void AddToShape(int unichar_id, int font_id); 00217 // Adds everything in other to this. 00218 void AddShape(const Shape& other); 00219 // Returns true if the shape contains the given unichar_id, font_id pair. 00220 bool ContainsUnicharAndFont(int unichar_id, int font_id) const; 00221 // Returns true if the shape contains the given unichar_id, ignoring font. 00222 bool ContainsUnichar(int unichar_id) const; 00223 // Returns true if the shape contains the given font, ignoring unichar_id. 00224 bool ContainsFont(int font_id) const; 00225 // Returns true if the shape contains the given font properties, ignoring 00226 // unichar_id. 00227 bool ContainsFontProperties(const FontInfoTable& font_table, 00228 uinT32 properties) const; 00229 // Returns true if the shape contains multiple different font properties, 00230 // ignoring unichar_id. 00231 bool ContainsMultipleFontProperties(const FontInfoTable& font_table) const; 00232 // Returns true if this shape is equal to other (ignoring order of unichars 00233 // and fonts). 00234 bool operator==(const Shape& other) const; 00235 // Returns true if this is a subset (including equal) of other. 00236 bool IsSubsetOf(const Shape& other) const; 00237 // Returns true if the lists of unichar ids are the same in this and other, 00238 // ignoring fonts. 00239 // NOT const, as it will sort the unichars on demand. 00240 bool IsEqualUnichars(Shape* other); 00241 00242 private: 00243 // Sorts the unichars_ vector by unichar. 00244 void SortUnichars(); 00245 00246 // Flag indicates that the unichars are sorted, allowing faster set 00247 // operations with another shape. 00248 bool unichars_sorted_; 00249 // If this Shape is part of a ShapeTable the destiation_index_ is the index 00250 // of some other shape in the ShapeTable with which this shape is merged. 00251 int destination_index_; 00252 // Array of unichars, each with a set of fonts. Each unichar has at most 00253 // one entry in the vector. 00254 GenericVector<UnicharAndFonts> unichars_; 00255 }; 00256 00257 // ShapeTable is a class to encapsulate the triple indirection that is 00258 // used here. 00259 // ShapeTable is a vector of shapes. 00260 // Each shape is a vector of UnicharAndFonts representing the set of unichars 00261 // that the shape represents. 00262 // Each UnicharAndFonts also lists the fonts of the unichar_id that were 00263 // mapped to the shape during training. 00264 class ShapeTable { 00265 public: 00266 ShapeTable(); 00267 // The UNICHARSET reference supplied here, or in set_unicharset below must 00268 // exist for the entire life of the ShapeTable. It is used only by DebugStr. 00269 explicit ShapeTable(const UNICHARSET& unicharset); 00270 00271 // Writes to the given file. Returns false in case of error. 00272 bool Serialize(FILE* fp) const; 00273 // Reads from the given file. Returns false in case of error. 00274 // If swap is true, assumes a big/little-endian swap is needed. 00275 bool DeSerialize(bool swap, FILE* fp); 00276 00277 // Accessors. 00278 int NumShapes() const { 00279 return shape_table_.size(); 00280 } 00281 const UNICHARSET& unicharset() const { 00282 return *unicharset_; 00283 } 00284 // Returns the number of fonts used in this ShapeTable, computing it if 00285 // necessary. 00286 int NumFonts() const; 00287 // Shapetable takes a pointer to the UNICHARSET, so it must persist for the 00288 // entire life of the ShapeTable. 00289 void set_unicharset(const UNICHARSET& unicharset) { 00290 unicharset_ = &unicharset; 00291 } 00292 // Re-indexes the class_ids in the shapetable according to the given map. 00293 // Useful in conjunction with set_unicharset. 00294 void ReMapClassIds(const GenericVector<int>& unicharset_map); 00295 // Returns a string listing the classes/fonts in a shape. 00296 STRING DebugStr(int shape_id) const; 00297 // Returns a debug string summarizing the table. 00298 STRING SummaryStr() const; 00299 00300 // Adds a new shape starting with the given unichar_id and font_id. 00301 // Returns the assigned index. 00302 int AddShape(int unichar_id, int font_id); 00303 // Adds a copy of the given shape unless it is already present. 00304 // Returns the assigned index or index of existing shape if already present. 00305 int AddShape(const Shape& other); 00306 // Removes the shape given by the shape index. All indices above are changed! 00307 void DeleteShape(int shape_id); 00308 // Adds a font_id to the given existing shape index for the given 00309 // unichar_id. If the unichar_id is not in the shape, it is added. 00310 void AddToShape(int shape_id, int unichar_id, int font_id); 00311 // Adds the given shape to the existing shape with the given index. 00312 void AddShapeToShape(int shape_id, const Shape& other); 00313 // Returns the id of the shape that contains the given unichar and font. 00314 // If not found, returns -1. 00315 // If font_id < 0, the font_id is ignored and the first shape that matches 00316 // the unichar_id is returned. 00317 int FindShape(int unichar_id, int font_id) const; 00318 // Returns the first unichar_id and font_id in the given shape. 00319 void GetFirstUnicharAndFont(int shape_id, 00320 int* unichar_id, int* font_id) const; 00321 00322 // Accessors for the Shape with the given shape_id. 00323 const Shape& GetShape(int shape_id) const { 00324 return *shape_table_[shape_id]; 00325 } 00326 Shape* MutableShape(int shape_id) { 00327 return shape_table_[shape_id]; 00328 } 00329 00330 // Expands all the classes/fonts in the shape individually to build 00331 // a ShapeTable. 00332 int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes); 00333 00334 // Returns true if the shapes are already merged. 00335 bool AlreadyMerged(int shape_id1, int shape_id2) const; 00336 // Returns true if any shape contains multiple unichars. 00337 bool AnyMultipleUnichars() const; 00338 // Returns the maximum number of unichars over all shapes. 00339 int MaxNumUnichars() const; 00340 // Merges shapes with a common unichar over the [start, end) interval. 00341 // Assumes single unichar per shape. 00342 void ForceFontMerges(int start, int end); 00343 // Returns the number of unichars in the master shape. 00344 int MasterUnicharCount(int shape_id) const; 00345 // Returns the sum of the font counts in the master shape. 00346 int MasterFontCount(int shape_id) const; 00347 // Returns the number of unichars that would result from merging the shapes. 00348 int MergedUnicharCount(int shape_id1, int shape_id2) const; 00349 // Merges two shape_ids, leaving shape_id2 marked as merged. 00350 void MergeShapes(int shape_id1, int shape_id2); 00351 // Swaps two shape_ids. 00352 void SwapShapes(int shape_id1, int shape_id2); 00353 // Appends the master shapes from other to this. 00354 // Used to create a clean ShapeTable from a merged one, or to create a 00355 // copy of a ShapeTable. 00356 // If not NULL, shape_map is set to map other shape_ids to this's shape_ids. 00357 void AppendMasterShapes(const ShapeTable& other, 00358 GenericVector<int>* shape_map); 00359 // Returns the number of master shapes remaining after merging. 00360 int NumMasterShapes() const; 00361 // Returns the destination of this shape, (if merged), taking into account 00362 // the fact that the destination may itself have been merged. 00363 // For a non-merged shape, returns the input shape_id. 00364 int MasterDestinationIndex(int shape_id) const; 00365 00366 // Returns false if the unichars in neither shape is a subset of the other.. 00367 bool SubsetUnichar(int shape_id1, int shape_id2) const; 00368 // Returns false if the unichars in neither shape is a subset of the other.. 00369 bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const; 00370 // Returns true if the unichar sets are equal between the shapes. 00371 bool EqualUnichars(int shape_id1, int shape_id2) const; 00372 bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const; 00373 // Returns true if there is a common unichar between the shapes. 00374 bool CommonUnichars(int shape_id1, int shape_id2) const; 00375 // Returns true if there is a common font id between the shapes. 00376 bool CommonFont(int shape_id1, int shape_id2) const; 00377 00378 // Adds the unichars of the given shape_id to the vector of results. Any 00379 // unichar_id that is already present just has the fonts added to the 00380 // font set for that result without adding a new entry in the vector. 00381 // NOTE: it is assumed that the results are given to this function in order 00382 // of decreasing rating. 00383 // The unichar_map vector indicates the index of the results entry containing 00384 // each unichar, or -1 if the unichar is not yet included in results. 00385 void AddShapeToResults(const ShapeRating& shape_rating, 00386 GenericVector<int>* unichar_map, 00387 GenericVector<UnicharRating>* results) const; 00388 00389 private: 00390 // Adds the given unichar_id to the results if needed, updating unichar_map 00391 // and returning the index of unichar in results. 00392 int AddUnicharToResults(int unichar_id, float rating, 00393 GenericVector<int>* unichar_map, 00394 GenericVector<UnicharRating>* results) const; 00395 00396 // Pointer to a provided unicharset used only by the Debugstr member. 00397 const UNICHARSET* unicharset_; 00398 // Vector of pointers to the Shapes in this ShapeTable. 00399 PointerVector<Shape> shape_table_; 00400 00401 // Cached data calculated on demand. 00402 mutable int num_fonts_; 00403 }; 00404 00405 } // namespace tesseract. 00406 00407 #endif // TESSERACT_CLASSIFY_SHAPETABLE_H_