tesseract 3.04.01

classify/shapetable.h

Go to the documentation of this file.
00001 // Copyright 2010 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00004 // File:        shapetable.h
00005 // Description: Class to map a classifier shape index to unicharset
00006 //              indices and font indices.
00007 // Author:      Ray Smith
00008 // Created:     Thu Oct 28 17:46:32 PDT 2010
00009 //
00010 // (C) Copyright 2010, Google Inc.
00011 // Licensed under the Apache License, Version 2.0 (the "License");
00012 // you may not use this file except in compliance with the License.
00013 // You may obtain a copy of the License at
00014 // http://www.apache.org/licenses/LICENSE-2.0
00015 // Unless required by applicable law or agreed to in writing, software
00016 // distributed under the License is distributed on an "AS IS" BASIS,
00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00018 // See the License for the specific language governing permissions and
00019 // limitations under the License.
00020 //
00022 
00023 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
00024 #define TESSERACT_CLASSIFY_SHAPETABLE_H_
00025 
00026 #include "bitvector.h"
00027 #include "fontinfo.h"
00028 #include "genericheap.h"
00029 #include "genericvector.h"
00030 #include "intmatcher.h"
00031 
00032 class STRING;
00033 class UNICHARSET;
00034 
00035 namespace tesseract {
00036 
00037 class ShapeTable;
00038 
00039 // Simple struct to hold a single classifier unichar selection, a corresponding
00040 // rating, and a list of appropriate fonts.
00041 struct UnicharRating {
00042   UnicharRating()
00043     : unichar_id(0), rating(0.0f), adapted(false), config(0),
00044       feature_misses(0) {}
00045   UnicharRating(int u, float r)
00046     : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {}
00047 
00048   // Print debug info.
00049   void Print() const {
00050     tprintf("Unichar-id=%d, rating=%g, adapted=%d, config=%d, misses=%d,"
00051             " %d fonts\n", unichar_id, rating, adapted, config, feature_misses,
00052             fonts.size());
00053   }
00054 
00055   // Sort function to sort ratings appropriately by descending rating.
00056   static int SortDescendingRating(const void* t1, const void* t2) {
00057     const UnicharRating* a = reinterpret_cast<const UnicharRating *>(t1);
00058     const UnicharRating* b = reinterpret_cast<const UnicharRating *>(t2);
00059     if (a->rating > b->rating) {
00060       return -1;
00061     } else if (a->rating < b->rating) {
00062       return 1;
00063     } else {
00064       return a->unichar_id - b->unichar_id;
00065     }
00066   }
00067   // Helper function to get the index of the first result with the required
00068   // unichar_id. If the results are sorted by rating, this will also be the
00069   // best result with the required unichar_id.
00070   // Returns -1 if the unichar_id is not found
00071   static int FirstResultWithUnichar(const GenericVector<UnicharRating>& results,
00072                                     UNICHAR_ID unichar_id);
00073 
00074   // Index into some UNICHARSET table indicates the class of the answer.
00075   UNICHAR_ID unichar_id;
00076   // Rating from classifier with 1.0 perfect and 0.0 impossible.
00077   // Call it a probability if you must.
00078   float rating;
00079   // True if this result is from the adaptive classifier.
00080   bool adapted;
00081   // Index of best matching font configuration of result.
00082   uinT8 config;
00083   // Number of features that were total misses - were liked by no classes.
00084   uinT16 feature_misses;
00085   // Unsorted collection of fontinfo ids and scores. Note that a raw result
00086   // from the IntegerMatch will contain config ids, that require transforming
00087   // to fontinfo ids via fontsets and (possibly) shapetable.
00088   GenericVector<ScoredFont> fonts;
00089 };
00090 
00091 // Classifier result from a low-level classification is an index into some
00092 // ShapeTable and a rating.
00093 struct ShapeRating {
00094   ShapeRating()
00095     : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f),
00096       joined(false), broken(false) {}
00097   ShapeRating(int s, float r)
00098     : shape_id(s), rating(r), raw(1.0f), font(0.0f),
00099       joined(false), broken(false) {}
00100 
00101   // Sort function to sort ratings appropriately by descending rating.
00102   static int SortDescendingRating(const void* t1, const void* t2) {
00103     const ShapeRating* a = reinterpret_cast<const ShapeRating *>(t1);
00104     const ShapeRating* b = reinterpret_cast<const ShapeRating *>(t2);
00105     if (a->rating > b->rating) {
00106       return -1;
00107     } else if (a->rating < b->rating) {
00108       return 1;
00109     } else {
00110       return a->shape_id - b->shape_id;
00111     }
00112   }
00113   // Helper function to get the index of the first result with the required
00114   // unichar_id. If the results are sorted by rating, this will also be the
00115   // best result with the required unichar_id.
00116   // Returns -1 if the unichar_id is not found
00117   static int FirstResultWithUnichar(const GenericVector<ShapeRating>& results,
00118                                     const ShapeTable& shape_table,
00119                                     UNICHAR_ID unichar_id);
00120 
00121   // Index into some shape table indicates the class of the answer.
00122   int shape_id;
00123   // Rating from classifier with 1.0 perfect and 0.0 impossible.
00124   // Call it a probability if you must.
00125   float rating;
00126   // Subsidiary rating that a classifier may use internally.
00127   float raw;
00128   // Subsidiary rating that a classifier may use internally.
00129   float font;
00130   // Flag indicating that the input may be joined.
00131   bool joined;
00132   // Flag indicating that the input may be broken (a fragment).
00133   bool broken;
00134 };
00135 
00136 // Simple struct to hold an entry for a heap-based priority queue of
00137 // ShapeRating.
00138 struct ShapeQueueEntry {
00139   ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {}
00140   ShapeQueueEntry(const ShapeRating& rating, int level0)
00141     : result(rating), level(level0) {}
00142 
00143   // Sort by decreasing rating and decreasing level for equal rating.
00144   bool operator<(const ShapeQueueEntry& other) const {
00145     if (result.rating > other.result.rating) return true;
00146     if (result.rating == other.result.rating)
00147       return level > other.level;
00148     return false;
00149   }
00150 
00151   // Output from classifier.
00152   ShapeRating result;
00153   // Which level in the tree did this come from?
00154   int level;
00155 };
00156 typedef GenericHeap<ShapeQueueEntry> ShapeQueue;
00157 
00158 // Simple struct to hold a set of fonts associated with a single unichar-id.
00159 // A vector of UnicharAndFonts makes a shape.
00160 struct UnicharAndFonts {
00161   UnicharAndFonts() : unichar_id(0) {
00162   }
00163   UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
00164     font_ids.push_back(font_id);
00165   }
00166 
00167   // Writes to the given file. Returns false in case of error.
00168   bool Serialize(FILE* fp) const;
00169   // Reads from the given file. Returns false in case of error.
00170   // If swap is true, assumes a big/little-endian swap is needed.
00171   bool DeSerialize(bool swap, FILE* fp);
00172 
00173   // Sort function to sort a pair of UnicharAndFonts by unichar_id.
00174   static int SortByUnicharId(const void* v1, const void* v2);
00175 
00176   GenericVector<inT32> font_ids;
00177   inT32 unichar_id;
00178 };
00179 
00180 // A Shape is a collection of unichar-ids and a list of fonts associated with
00181 // each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
00182 // a classifiable unit, and represents a group of characters or parts of
00183 // characters that have a similar or identical shape. Shapes/ShapeTables may
00184 // be organized hierarchically from identical shapes at the leaves to vaguely
00185 // similar shapes near the root.
00186 class Shape {
00187  public:
00188   Shape() : destination_index_(-1) {}
00189 
00190   // Writes to the given file. Returns false in case of error.
00191   bool Serialize(FILE* fp) const;
00192   // Reads from the given file. Returns false in case of error.
00193   // If swap is true, assumes a big/little-endian swap is needed.
00194   bool DeSerialize(bool swap, FILE* fp);
00195 
00196   int destination_index() const {
00197     return destination_index_;
00198   }
00199   void set_destination_index(int index) {
00200     destination_index_ = index;
00201   }
00202   int size() const {
00203     return unichars_.size();
00204   }
00205   // Returns a UnicharAndFonts entry for the given index, which must be
00206   // in the range [0, size()).
00207   const UnicharAndFonts& operator[](int index) const {
00208     return unichars_[index];
00209   }
00210   // Sets the unichar_id of the given index to the new unichar_id.
00211   void SetUnicharId(int index, int unichar_id) {
00212     unichars_[index].unichar_id = unichar_id;
00213   }
00214   // Adds a font_id for the given unichar_id. If the unichar_id is not
00215   // in the shape, it is added.
00216   void AddToShape(int unichar_id, int font_id);
00217   // Adds everything in other to this.
00218   void AddShape(const Shape& other);
00219   // Returns true if the shape contains the given unichar_id, font_id pair.
00220   bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
00221   // Returns true if the shape contains the given unichar_id, ignoring font.
00222   bool ContainsUnichar(int unichar_id) const;
00223   // Returns true if the shape contains the given font, ignoring unichar_id.
00224   bool ContainsFont(int font_id) const;
00225   // Returns true if the shape contains the given font properties, ignoring
00226   // unichar_id.
00227   bool ContainsFontProperties(const FontInfoTable& font_table,
00228                               uinT32 properties) const;
00229   // Returns true if the shape contains multiple different font properties,
00230   // ignoring unichar_id.
00231   bool ContainsMultipleFontProperties(const FontInfoTable& font_table) const;
00232   // Returns true if this shape is equal to other (ignoring order of unichars
00233   // and fonts).
00234   bool operator==(const Shape& other) const;
00235   // Returns true if this is a subset (including equal) of other.
00236   bool IsSubsetOf(const Shape& other) const;
00237   // Returns true if the lists of unichar ids are the same in this and other,
00238   // ignoring fonts.
00239   // NOT const, as it will sort the unichars on demand.
00240   bool IsEqualUnichars(Shape* other);
00241 
00242  private:
00243   // Sorts the unichars_ vector by unichar.
00244   void SortUnichars();
00245 
00246   // Flag indicates that the unichars are sorted, allowing faster set
00247   // operations with another shape.
00248   bool unichars_sorted_;
00249   // If this Shape is part of a ShapeTable the destiation_index_ is the index
00250   // of some other shape in the ShapeTable with which this shape is merged.
00251   int destination_index_;
00252   // Array of unichars, each with a set of fonts. Each unichar has at most
00253   // one entry in the vector.
00254   GenericVector<UnicharAndFonts> unichars_;
00255 };
00256 
00257 // ShapeTable is a class to encapsulate the triple indirection that is
00258 // used here.
00259 // ShapeTable is a vector of shapes.
00260 // Each shape is a vector of UnicharAndFonts representing the set of unichars
00261 // that the shape represents.
00262 // Each UnicharAndFonts also lists the fonts of the unichar_id that were
00263 // mapped to the shape during training.
00264 class ShapeTable {
00265  public:
00266   ShapeTable();
00267   // The UNICHARSET reference supplied here, or in set_unicharset below must
00268   // exist for the entire life of the ShapeTable. It is used only by DebugStr.
00269   explicit ShapeTable(const UNICHARSET& unicharset);
00270 
00271   // Writes to the given file. Returns false in case of error.
00272   bool Serialize(FILE* fp) const;
00273   // Reads from the given file. Returns false in case of error.
00274   // If swap is true, assumes a big/little-endian swap is needed.
00275   bool DeSerialize(bool swap, FILE* fp);
00276 
00277   // Accessors.
00278   int NumShapes() const {
00279     return shape_table_.size();
00280   }
00281   const UNICHARSET& unicharset() const {
00282     return *unicharset_;
00283   }
00284   // Returns the number of fonts used in this ShapeTable, computing it if
00285   // necessary.
00286   int NumFonts() const;
00287   // Shapetable takes a pointer to the UNICHARSET, so it must persist for the
00288   // entire life of the ShapeTable.
00289   void set_unicharset(const UNICHARSET& unicharset) {
00290     unicharset_ = &unicharset;
00291   }
00292   // Re-indexes the class_ids in the shapetable according to the given map.
00293   // Useful in conjunction with set_unicharset.
00294   void ReMapClassIds(const GenericVector<int>& unicharset_map);
00295   // Returns a string listing the classes/fonts in a shape.
00296   STRING DebugStr(int shape_id) const;
00297   // Returns a debug string summarizing the table.
00298   STRING SummaryStr() const;
00299 
00300   // Adds a new shape starting with the given unichar_id and font_id.
00301   // Returns the assigned index.
00302   int AddShape(int unichar_id, int font_id);
00303   // Adds a copy of the given shape unless it is already present.
00304   // Returns the assigned index or index of existing shape if already present.
00305   int AddShape(const Shape& other);
00306   // Removes the shape given by the shape index. All indices above are changed!
00307   void DeleteShape(int shape_id);
00308   // Adds a font_id to the given existing shape index for the given
00309   // unichar_id. If the unichar_id is not in the shape, it is added.
00310   void AddToShape(int shape_id, int unichar_id, int font_id);
00311   // Adds the given shape to the existing shape with the given index.
00312   void AddShapeToShape(int shape_id, const Shape& other);
00313   // Returns the id of the shape that contains the given unichar and font.
00314   // If not found, returns -1.
00315   // If font_id < 0, the font_id is ignored and the first shape that matches
00316   // the unichar_id is returned.
00317   int FindShape(int unichar_id, int font_id) const;
00318   // Returns the first unichar_id and font_id in the given shape.
00319   void GetFirstUnicharAndFont(int shape_id,
00320                               int* unichar_id, int* font_id) const;
00321 
00322   // Accessors for the Shape with the given shape_id.
00323   const Shape& GetShape(int shape_id) const {
00324     return *shape_table_[shape_id];
00325   }
00326   Shape* MutableShape(int shape_id) {
00327     return shape_table_[shape_id];
00328   }
00329 
00330   // Expands all the classes/fonts in the shape individually to build
00331   // a ShapeTable.
00332   int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes);
00333 
00334   // Returns true if the shapes are already merged.
00335   bool AlreadyMerged(int shape_id1, int shape_id2) const;
00336   // Returns true if any shape contains multiple unichars.
00337   bool AnyMultipleUnichars() const;
00338   // Returns the maximum number of unichars over all shapes.
00339   int MaxNumUnichars() const;
00340   // Merges shapes with a common unichar over the [start, end) interval.
00341   // Assumes single unichar per shape.
00342   void ForceFontMerges(int start, int end);
00343   // Returns the number of unichars in the master shape.
00344   int MasterUnicharCount(int shape_id) const;
00345   // Returns the sum of the font counts in the master shape.
00346   int MasterFontCount(int shape_id) const;
00347   // Returns the number of unichars that would result from merging the shapes.
00348   int MergedUnicharCount(int shape_id1, int shape_id2) const;
00349   // Merges two shape_ids, leaving shape_id2 marked as merged.
00350   void MergeShapes(int shape_id1, int shape_id2);
00351   // Swaps two shape_ids.
00352   void SwapShapes(int shape_id1, int shape_id2);
00353   // Appends the master shapes from other to this.
00354   // Used to create a clean ShapeTable from a merged one, or to create a
00355   // copy of a ShapeTable.
00356   // If not NULL, shape_map is set to map other shape_ids to this's shape_ids.
00357   void AppendMasterShapes(const ShapeTable& other,
00358                           GenericVector<int>* shape_map);
00359   // Returns the number of master shapes remaining after merging.
00360   int NumMasterShapes() const;
00361   // Returns the destination of this shape, (if merged), taking into account
00362   // the fact that the destination may itself have been merged.
00363   // For a non-merged shape, returns the input shape_id.
00364   int MasterDestinationIndex(int shape_id) const;
00365 
00366   // Returns false if the unichars in neither shape is a subset of the other..
00367   bool SubsetUnichar(int shape_id1, int shape_id2) const;
00368   // Returns false if the unichars in neither shape is a subset of the other..
00369   bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const;
00370   // Returns true if the unichar sets are equal between the shapes.
00371   bool EqualUnichars(int shape_id1, int shape_id2) const;
00372   bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const;
00373   // Returns true if there is a common unichar between the shapes.
00374   bool CommonUnichars(int shape_id1, int shape_id2) const;
00375   // Returns true if there is a common font id between the shapes.
00376   bool CommonFont(int shape_id1, int shape_id2) const;
00377 
00378   // Adds the unichars of the given shape_id to the vector of results. Any
00379   // unichar_id that is already present just has the fonts added to the
00380   // font set for that result without adding a new entry in the vector.
00381   // NOTE: it is assumed that the results are given to this function in order
00382   // of decreasing rating.
00383   // The unichar_map vector indicates the index of the results entry containing
00384   // each unichar, or -1 if the unichar is not yet included in results.
00385   void AddShapeToResults(const ShapeRating& shape_rating,
00386                          GenericVector<int>* unichar_map,
00387                          GenericVector<UnicharRating>* results) const;
00388 
00389  private:
00390   // Adds the given unichar_id to the results if needed, updating unichar_map
00391   // and returning the index of unichar in results.
00392   int AddUnicharToResults(int unichar_id, float rating,
00393                           GenericVector<int>* unichar_map,
00394                           GenericVector<UnicharRating>* results) const;
00395 
00396   // Pointer to a provided unicharset used only by the Debugstr member.
00397   const UNICHARSET* unicharset_;
00398   // Vector of pointers to the Shapes in this ShapeTable.
00399   PointerVector<Shape> shape_table_;
00400 
00401   // Cached data calculated on demand.
00402   mutable int num_fonts_;
00403 };
00404 
00405 }  // namespace tesseract.
00406 
00407 #endif  // TESSERACT_CLASSIFY_SHAPETABLE_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines