tesseract 3.04.01

ccstruct/fontinfo.h

Go to the documentation of this file.
00001 
00002 // File:        fontinfo.h
00003 // Description: Font information classes abstracted from intproto.h/cpp.
00004 // Author:      rays@google.com (Ray Smith)
00005 // Created:     Tue May 17 17:08:01 PDT 2011
00006 //
00007 // (C) Copyright 2011, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 
00021 #ifndef TESSERACT_CCSTRUCT_FONTINFO_H_
00022 #define TESSERACT_CCSTRUCT_FONTINFO_H_
00023 
00024 #include "genericvector.h"
00025 #include "host.h"
00026 #include "unichar.h"
00027 
00028 template <typename T> class UnicityTable;
00029 
00030 namespace tesseract {
00031 
00032 class BitVector;
00033 
00034 // Simple struct pairing a font id with its score. The scores come from the
00035 // low-level integer matcher, so they fit in the uinT16 range. The font id is
00036 // an index into fontinfo_table.
00037 // These get copied around a lot, so best to keep them small.
00038 struct ScoredFont {
00039   ScoredFont() : fontinfo_id(-1), score(0) {}  // Default: id -1, score 0.
00040   ScoredFont(int font_id, uinT16 classifier_score)  // Stores both as given.
00041       : fontinfo_id(font_id), score(classifier_score) {}
00042
00043   // Index into fontinfo table, but inside the classifier, may be a shapetable
00044   // index. The default constructor sets it to -1.
00045   inT32 fontinfo_id;
00046   // Raw score from the low-level classifier.
00047   uinT16 score;
00048 };
00049 
00050 // Struct for information about spacing between characters in a particular font.
00051 struct FontSpacingInfo {  // No constructor declared; creators fill in the gaps.
00052   inT16 x_gap_before;  // Summed with the previous char's x_gap_after.
00053   inT16 x_gap_after;   // Summed with the next char's x_gap_before.
00054   GenericVector<UNICHAR_ID> kerned_unichar_ids;  // Next chars with their own gap.
00055   GenericVector<inT16> kerned_x_gaps;  // Gap i belongs to kerned_unichar_ids[i].
00056 };
00057 
00058 /*
00059  * FontInfo holds a font's name, spacing data and a properties bit field:
00060  * bit 0 italic, bit 1 bold, bit 2 fixed pitch, bit 3 serif, bit 4 fraktur.
00061  */
00062 struct FontInfo {
00063   FontInfo() : name(NULL), properties(0), universal_id(0), spacing_vec(NULL) {}
00064   ~FontInfo() {}  // Empty: does not free name or spacing_vec.
00065
00066   // Writes to the given file. Returns false in case of error.
00067   bool Serialize(FILE* fp) const;
00068   // Reads from the given file. Returns false in case of error.
00069   // If swap is true, assumes a big/little-endian swap is needed.
00070   bool DeSerialize(bool swap, FILE* fp);
00071
00072   // Reserves unicharset_size spots in spacing_vec.
00073   void init_spacing(int unicharset_size) {
00074     spacing_vec = new GenericVector<FontSpacingInfo *>();  // Old value not freed.
00075     spacing_vec->init_to_size(unicharset_size, NULL);
00076   }
00077   // Adds the given pointer to FontSpacingInfo to spacing_vec member
00078   // (FontInfo class takes ownership of the pointer).
00079   // Note: call init_spacing first; uch_id must be a valid, non-negative index.
00080   void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) {
00081     ASSERT_HOST(spacing_vec != NULL && uch_id >= 0 && spacing_vec->size() > uch_id);
00082     (*spacing_vec)[uch_id] = spacing_info;
00083   }
00084
00085   // Returns the FontSpacingInfo for uch_id; NULL if unset or id out of range.
00086   const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const {
00087     return (spacing_vec == NULL || uch_id < 0 ||  // Negative ids are rejected.
00088             spacing_vec->size() <= uch_id) ? NULL : (*spacing_vec)[uch_id];
00089   }
00090
00091   // Fills spacing with the value of the x gap expected between the two given
00092   // UNICHAR_IDs. Returns false, leaving spacing untouched, if either has no info.
00093   bool get_spacing(UNICHAR_ID prev_uch_id,
00094                    UNICHAR_ID uch_id,
00095                    int *spacing) const {
00096     const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id);
00097     const FontSpacingInfo *fsi = this->get_spacing(uch_id);
00098     if (prev_fsi == NULL || fsi == NULL) return false;
00099     int i = 0;  // Index of uch_id in prev_fsi's kerned list, or its size.
00100     for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) {
00101       if (prev_fsi->kerned_unichar_ids[i] == uch_id) break;
00102     }
00103     if (i < prev_fsi->kerned_unichar_ids.size()) {
00104       *spacing = prev_fsi->kerned_x_gaps[i];  // Pair-specific kerned gap.
00105     } else {
00106       *spacing = prev_fsi->x_gap_after + fsi->x_gap_before;  // Generic gap.
00107     }
00108     return true;
00109   }
00110
00111   bool is_italic() const { return (properties & 1) != 0; }
00112   bool is_bold() const { return (properties & 2) != 0; }
00113   bool is_fixed_pitch() const { return (properties & 4) != 0; }
00114   bool is_serif() const { return (properties & 8) != 0; }
00115   bool is_fraktur() const { return (properties & 16) != 0; }
00116
00117   char* name;  // NULL until assigned; not freed by ~FontInfo.
00118   uinT32 properties;  // Bit field read by the is_*() accessors above.
00119   // The universal_id is a field reserved for the initialization process
00120   // to assign a unique id number to all fonts loaded for the current
00121   // combination of languages. This id will then be returned by
00122   // ResultIterator::WordFontAttributes.
00123   inT32 universal_id;
00124   // Horizontal spacing between characters (indexed by UNICHAR_ID).
00125   GenericVector<FontSpacingInfo *> *spacing_vec;
00126 };
00127 
00128 // Every class (character) owns a FontSet that represents all the fonts that can
00129 // render this character.
00130 // Since almost all the characters from the same script share the same set of
00131 // fonts, the sets are shared over multiple classes (see
00132 // Classify::fontset_table_). Thus, a class only stores an id to a set.
00133 // Because some fonts cannot render just one character of a set, there are a
00134 // lot of FontSets that differ only by one font. Rather than storing the
00135 // FontInfo directly in the FontSet structure, it's better to share FontInfos
00136 // among FontSets (Classify::fontinfo_table_).
00137 struct FontSet {
00138   int           size;     // Presumably the number of entries in configs; confirm.
00139   int*          configs;  // FontInfo ids
00140 };
00141 
00142 // Class that adds a bit of functionality on top of GenericVector to
00143 // implement a table of FontInfo that replaces UnicityTable<FontInfo>.
00144 // TODO(rays) change all references once all existing traineddata files
00145 // are replaced.
00146 class FontInfoTable : public GenericVector<FontInfo> {
00147  public:
00148   FontInfoTable();
00149   ~FontInfoTable();
00150
00151   // Writes to the given file. Returns false in case of error.
00152   bool Serialize(FILE* fp) const;
00153   // Reads from the given file. Returns false in case of error.
00154   // If swap is true, assumes a big/little-endian swap is needed.
00155   bool DeSerialize(bool swap, FILE* fp);
00156
00157   // Returns true if the given set of fonts includes one with the same
00158   // properties as font_id.
00159   bool SetContainsFontProperties(
00160       int font_id, const GenericVector<ScoredFont>& font_set) const;
00161   // Returns true if the given set of fonts includes multiple properties.
00162   bool SetContainsMultipleFontProperties(
00163       const GenericVector<ScoredFont>& font_set) const;
00164
00165   // Moves any non-empty FontSpacingInfo entries from other to this.
00166   void MoveSpacingInfoFrom(FontInfoTable* other);
00167   // Moves this to the target unicity table.
00168   void MoveTo(UnicityTable<FontInfo>* target);
00169 };
00170 
00171 // Compares two FontInfo structures.
00172 bool CompareFontInfo(const FontInfo& fi1, const FontInfo& fi2);
00173 // Compares two FontSet structures.
00174 bool CompareFontSet(const FontSet& fs1, const FontSet& fs2);
00175 // Deletion callbacks for GenericVector; each takes its argument by value.
00176 void FontInfoDeleteCallback(FontInfo f);
00177 void FontSetDeleteCallback(FontSet fs);
00178 
00179 // Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
00180 bool read_info(FILE* f, FontInfo* fi, bool swap);  // swap: presumably endian swap.
00181 bool write_info(FILE* f, const FontInfo& fi);
00182 bool read_spacing_info(FILE *f, FontInfo* fi, bool swap);
00183 bool write_spacing_info(FILE* f, const FontInfo& fi);
00184 bool read_set(FILE* f, FontSet* fs, bool swap);
00185 bool write_set(FILE* f, const FontSet& fs);
00186 
00187 }  // namespace tesseract.
00188 
00189 #endif /* TESSERACT_CCSTRUCT_FONTINFO_H_ */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines