// tesseract 3.04.01
00001 00002 // File: fontinfo.h 00003 // Description: Font information classes abstracted from intproto.h/cpp. 00004 // Author: rays@google.com (Ray Smith) 00005 // Created: Tue May 17 17:08:01 PDT 2011 00006 // 00007 // (C) Copyright 2011, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 00021 #ifndef TESSERACT_CCSTRUCT_FONTINFO_H_ 00022 #define TESSERACT_CCSTRUCT_FONTINFO_H_ 00023 00024 #include "genericvector.h" 00025 #include "host.h" 00026 #include "unichar.h" 00027 00028 template <typename T> class UnicityTable; 00029 00030 namespace tesseract { 00031 00032 class BitVector; 00033 00034 // Simple struct to hold a font and a score. The scores come from the low-level 00035 // integer matcher, so they are in the uinT16 range. Fonts are an index to 00036 // fontinfo_table. 00037 // These get copied around a lot, so best to keep them small. 00038 struct ScoredFont { 00039 ScoredFont() : fontinfo_id(-1), score(0) {} 00040 ScoredFont(int font_id, uinT16 classifier_score) 00041 : fontinfo_id(font_id), score(classifier_score) {} 00042 00043 // Index into fontinfo table, but inside the classifier, may be a shapetable 00044 // index. 00045 inT32 fontinfo_id; 00046 // Raw score from the low-level classifier. 00047 uinT16 score; 00048 }; 00049 00050 // Struct for information about spacing between characters in a particular font. 
00051 struct FontSpacingInfo { 00052 inT16 x_gap_before; 00053 inT16 x_gap_after; 00054 GenericVector<UNICHAR_ID> kerned_unichar_ids; 00055 GenericVector<inT16> kerned_x_gaps; 00056 }; 00057 00058 /* 00059 * font_properties contains properties about boldness, italicness, fixed pitch, 00060 * serif, fraktur 00061 */ 00062 struct FontInfo { 00063 FontInfo() : name(NULL), properties(0), universal_id(0), spacing_vec(NULL) {} 00064 ~FontInfo() {} 00065 00066 // Writes to the given file. Returns false in case of error. 00067 bool Serialize(FILE* fp) const; 00068 // Reads from the given file. Returns false in case of error. 00069 // If swap is true, assumes a big/little-endian swap is needed. 00070 bool DeSerialize(bool swap, FILE* fp); 00071 00072 // Reserves unicharset_size spots in spacing_vec. 00073 void init_spacing(int unicharset_size) { 00074 spacing_vec = new GenericVector<FontSpacingInfo *>(); 00075 spacing_vec->init_to_size(unicharset_size, NULL); 00076 } 00077 // Adds the given pointer to FontSpacingInfo to spacing_vec member 00078 // (FontInfo class takes ownership of the pointer). 00079 // Note: init_spacing should be called before calling this function. 00080 void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) { 00081 ASSERT_HOST(spacing_vec != NULL && spacing_vec->size() > uch_id); 00082 (*spacing_vec)[uch_id] = spacing_info; 00083 } 00084 00085 // Returns the pointer to FontSpacingInfo for the given UNICHAR_ID. 00086 const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const { 00087 return (spacing_vec == NULL || spacing_vec->size() <= uch_id) ? 00088 NULL : (*spacing_vec)[uch_id]; 00089 } 00090 00091 // Fills spacing with the value of the x gap expected between the two given 00092 // UNICHAR_IDs. Returns true on success. 
00093 bool get_spacing(UNICHAR_ID prev_uch_id, 00094 UNICHAR_ID uch_id, 00095 int *spacing) const { 00096 const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id); 00097 const FontSpacingInfo *fsi = this->get_spacing(uch_id); 00098 if (prev_fsi == NULL || fsi == NULL) return false; 00099 int i = 0; 00100 for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) { 00101 if (prev_fsi->kerned_unichar_ids[i] == uch_id) break; 00102 } 00103 if (i < prev_fsi->kerned_unichar_ids.size()) { 00104 *spacing = prev_fsi->kerned_x_gaps[i]; 00105 } else { 00106 *spacing = prev_fsi->x_gap_after + fsi->x_gap_before; 00107 } 00108 return true; 00109 } 00110 00111 bool is_italic() const { return properties & 1; } 00112 bool is_bold() const { return (properties & 2) != 0; } 00113 bool is_fixed_pitch() const { return (properties & 4) != 0; } 00114 bool is_serif() const { return (properties & 8) != 0; } 00115 bool is_fraktur() const { return (properties & 16) != 0; } 00116 00117 char* name; 00118 uinT32 properties; 00119 // The universal_id is a field reserved for the initialization process 00120 // to assign a unique id number to all fonts loaded for the current 00121 // combination of languages. This id will then be returned by 00122 // ResultIterator::WordFontAttributes. 00123 inT32 universal_id; 00124 // Horizontal spacing between characters (indexed by UNICHAR_ID). 00125 GenericVector<FontSpacingInfo *> *spacing_vec; 00126 }; 00127 00128 // Every class (character) owns a FontSet that represents all the fonts that can 00129 // render this character. 00130 // Since almost all the characters from the same script share the same set of 00131 // fonts, the sets are shared over multiple classes (see 00132 // Classify::fontset_table_). Thus, a class only store an id to a set. 00133 // Because some fonts cannot render just one character of a set, there are a 00134 // lot of FontSet that differ only by one font. 
Rather than storing directly 00135 // the FontInfo in the FontSet structure, it's better to share FontInfos among 00136 // FontSets (Classify::fontinfo_table_). 00137 struct FontSet { 00138 int size; 00139 int* configs; // FontInfo ids 00140 }; 00141 00142 // Class that adds a bit of functionality on top of GenericVector to 00143 // implement a table of FontInfo that replaces UniCityTable<FontInfo>. 00144 // TODO(rays) change all references once all existing traineddata files 00145 // are replaced. 00146 class FontInfoTable : public GenericVector<FontInfo> { 00147 public: 00148 FontInfoTable(); 00149 ~FontInfoTable(); 00150 00151 // Writes to the given file. Returns false in case of error. 00152 bool Serialize(FILE* fp) const; 00153 // Reads from the given file. Returns false in case of error. 00154 // If swap is true, assumes a big/little-endian swap is needed. 00155 bool DeSerialize(bool swap, FILE* fp); 00156 00157 // Returns true if the given set of fonts includes one with the same 00158 // properties as font_id. 00159 bool SetContainsFontProperties( 00160 int font_id, const GenericVector<ScoredFont>& font_set) const; 00161 // Returns true if the given set of fonts includes multiple properties. 00162 bool SetContainsMultipleFontProperties( 00163 const GenericVector<ScoredFont>& font_set) const; 00164 00165 // Moves any non-empty FontSpacingInfo entries from other to this. 00166 void MoveSpacingInfoFrom(FontInfoTable* other); 00167 // Moves this to the target unicity table. 00168 void MoveTo(UnicityTable<FontInfo>* target); 00169 }; 00170 00171 // Compare FontInfo structures. 00172 bool CompareFontInfo(const FontInfo& fi1, const FontInfo& fi2); 00173 // Compare FontSet structures. 00174 bool CompareFontSet(const FontSet& fs1, const FontSet& fs2); 00175 // Deletion callbacks for GenericVector. 
00176 void FontInfoDeleteCallback(FontInfo f); 00177 void FontSetDeleteCallback(FontSet fs); 00178 00179 // Callbacks used by UnicityTable to read/write FontInfo/FontSet structures. 00180 bool read_info(FILE* f, FontInfo* fi, bool swap); 00181 bool write_info(FILE* f, const FontInfo& fi); 00182 bool read_spacing_info(FILE *f, FontInfo* fi, bool swap); 00183 bool write_spacing_info(FILE* f, const FontInfo& fi); 00184 bool read_set(FILE* f, FontSet* fs, bool swap); 00185 bool write_set(FILE* f, const FontSet& fs); 00186 00187 } // namespace tesseract. 00188 00189 #endif /* THIRD_PARTY_TESSERACT_CCSTRUCT_FONTINFO_H_ */