|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: pango_font_info.h 00003 * Description: Font-related objects and helper functions 00004 * Author: Ranjith Unnikrishnan 00005 * Created: Mon Nov 18 2013 00006 * 00007 * (C) Copyright 2013, Google Inc. 00008 * Licensed under the Apache License, Version 2.0 (the "License"); 00009 * you may not use this file except in compliance with the License. 00010 * You may obtain a copy of the License at 00011 * http://www.apache.org/licenses/LICENSE-2.0 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifndef TESSERACT_TRAINING_PANGO_FONT_INFO_H_ 00021 #define TESSERACT_TRAINING_PANGO_FONT_INFO_H_ 00022 00023 #include <string> 00024 #include <utility> 00025 #include <vector> 00026 00027 #include "hashfn.h" 00028 #include "host.h" 00029 #include "util.h" 00030 #include "pango/pango-font.h" 00031 00032 typedef signed int char32; 00033 00034 namespace tesseract { 00035 00036 // Data holder class for a font, intended to avoid having to work with Pango or 00037 // FontConfig-specific objects directly. 00038 class PangoFontInfo { 00039 public: 00040 enum FontTypeEnum { 00041 UNKNOWN, 00042 SERIF, 00043 SANS_SERIF, 00044 DECORATIVE, 00045 }; 00046 PangoFontInfo(); 00047 // Initialize from parsing a font description name, defined as a string of the 00048 // format: 00049 // "FamilyName [FaceName] [PointSize]" 00050 // where a missing FaceName implies the default regular face. 00051 // eg. "Arial Italic 12", "Verdana" 00052 // 00053 // FaceName is a combination of: 00054 // [StyleName] [Variant] [Weight] [Stretch] 00055 // with (all optional) Pango-defined values of: 00056 // StyleName: Oblique, Italic 00057 // Variant : Small-Caps 00058 // Weight : Ultra-Light, Light, Medium, Semi-Bold, Bold, Ultra-Bold, Heavy 00059 // Stretch : Ultra-Condensed, Extra-Condensed, Condensed, Semi-Condensed, 00060 // Semi-Expanded, Expanded, Extra-Expanded, Ultra-Expanded. 00061 explicit PangoFontInfo(const string& name); 00062 bool ParseFontDescriptionName(const string& name); 00063 00064 // Returns true if the font have codepoint coverage for the specified text. 00065 bool CoversUTF8Text(const char* utf8_text, int byte_length) const; 00066 // Modifies string to remove unicode points that are not covered by the 00067 // font. Returns the number of characters dropped. 00068 int DropUncoveredChars(string* utf8_text) const; 00069 00070 // Returns true if the entire string can be rendered by the font with full 00071 // character coverage and no unknown glyph or dotted-circle glyph 00072 // substitutions on encountering a badly formed unicode sequence. 00073 // If true, returns individual graphemes. Any whitespace characters in the 00074 // original string are also included in the list. 00075 bool CanRenderString(const char* utf8_word, int len, 00076 vector<string>* graphemes) const; 00077 bool CanRenderString(const char* utf8_word, int len) const; 00078 00079 // Retrieves the x_bearing and x_advance for the given utf8 character in the 00080 // font. Returns false if the glyph for the character could not be found in 00081 // the font. 00082 // Ref: http://freetype.sourceforge.net/freetype2/docs/glyphs/glyphs-3.html 00083 bool GetSpacingProperties(const string& utf8_char, 00084 int* x_bearing, int* x_advance) const; 00085 00086 // Initializes FontConfig by setting its environment variable and creating 00087 // a fonts.conf file that points to the given fonts_dir. Once initialized, 00088 // it is not re-initialized unless force_clear is true. 00089 static void InitFontConfig(bool force_clear, const string& fonts_dir); 00090 00091 // Accessors 00092 string DescriptionName() const; 00093 // Font Family name eg. "Arial" 00094 const string& family_name() const { return family_name_; } 00095 // Size in points (1/72"), rounded to the nearest integer. 00096 const int font_size() const { return font_size_; } 00097 const bool is_bold() const { return is_bold_; } 00098 const bool is_italic() const { return is_italic_; } 00099 const bool is_smallcaps() const { return is_smallcaps_; } 00100 const bool is_monospace() const { return is_monospace_; } 00101 const bool is_fraktur() const { return is_fraktur_; } 00102 const FontTypeEnum font_type() const { return font_type_; } 00103 00104 const int resolution() const { return resolution_; } 00105 void set_resolution(const int resolution) { 00106 resolution_ = resolution; 00107 } 00108 00109 private: 00110 friend class FontUtils; 00111 void Clear(); 00112 bool ParseFontDescription(const PangoFontDescription* desc); 00113 // Returns the PangoFont structure corresponding to the closest available font 00114 // in the font map. 00115 PangoFont* ToPangoFont() const; 00116 00117 // Font properties set automatically from parsing the font description name. 00118 string family_name_; 00119 int font_size_; 00120 bool is_bold_; 00121 bool is_italic_; 00122 bool is_smallcaps_; 00123 bool is_monospace_; 00124 bool is_fraktur_; 00125 FontTypeEnum font_type_; 00126 // The Pango description that was used to initialize the instance. 00127 PangoFontDescription* desc_; 00128 // Default output resolution to assume for GetSpacingProperties() and any 00129 // other methods that returns pixel values. 00130 int resolution_; 00131 // Fontconfig operates through an environment variable, so it intrinsically 00132 // cannot be thread-friendly, but you can serialize multiple independent 00133 // font configurations by calling InitFontConfig(true, path). 00134 static bool fontconfig_initialized_; 00135 00136 private: 00137 PangoFontInfo(const PangoFontInfo&); 00138 void operator=(const PangoFontInfo&); 00139 }; 00140 00141 // Static utility methods for querying font availability and font-selection 00142 // based on codepoint coverage. 00143 class FontUtils { 00144 public: 00145 // Returns true if the font of the given description name is available in the 00146 // target directory specified by --fonts_dir 00147 static bool IsAvailableFont(const char* font_desc) { 00148 return IsAvailableFont(font_desc, NULL); 00149 } 00150 // Returns true if the font of the given description name is available in the 00151 // target directory specified by --fonts_dir. If false is returned, and 00152 // best_match is not NULL, the closest matching font is returned there. 00153 static bool IsAvailableFont(const char* font_desc, string* best_match); 00154 // Outputs description names of available fonts. 00155 static const vector<string>& ListAvailableFonts(); 00156 00157 // Picks font among available fonts that covers and can render the given word, 00158 // and returns the font description name and the decomposition of the word to 00159 // graphemes. Returns false if no suitable font was found. 00160 static bool SelectFont(const char* utf8_word, const int utf8_len, 00161 string* font_name, vector<string>* graphemes); 00162 00163 // Picks font among all_fonts that covers and can render the given word, 00164 // and returns the font description name and the decomposition of the word to 00165 // graphemes. Returns false if no suitable font was found. 00166 static bool SelectFont(const char* utf8_word, const int utf8_len, 00167 const vector<string>& all_fonts, 00168 string* font_name, vector<string>* graphemes); 00169 00170 // Returns a bitmask where the value of true at index 'n' implies that unicode 00171 // value 'n' is renderable by at least one available font. 00172 static void GetAllRenderableCharacters(vector<bool>* unichar_bitmap); 00173 // Variant of the above function that inspects only the provided font names. 00174 static void GetAllRenderableCharacters(const vector<string>& font_names, 00175 vector<bool>* unichar_bitmap); 00176 static void GetAllRenderableCharacters(const string& font_name, 00177 vector<bool>* unichar_bitmap); 00178 00179 // NOTE: The following utilities were written to be backward compatible with 00180 // StringRender. 00181 00182 // BestFonts returns a font name and a bit vector of the characters it 00183 // can render for the fonts that score within some fraction of the best 00184 // font on the characters in the given hash map. 00185 // In the flags vector, each flag is set according to whether the 00186 // corresponding character (in order of iterating ch_map) can be rendered. 00187 // The return string is a list of the acceptable fonts that were used. 00188 static string BestFonts(const unordered_map<char32, inT64>& ch_map, 00189 vector<std::pair<const char*, vector<bool> > >* font_flag); 00190 00191 // FontScore returns the weighted renderability score of the given 00192 // hash map character table in the given font. The unweighted score 00193 // is also returned in raw_score. 00194 // The values in the bool vector ch_flags correspond to whether the 00195 // corresponding character (in order of iterating ch_map) can be rendered. 00196 static int FontScore(const unordered_map<char32, inT64>& ch_map, 00197 const string& fontname, int* raw_score, 00198 vector<bool>* ch_flags); 00199 00200 // PangoFontInfo is reinitialized, so clear the static list of fonts. 00201 static void ReInit(); 00202 00203 private: 00204 static vector<string> available_fonts_; // cache list 00205 }; 00206 } // namespace tesseract 00207 00208 #endif // TESSERACT_TRAINING_PANGO_FONT_INFO_H_