tesseract 3.04.01

training/pango_font_info.h

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        pango_font_info.h
00003  * Description: Font-related objects and helper functions
00004  * Author:      Ranjith Unnikrishnan
00005  * Created:     Mon Nov 18 2013
00006  *
00007  * (C) Copyright 2013, Google Inc.
00008  * Licensed under the Apache License, Version 2.0 (the "License");
00009  * you may not use this file except in compliance with the License.
00010  * You may obtain a copy of the License at
00011  * http://www.apache.org/licenses/LICENSE-2.0
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef TESSERACT_TRAINING_PANGO_FONT_INFO_H_
00021 #define TESSERACT_TRAINING_PANGO_FONT_INFO_H_
00022 
00023 #include <string>
00024 #include <utility>
00025 #include <vector>
00026 
00027 #include "hashfn.h"
00028 #include "host.h"
00029 #include "util.h"
00030 #include "pango/pango-font.h"
00031 
// Character-code type used in this header as the key type of the character
// maps handed to FontUtils::BestFonts() and FontUtils::FontScore().
typedef int char32;
00033 
00034 namespace tesseract {
00035 
00036 // Data holder class for a font, intended to avoid having to work with Pango or
00037 // FontConfig-specific objects directly.
00038 class PangoFontInfo {
00039  public:
00040   enum FontTypeEnum {
00041     UNKNOWN,
00042     SERIF,
00043     SANS_SERIF,
00044     DECORATIVE,
00045   };
00046   PangoFontInfo();
00047   // Initialize from parsing a font description name, defined as a string of the
00048   // format:
00049   //   "FamilyName [FaceName] [PointSize]"
00050   // where a missing FaceName implies the default regular face.
00051   // eg. "Arial Italic 12", "Verdana"
00052   //
00053   // FaceName is a combination of:
00054   //   [StyleName] [Variant] [Weight] [Stretch]
00055   // with (all optional) Pango-defined values of:
00056   // StyleName: Oblique, Italic
00057   // Variant  : Small-Caps
00058   // Weight   : Ultra-Light, Light, Medium, Semi-Bold, Bold, Ultra-Bold, Heavy
00059   // Stretch  : Ultra-Condensed, Extra-Condensed, Condensed, Semi-Condensed,
00060   //            Semi-Expanded, Expanded, Extra-Expanded, Ultra-Expanded.
00061   explicit PangoFontInfo(const string& name);
00062   bool ParseFontDescriptionName(const string& name);
00063 
00064   // Returns true if the font have codepoint coverage for the specified text.
00065   bool CoversUTF8Text(const char* utf8_text, int byte_length) const;
00066   // Modifies string to remove unicode points that are not covered by the
00067   // font. Returns the number of characters dropped.
00068   int DropUncoveredChars(string* utf8_text) const;
00069 
00070   // Returns true if the entire string can be rendered by the font with full
00071   // character coverage and no unknown glyph or dotted-circle glyph
00072   // substitutions on encountering a badly formed unicode sequence.
00073   // If true, returns individual graphemes. Any whitespace characters in the
00074   // original string are also included in the list.
00075   bool CanRenderString(const char* utf8_word, int len,
00076                        vector<string>* graphemes) const;
00077   bool CanRenderString(const char* utf8_word, int len) const;
00078 
00079   // Retrieves the x_bearing and x_advance for the given utf8 character in the
00080   // font. Returns false if the glyph for the character could not be found in
00081   // the font.
00082   // Ref: http://freetype.sourceforge.net/freetype2/docs/glyphs/glyphs-3.html
00083   bool GetSpacingProperties(const string& utf8_char,
00084                             int* x_bearing, int* x_advance) const;
00085 
00086   // Initializes FontConfig by setting its environment variable and creating
00087   // a fonts.conf file that points to the given fonts_dir. Once initialized,
00088   // it is not re-initialized unless force_clear is true.
00089   static void InitFontConfig(bool force_clear, const string& fonts_dir);
00090 
00091   // Accessors
00092   string DescriptionName() const;
00093   // Font Family name eg. "Arial"
00094   const string& family_name() const    { return family_name_; }
00095   // Size in points (1/72"), rounded to the nearest integer.
00096   const int font_size() const          { return font_size_; }
00097   const bool is_bold() const           { return is_bold_; }
00098   const bool is_italic() const         { return is_italic_; }
00099   const bool is_smallcaps() const      { return is_smallcaps_; }
00100   const bool is_monospace() const      { return is_monospace_; }
00101   const bool is_fraktur() const        { return is_fraktur_; }
00102   const FontTypeEnum font_type() const { return font_type_; }
00103 
00104   const int resolution() const         { return resolution_; }
00105   void set_resolution(const int resolution) {
00106     resolution_ = resolution;
00107   }
00108 
00109  private:
00110   friend class FontUtils;
00111   void Clear();
00112   bool ParseFontDescription(const PangoFontDescription* desc);
00113   // Returns the PangoFont structure corresponding to the closest available font
00114   // in the font map.
00115   PangoFont* ToPangoFont() const;
00116 
00117   // Font properties set automatically from parsing the font description name.
00118   string family_name_;
00119   int font_size_;
00120   bool is_bold_;
00121   bool is_italic_;
00122   bool is_smallcaps_;
00123   bool is_monospace_;
00124   bool is_fraktur_;
00125   FontTypeEnum font_type_;
00126   // The Pango description that was used to initialize the instance.
00127   PangoFontDescription* desc_;
00128   // Default output resolution to assume for GetSpacingProperties() and any
00129   // other methods that returns pixel values.
00130   int resolution_;
00131   // Fontconfig operates through an environment variable, so it intrinsically
00132   // cannot be thread-friendly, but you can serialize multiple independent
00133   // font configurations by calling InitFontConfig(true, path).
00134   static bool fontconfig_initialized_;
00135 
00136  private:
00137   PangoFontInfo(const PangoFontInfo&);
00138   void operator=(const PangoFontInfo&);
00139 };
00140 
00141 // Static utility methods for querying font availability and font-selection
00142 // based on codepoint coverage.
00143 class FontUtils {
00144  public:
00145   // Returns true if the font of the given description name is available in the
00146   // target directory specified by --fonts_dir
00147   static bool IsAvailableFont(const char* font_desc) {
00148     return IsAvailableFont(font_desc, NULL);
00149   }
00150   // Returns true if the font of the given description name is available in the
00151   // target directory specified by --fonts_dir. If false is returned, and
00152   // best_match is not NULL, the closest matching font is returned there.
00153   static bool IsAvailableFont(const char* font_desc, string* best_match);
00154   // Outputs description names of available fonts.
00155   static const vector<string>& ListAvailableFonts();
00156 
00157   // Picks font among available fonts that covers and can render the given word,
00158   // and returns the font description name and the decomposition of the word to
00159   // graphemes. Returns false if no suitable font was found.
00160   static bool SelectFont(const char* utf8_word, const int utf8_len,
00161                          string* font_name, vector<string>* graphemes);
00162 
00163   // Picks font among all_fonts that covers and can render the given word,
00164   // and returns the font description name and the decomposition of the word to
00165   // graphemes. Returns false if no suitable font was found.
00166   static bool SelectFont(const char* utf8_word, const int utf8_len,
00167                          const vector<string>& all_fonts,
00168                          string* font_name, vector<string>* graphemes);
00169 
00170   // Returns a bitmask where the value of true at index 'n' implies that unicode
00171   // value 'n' is renderable by at least one available font.
00172   static void GetAllRenderableCharacters(vector<bool>* unichar_bitmap);
00173   // Variant of the above function that inspects only the provided font names.
00174   static void GetAllRenderableCharacters(const vector<string>& font_names,
00175                                          vector<bool>* unichar_bitmap);
00176   static void GetAllRenderableCharacters(const string& font_name,
00177                                          vector<bool>* unichar_bitmap);
00178 
00179   // NOTE: The following utilities were written to be backward compatible with
00180   // StringRender.
00181 
00182   // BestFonts returns a font name and a bit vector of the characters it
00183   // can render for the fonts that score within some fraction of the best
00184   // font on the characters in the given hash map.
00185   // In the flags vector, each flag is set according to whether the
00186   // corresponding character (in order of iterating ch_map) can be rendered.
00187   // The return string is a list of the acceptable fonts that were used.
00188   static string BestFonts(const unordered_map<char32, inT64>& ch_map,
00189       vector<std::pair<const char*, vector<bool> > >* font_flag);
00190 
00191   // FontScore returns the weighted renderability score of the given
00192   // hash map character table in the given font. The unweighted score
00193   // is also returned in raw_score.
00194   // The values in the bool vector ch_flags correspond to whether the
00195   // corresponding character (in order of iterating ch_map) can be rendered.
00196   static int FontScore(const unordered_map<char32, inT64>& ch_map,
00197                        const string& fontname, int* raw_score,
00198                        vector<bool>* ch_flags);
00199 
00200   // PangoFontInfo is reinitialized, so clear the static list of fonts.
00201   static void ReInit();
00202 
00203  private:
00204   static vector<string> available_fonts_;  // cache list
00205 };
00206 }  // namespace tesseract
00207 
00208 #endif  // TESSERACT_TRAINING_PANGO_FONT_INFO_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines