tesseract 3.04.01

training/pango_font_info.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        pango_font_info.cpp
00003  * Description: Font-related objects and helper functions
00004  * Author:      Ranjith Unnikrishnan
00005  * Created:     Mon Nov 18 2013
00006  *
00007  * (C) Copyright 2013, Google Inc.
00008  * Licensed under the Apache License, Version 2.0 (the "License");
00009  * you may not use this file except in compliance with the License.
00010  * You may obtain a copy of the License at
00011  * http://www.apache.org/licenses/LICENSE-2.0
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // Include automatically generated configuration file if running autoconf.
00021 #ifdef HAVE_CONFIG_H
00022 #include "config_auto.h"
00023 #endif
00024 
00025 #if (defined MINGW) || (defined __CYGWIN__)
00026 // workaround for stdlib.h and putenv
00027 #undef __STRICT_ANSI__
00028 #include "strcasestr.h"
00029 #endif  // MINGW/Cygwin
00030 #include <stdlib.h>
00031 #include <stdio.h>
00032 #include <string.h>
00033 #include <sys/param.h>
00034 #include <algorithm>
00035 
00036 #include "pango_font_info.h"
00037 #include "commandlineflags.h"
00038 #include "fileio.h"
00039 #include "normstrngs.h"
00040 #include "tlog.h"
00041 #include "unichar.h"
00042 #include "util.h"
00043 #include "pango/pango.h"
00044 #include "pango/pangocairo.h"
00045 #include "pango/pangofc-font.h"
00046 
// Command-line flags controlling where fonts are loaded from and how the
// fontconfig configuration and cache used by this file are set up.

// Font directory; passed as fonts_dir to PangoFontInfo::InitFontConfig by the
// helpers in this file.
STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts",
                  "Overrides system default font location");
// Directory that receives the generated fonts.conf and is named as the
// fontconfig <cachedir> (see PangoFontInfo::InitFontConfig).
STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
                  "Overrides fontconfig default temporary dir");
// When set, InitFontConfig deletes files matching "*cache-?" in
// fontconfig_tmpdir and rewrites fonts.conf.
BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,
                "Does a one-time deletion of cache files from the "
                "fontconfig_tmpdir before initializing fontconfig.");
// When set, InitFontConfig rewrites fonts.conf in fontconfig_tmpdir.
BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true,
                "Does a one-time reset of the fontconfig config file to point"
                " to fonts_dir before initializing fontconfig. Set to true"
                " if fontconfig_refresh_cache is true. Set it to false to use"
                " multiple instances in separate processes without having to"
                " rescan the fonts_dir, using a previously setup font cache");
00060 
00061 #ifndef USE_STD_NAMESPACE
00062 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
00063 BOOL_PARAM_FLAG(use_only_legacy_fonts, false,
00064                 "Overrides --fonts_dir and sets the known universe of fonts to"
00065                 "the list in legacy_fonts.h");
00066 #else
00067 using std::pair;
00068 #endif
00069 
00070 namespace tesseract {
00071 
00072 // Default assumed output resolution. Required only for providing font metrics
00073 // in pixels.
00074 const int kDefaultResolution = 300;
00075 
// Set to true by InitFontConfig(); once set, later calls return early unless
// force_clear is requested.
bool PangoFontInfo::fontconfig_initialized_ = false;
00077 
00078 PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) {
00079   Clear();
00080 }
00081 
00082 PangoFontInfo::PangoFontInfo(const string& desc)
00083     : desc_(NULL), resolution_(kDefaultResolution) {
00084   if (!ParseFontDescriptionName(desc)) {
00085     tprintf("ERROR: Could not parse %s\n", desc.c_str());
00086     Clear();
00087   }
00088 }
00089 
00090 void PangoFontInfo::Clear() {
00091   font_size_ = 0;
00092   is_bold_ = false;
00093   is_italic_ = false;
00094   is_smallcaps_ = false;
00095   is_monospace_ = false;
00096   family_name_.clear();
00097   font_type_ = UNKNOWN;
00098   if (desc_) {
00099     pango_font_description_free(desc_);
00100     desc_ = NULL;
00101   }
00102 }
00103 
00104 string PangoFontInfo::DescriptionName() const {
00105   if (!desc_) return "";
00106   char* desc_str = pango_font_description_to_string(desc_);
00107   string desc_name(desc_str);
00108   g_free(desc_str);
00109   return desc_name;
00110 }
00111 
00112 // Initializes Fontconfig for use by writing a fake fonts.conf file into the
00113 // FLAGS_fontconfigs_tmpdir directory, that points to the supplied
00114 // fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
00115 // to point to this fonts.conf file. If force_clear, the cache is refreshed
00116 // even if it has already been initialized.
00117 void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) {
00118   if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) {
00119     fontconfig_initialized_ = true;
00120     return;
00121   }
00122   if (FLAGS_fontconfig_refresh_cache || force_clear) {
00123     File::DeleteMatchingFiles(File::JoinPath(
00124         FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str());
00125   }
00126   if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache ||
00127       force_clear) {
00128     const int MAX_FONTCONF_FILESIZE = 1024;
00129     char fonts_conf_template[MAX_FONTCONF_FILESIZE];
00130     snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
00131              "<?xml version=\"1.0\"?>\n"
00132              "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
00133              "<fontconfig>\n"
00134              "<dir>%s</dir>\n"
00135              "<cachedir>%s</cachedir>\n"
00136              "<config></config>\n"
00137              "</fontconfig>", fonts_dir.c_str(),
00138              FLAGS_fontconfig_tmpdir.c_str());
00139     string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
00140                                             "fonts.conf");
00141     File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
00142   }
00143 #ifdef _WIN32
00144   std::string env("FONTCONFIG_PATH=");
00145   env.append(FLAGS_fontconfig_tmpdir.c_str());
00146   putenv(env.c_str());
00147   putenv("LANG=en_US.utf8");
00148 #else
00149   setenv("FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(), true);
00150   // Fix the locale so that the reported font names are consistent.
00151   setenv("LANG", "en_US.utf8", true);
00152 #endif  // _WIN32
00153   if (!fontconfig_initialized_ || force_clear) {
00154     if (FcInitReinitialize() != FcTrue) {
00155       tprintf("FcInitiReinitialize failed!!\n");
00156     }
00157   }
00158   fontconfig_initialized_ = true;
00159   FontUtils::ReInit();
00160 }
00161 
// Fills *families and *n_families with the font families known to the default
// Pango/Cairo font map, after making sure fontconfig has been initialized
// against FLAGS_fonts_dir. Callers in this file release the returned array
// with g_free().
static void ListFontFamilies(PangoFontFamily*** families,
                             int* n_families) {
  PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str());
  PangoFontMap* font_map = pango_cairo_font_map_get_default();
  // Leak checking is disabled around the Pango family enumeration below.
  DISABLE_HEAP_LEAK_CHECK;
  pango_font_map_list_families(font_map, families, n_families);
}
00169 
00170 // Inspects whether a given font family is monospace. If the font is not
00171 // available, it cannot make a decision and returns false by default.
00172 static bool IsMonospaceFontFamily(const char* family_name) {
00173   PangoFontFamily** families = 0;
00174   int n_families = 0;
00175   bool is_monospace = false;
00176   ListFontFamilies(&families, &n_families);
00177   ASSERT_HOST(n_families > 0);
00178   bool found = false;
00179   for (int i = 0; i < n_families; ++i) {
00180     if (!strcasecmp(family_name, pango_font_family_get_name(families[i]))) {
00181       is_monospace = pango_font_family_is_monospace(families[i]);
00182       found = true;
00183       break;
00184     }
00185   }
00186   if (!found) {
00187     tlog(1, "Could not find monospace property of family %s\n", family_name);
00188   }
00189   g_free(families);
00190   return is_monospace;
00191 }
00192 
00193 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {
00194   Clear();
00195   const char* family = pango_font_description_get_family(desc);
00196   if (!family) {
00197     char* desc_str = pango_font_description_to_string(desc);
00198     tprintf("WARNING: Could not parse family name from description: '%s'\n",
00199             desc_str);
00200     g_free(desc_str);
00201     return false;
00202   }
00203   family_name_ = string(family);
00204   desc_ = pango_font_description_copy(desc);
00205   is_monospace_ = IsMonospaceFontFamily(family);
00206 
00207   // Set font size in points
00208   font_size_ = pango_font_description_get_size(desc);
00209   if (!pango_font_description_get_size_is_absolute(desc)) {
00210     font_size_ /= PANGO_SCALE;
00211   }
00212 
00213   PangoStyle style = pango_font_description_get_style(desc);
00214   is_italic_ = (PANGO_STYLE_ITALIC == style ||
00215                 PANGO_STYLE_OBLIQUE == style);
00216   is_smallcaps_ = (pango_font_description_get_variant(desc)
00217                    == PANGO_VARIANT_SMALL_CAPS);
00218 
00219   is_bold_ = (pango_font_description_get_weight(desc) >= PANGO_WEIGHT_BOLD);
00220   // We don't have a way to detect whether a font is of type Fraktur. The fonts
00221   // we currently use all have "Fraktur" in their family name, so we do a
00222   // fragile but functional check for that here.
00223   is_fraktur_ = (strcasestr(family, "Fraktur") != NULL);
00224   return true;
00225 }
00226 
00227 bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
00228   PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
00229   bool success = ParseFontDescription(desc);
00230   pango_font_description_free(desc);
00231   return success;
00232 }
00233 
00234 // Returns the PangoFont structure corresponding to the closest available font
00235 // in the font map. Note that if the font is wholly missing, this could
00236 // correspond to a completely different font family and face.
00237 PangoFont* PangoFontInfo::ToPangoFont() const {
00238   InitFontConfig(false, FLAGS_fonts_dir.c_str());
00239   PangoFontMap* font_map = pango_cairo_font_map_get_default();
00240   PangoContext* context = pango_context_new();
00241   pango_cairo_context_set_resolution(context, resolution_);
00242   pango_context_set_font_map(context, font_map);
00243   PangoFont* font = NULL;
00244   {
00245     DISABLE_HEAP_LEAK_CHECK;
00246     font = pango_font_map_load_font(font_map, context, desc_);
00247   }
00248   g_object_unref(context);
00249   return font;
00250 }
00251 
00252 bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const {
00253   PangoFont* font = ToPangoFont();
00254   PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
00255   for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);
00256        it != UNICHAR::end(utf8_text, byte_length);
00257        ++it) {
00258     if (IsWhitespace(*it) || pango_is_zero_width(*it))
00259       continue;
00260     if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
00261       char tmp[5];
00262       int len = it.get_utf8(tmp);
00263       tmp[len] = '\0';
00264       tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
00265       return false;
00266     }
00267   }
00268   return true;
00269 }
00270 
// This variant of strncpy permits src and dest to overlap. It will copy the
// first byte first.
//
// Copies at most n bytes from src to dest, stopping at the first nul byte in
// src; any remaining bytes of the n are set to nul. Nothing is written when
// n is zero, and src is never read past its terminating nul. Returns dest.
static char* my_strnmove(char* dest, const char* src, size_t n) {
  char* ret = dest;

  // Copy characters until n reaches zero or the src byte is a nul. The check
  // comes before the copy so that n == 0 and an empty src are handled safely.
  while (n && *src) {
    *dest = *src;
    --n;
    ++dest;
    ++src;
  }

  // If we reached a nul byte and there are more 'n' left, zero them out.
  while (n) {
    *dest = '\0';
    --n;
    ++dest;
  }
  return ret;
}
00292 
00293 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
00294   PangoFont* font = ToPangoFont();
00295   PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
00296   int num_dropped_chars = 0;
00297   // Maintain two iterators that point into the string. For space efficiency, we
00298   // will repeatedly copy one covered UTF8 character from one to the other, and
00299   // at the end resize the string to the right length.
00300   char* out = const_cast<char*>(utf8_text->c_str());
00301   const UNICHAR::const_iterator it_begin =
00302       UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
00303   const UNICHAR::const_iterator it_end =
00304       UNICHAR::end(utf8_text->c_str(), utf8_text->length());
00305   for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
00306     // Skip bad utf-8.
00307     if (!it.is_legal()) {
00308       ++it;  // One suitable error message will still be issued.
00309       continue;
00310     }
00311     int unicode = *it;
00312     int utf8_len = it.utf8_len();
00313     const char* utf8_char = it.utf8_data();
00314     // Move it forward before the data gets modified.
00315     ++it;
00316     if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
00317         pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
00318       if (TLOG_IS_ON(2)) {
00319         UNICHAR unichar(unicode);
00320         char* str = unichar.utf8_str();
00321         tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
00322         delete[] str;
00323       }
00324       ++num_dropped_chars;
00325       continue;
00326     }
00327     my_strnmove(out, utf8_char, utf8_len);
00328     out += utf8_len;
00329   }
00330   utf8_text->resize(out - utf8_text->c_str());
00331   return num_dropped_chars;
00332 }
00333 
00334 bool PangoFontInfo::GetSpacingProperties(const string& utf8_char,
00335                                          int* x_bearing, int* x_advance) const {
00336   // Convert to equivalent PangoFont structure
00337   PangoFont* font = ToPangoFont();
00338   // Find the glyph index in the font for the supplied utf8 character.
00339   int total_advance = 0;
00340   int min_bearing = 0;
00341   // Handle multi-unicode strings by reporting the left-most position of the
00342   // x-bearing, and right-most position of the x-advance if the string were to
00343   // be rendered.
00344   const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(),
00345                                                           utf8_char.length());
00346   const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(),
00347                                                       utf8_char.length());
00348   for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
00349     PangoGlyph glyph_index = pango_fc_font_get_glyph(
00350         reinterpret_cast<PangoFcFont*>(font), *it);
00351     if (!glyph_index) {
00352       // Glyph for given unicode character doesn't exist in font.
00353       return false;
00354     }
00355     // Find the ink glyph extents for the glyph
00356     PangoRectangle ink_rect, logical_rect;
00357     pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
00358     pango_extents_to_pixels(&ink_rect, NULL);
00359     pango_extents_to_pixels(&logical_rect, NULL);
00360 
00361     int bearing = total_advance + PANGO_LBEARING(ink_rect);
00362     if (it == it_begin || bearing < min_bearing) {
00363       min_bearing = bearing;
00364     }
00365     total_advance += PANGO_RBEARING(logical_rect);
00366   }
00367   *x_bearing = min_bearing;
00368   *x_advance = total_advance;
00369   return true;
00370 }
00371 
00372 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const {
00373   vector<string> graphemes;
00374   return CanRenderString(utf8_word, len, &graphemes);
00375 }
00376 
// Returns true if the first len bytes of utf8_word can be laid out in this
// font without producing an unknown glyph or the dotted-circle glyph.
// If graphemes is non-NULL it is cleared on entry, filled with the text of
// each cluster found by Pango's layout, and cleared again if a bad glyph is
// found. Returns false immediately if CoversUTF8Text() fails.
bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
                                    vector<string>* graphemes) const {
  if (graphemes) graphemes->clear();
  // We check for font coverage of the text first, as otherwise Pango could
  // (undesirably) fall back to another font that does have the required
  // coverage.
  if (!CoversUTF8Text(utf8_word, len)) {
    return false;
  }
  // U+25CC dotted circle character that often (but not always) gets rendered
  // when there is an illegal grapheme sequence.
  const char32 kDottedCircleGlyph = 9676;
  bool bad_glyph = false;
  PangoFontMap* font_map = pango_cairo_font_map_get_default();
  PangoContext* context = pango_context_new();
  pango_context_set_font_map(context, font_map);
  PangoLayout* layout;
  {
    // Pango is not releasing the cached layout.
    DISABLE_HEAP_LEAK_CHECK;
    layout = pango_layout_new(context);
  }
  if (desc_) {
    pango_layout_set_font_description(layout, desc_);
  } else {
    // No stored description: build a temporary one from DescriptionName().
    PangoFontDescription *desc = pango_font_description_from_string(
        DescriptionName().c_str());
    pango_layout_set_font_description(layout, desc);
    pango_font_description_free(desc);
  }
  pango_layout_set_text(layout, utf8_word, len);
  PangoLayoutIter* run_iter = NULL;
  { // Fontconfig caches some information here that is not freed before exit.
    DISABLE_HEAP_LEAK_CHECK;
    run_iter = pango_layout_get_iter(layout);
  }
  // Walk every run of the layout, stopping early once a bad glyph is seen.
  do {
    PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
    if (!run) {
      tlog(2, "Found end of line NULL run marker\n");
      // In a do/while, continue jumps to the loop condition, which advances
      // the iterator to the next run.
      continue;
    }
    // Look up the glyph this run's font uses for the dotted circle so that it
    // can be recognized among the shaped glyphs below.
    PangoGlyph dotted_circle_glyph;
    PangoFont* font = run->item->analysis.font;
    dotted_circle_glyph = pango_fc_font_get_glyph(
        reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
    if (TLOG_IS_ON(2)) {
      PangoFontDescription* desc = pango_font_describe(font);
      char* desc_str = pango_font_description_to_string(desc);
      tlog(2, "Desc of font in run: %s\n", desc_str);
      g_free(desc_str);
      pango_font_description_free(desc);
    }

    // Iterate over the clusters of this run; each cluster maps a byte range
    // of utf8_word to a range of glyphs.
    PangoGlyphItemIter cluster_iter;
    gboolean have_cluster;
    for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
                                                         run, utf8_word);
         have_cluster && !bad_glyph;
         have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
      const int start_byte_index = cluster_iter.start_index;
      const int end_byte_index = cluster_iter.end_index;
      int start_glyph_index = cluster_iter.start_glyph;
      int end_glyph_index = cluster_iter.end_glyph;
      string cluster_text = string(utf8_word + start_byte_index,
                                   end_byte_index - start_byte_index);
      if (graphemes) graphemes->push_back(cluster_text);
      if (IsUTF8Whitespace(cluster_text.c_str())) {
        tlog(2, "Skipping whitespace\n");
        continue;
      }
      if (TLOG_IS_ON(2)) {
        printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ",
               start_byte_index, end_byte_index,
               start_glyph_index, end_glyph_index);
      }
      // The glyph range may run in either direction, so step by +1 or -1
      // from start_glyph_index towards end_glyph_index.
      for (int i = start_glyph_index,
               step = (end_glyph_index > start_glyph_index) ? 1 : -1;
           !bad_glyph && i != end_glyph_index; i+= step) {
        const bool unknown_glyph =
            (cluster_iter.glyph_item->glyphs->glyphs[i].glyph &
             PANGO_GLYPH_UNKNOWN_FLAG);
        const bool illegal_glyph =
            (cluster_iter.glyph_item->glyphs->glyphs[i].glyph ==
             dotted_circle_glyph);
        bad_glyph = unknown_glyph || illegal_glyph;
        if (TLOG_IS_ON(2)) {
          printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph,
                 bad_glyph ? 1 : 0);
        }
      }
      if (TLOG_IS_ON(2)) {
        printf("  '%s'\n", cluster_text.c_str());
      }
      if (bad_glyph)
        tlog(1, "Found illegal glyph!\n");
    }
  } while (!bad_glyph && pango_layout_iter_next_run(run_iter));

  pango_layout_iter_free(run_iter);
  g_object_unref(context);
  g_object_unref(layout);
  // Do not hand back a partial grapheme list for an unrenderable word.
  if (bad_glyph && graphemes) graphemes->clear();
  return !bad_glyph;
}
00482 
00483 
00484 // ------------------------ FontUtils ------------------------------------
// Filled lazily by ListAvailableFonts() and emptied by ReInit().
vector<string> FontUtils::available_fonts_;  // cache list
00486 
00487 // Returns whether the specified font description is available in the fonts
00488 // directory.
00489 //
00490 // The generated list of font families and faces includes "synthesized" font
00491 // faces that are not truly loadable. Pango versions >=1.18 have a
00492 // pango_font_face_is_synthesized method that can be used to prune the list.
00493 // Until then, we are restricted to using a hack where we try to load the font
00494 // from the font_map, and then check what we loaded to see if it has the
00495 // description we expected. If it is not, then the font is deemed unavailable.
00496 /* static */
00497 bool FontUtils::IsAvailableFont(const char* input_query_desc,
00498                                 string* best_match) {
00499   string query_desc(input_query_desc);
00500   if (PANGO_VERSION <= 12005) {
00501     // Strip commas and any ' Medium' substring in the name.
00502     query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','),
00503                      query_desc.end());
00504     const string kMediumStr = " Medium";
00505     std::size_t found = query_desc.find(kMediumStr);
00506     if (found != std::string::npos) {
00507       query_desc.erase(found, kMediumStr.length());
00508     }
00509   }
00510 
00511   PangoFontDescription *desc = pango_font_description_from_string(
00512       query_desc.c_str());
00513   PangoFont* selected_font = NULL;
00514   {
00515     PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str());
00516     PangoFontMap* font_map = pango_cairo_font_map_get_default();
00517     PangoContext* context = pango_context_new();
00518     pango_context_set_font_map(context, font_map);
00519     {
00520       DISABLE_HEAP_LEAK_CHECK;
00521       selected_font = pango_font_map_load_font(font_map, context, desc);
00522     }
00523     g_object_unref(context);
00524   }
00525   if (selected_font == NULL) {
00526     pango_font_description_free(desc);
00527     return false;
00528   }
00529   PangoFontDescription* selected_desc = pango_font_describe(selected_font);
00530 
00531   bool equal = pango_font_description_equal(desc, selected_desc);
00532   tlog(3, "query weight = %d \t selected weight =%d\n",
00533        pango_font_description_get_weight(desc),
00534        pango_font_description_get_weight(selected_desc));
00535 
00536   char* selected_desc_str = pango_font_description_to_string(selected_desc);
00537   tlog(2, "query_desc: '%s' Selected: 's'\n", query_desc.c_str(),
00538        selected_desc_str);
00539   if (!equal && best_match != NULL) {
00540     *best_match = selected_desc_str;
00541     // Clip the ending ' 0' if there is one. It seems that, if there is no
00542     // point size on the end of the fontname, then Pango always appends ' 0'.
00543     int len = best_match->size();
00544     if (len > 2 && best_match->at(len - 1) == '0' &&
00545         best_match->at(len - 2) == ' ') {
00546       *best_match = best_match->substr(0, len - 2);
00547     }
00548   }
00549   g_free(selected_desc_str);
00550   pango_font_description_free(selected_desc);
00551   g_object_unref(selected_font);
00552   pango_font_description_free(desc);
00553   return equal;
00554 }
00555 
// Returns true when query is exactly (case-sensitively) one of the generic
// family names "Sans", "Serif" or "Monospace".
static bool ShouldIgnoreFontFamilyName(const char* query) {
  static const char* const kIgnoredFamilyNames[] = {
      "Sans", "Serif", "Monospace"};
  const int kNumIgnored =
      sizeof(kIgnoredFamilyNames) / sizeof(kIgnoredFamilyNames[0]);
  for (int idx = 0; idx < kNumIgnored; ++idx) {
    if (strcmp(kIgnoredFamilyNames[idx], query) == 0) {
      return true;
    }
  }
  return false;
}
00566 
00567 // Outputs description names of available fonts.
00568 /* static */
00569 const vector<string>& FontUtils::ListAvailableFonts() {
00570   if (available_fonts_.size()) {
00571     return available_fonts_;
00572   }
00573 #ifndef USE_STD_NAMESPACE
00574   if (FLAGS_use_only_legacy_fonts) {
00575     // Restrict view to list of fonts in legacy_fonts.h
00576     tprintf("Using list of legacy fonts only\n");
00577     const int kNumFontLists = 4;
00578     for (int i = 0; i < kNumFontLists; ++i) {
00579       for (int j = 0; kFontlists[i][j] != NULL; ++j) {
00580         available_fonts_.push_back(kFontlists[i][j]);
00581       }
00582     }
00583     return available_fonts_;
00584   }
00585 #endif
00586 
00587   PangoFontFamily** families = 0;
00588   int n_families = 0;
00589   ListFontFamilies(&families, &n_families);
00590   for (int i = 0; i < n_families; ++i) {
00591     const char* family_name = pango_font_family_get_name(families[i]);
00592     tlog(2, "Listing family %s\n", family_name);
00593     if (ShouldIgnoreFontFamilyName(family_name)) {
00594       continue;
00595     }
00596 
00597     int n_faces;
00598     PangoFontFace** faces = NULL;
00599     pango_font_family_list_faces(families[i], &faces, &n_faces);
00600     for (int j = 0; j < n_faces; ++j) {
00601       PangoFontDescription* desc = pango_font_face_describe(faces[j]);
00602       char* desc_str = pango_font_description_to_string(desc);
00603       if (IsAvailableFont(desc_str)) {
00604         available_fonts_.push_back(desc_str);
00605       }
00606       pango_font_description_free(desc);
00607       g_free(desc_str);
00608     }
00609     g_free(faces);
00610   }
00611   g_free(families);
00612   sort(available_fonts_.begin(), available_fonts_.end());
00613   return available_fonts_;
00614 }
00615 
00616 
00617 static void CharCoverageMapToBitmap(PangoCoverage* coverage,
00618                                     vector<bool>* unichar_bitmap) {
00619   const int kMinUnicodeValue = 33;
00620   const int kMaxUnicodeValue = 0x10FFFF;
00621   unichar_bitmap->resize(kMaxUnicodeValue + 1, false);
00622   // Mark off characters that the font can render.
00623   for (int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) {
00624     if (IsInterchangeValid(i)) {
00625       (*unichar_bitmap)[i]
00626           = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT);
00627     }
00628   }
00629 }
00630 
00631 /* static */
00632 void FontUtils::GetAllRenderableCharacters(vector<bool>* unichar_bitmap) {
00633   const vector<string>& all_fonts = ListAvailableFonts();
00634   return GetAllRenderableCharacters(all_fonts, unichar_bitmap);
00635 }
00636 
00637 /* static */
00638 void FontUtils::GetAllRenderableCharacters(const string& font_name,
00639                                            vector<bool>* unichar_bitmap) {
00640   PangoFontInfo font_info(font_name);
00641   PangoCoverage* coverage = pango_font_get_coverage(
00642       font_info.ToPangoFont(), NULL);
00643   CharCoverageMapToBitmap(coverage, unichar_bitmap);
00644 }
00645 
00646 /* static */
00647 void FontUtils::GetAllRenderableCharacters(const vector<string>& fonts,
00648                                            vector<bool>* unichar_bitmap) {
00649   // Form the union of coverage maps from the fonts
00650   PangoCoverage* all_coverage = pango_coverage_new();
00651   tlog(1, "Processing %d fonts\n", fonts.size());
00652   for (int i = 0; i < fonts.size(); ++i) {
00653     PangoFontInfo font_info(fonts[i]);
00654     PangoCoverage* coverage = pango_font_get_coverage(
00655         font_info.ToPangoFont(), NULL);
00656     // Mark off characters that any font can render.
00657     pango_coverage_max(all_coverage, coverage);
00658   }
00659   CharCoverageMapToBitmap(all_coverage, unichar_bitmap);
00660   pango_coverage_unref(all_coverage);
00661 }
00662 
00663 
00664 // Utilities written to be backward compatible with StringRender
00665 
00666 /* static */
00667 int FontUtils::FontScore(const unordered_map<char32, inT64>& ch_map,
00668                          const string& fontname,
00669                          int* raw_score,
00670                          vector<bool>* ch_flags) {
00671   PangoFontInfo font_info;
00672   if (!font_info.ParseFontDescriptionName(fontname)) {
00673     tprintf("ERROR: Could not parse %s\n", fontname.c_str());
00674   }
00675   PangoFont* font = font_info.ToPangoFont();
00676   PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
00677 
00678   if (ch_flags) {
00679     ch_flags->clear();
00680     ch_flags->reserve(ch_map.size());
00681   }
00682   *raw_score = 0;
00683   int ok_chars = 0;
00684   for (unordered_map<char32, inT64>::const_iterator it = ch_map.begin();
00685        it != ch_map.end(); ++it) {
00686     bool covered = (IsWhitespace(it->first) ||
00687                     (pango_coverage_get(coverage, it->first)
00688                      == PANGO_COVERAGE_EXACT));
00689     if (covered) {
00690       ++(*raw_score);
00691       ok_chars += it->second;
00692     }
00693     if (ch_flags) {
00694       ch_flags->push_back(covered);
00695     }
00696   }
00697   return ok_chars;
00698 }
00699 
00700 
00701 /* static */
00702 string FontUtils::BestFonts(const unordered_map<char32, inT64>& ch_map,
00703                             vector<pair<const char*, vector<bool> > >* fonts) {
00704   const double kMinOKFraction = 0.99;
00705   // Weighted fraction of characters that must be renderable in a font to make
00706   // it OK even if the raw count is not good.
00707   const double kMinWeightedFraction = 0.99995;
00708 
00709   fonts->clear();
00710   vector<vector<bool> > font_flags;
00711   vector<int> font_scores;
00712   vector<int> raw_scores;
00713   int most_ok_chars = 0;
00714   int best_raw_score = 0;
00715   const vector<string>& font_names = FontUtils::ListAvailableFonts();
00716   for (int i = 0; i < font_names.size(); ++i) {
00717     vector<bool> ch_flags;
00718     int raw_score = 0;
00719     int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags);
00720     most_ok_chars = MAX(ok_chars, most_ok_chars);
00721     best_raw_score = MAX(raw_score, best_raw_score);
00722 
00723     font_flags.push_back(ch_flags);
00724     font_scores.push_back(ok_chars);
00725     raw_scores.push_back(raw_score);
00726   }
00727 
00728   // Now select the fonts with a score above a threshold fraction
00729   // of both the raw and weighted best scores. To prevent bogus fonts being
00730   // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of
00731   // BOTH weighted and raw scores.
00732   // In low character-count scripts, the issue is more getting enough fonts,
00733   // when only 1 or 2 might have all those rare dingbats etc in them, so we
00734   // allow a font with a very high weighted (coverage) score
00735   // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.
00736   int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);
00737   int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);
00738   int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);
00739 
00740   string font_list;
00741   for (int i = 0; i < font_names.size(); ++i) {
00742     int score = font_scores[i];
00743     int raw_score = raw_scores[i];
00744     if ((score >= least_good_enough && raw_score >= least_raw_enough) ||
00745         score >= override_enough) {
00746       fonts->push_back(make_pair(font_names[i].c_str(), font_flags[i]));
00747       tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n",
00748            font_names[i].c_str(),
00749            100.0 * score / most_ok_chars,
00750            raw_score, 100.0 * raw_score / best_raw_score);
00751       font_list += font_names[i];
00752       font_list += "\n";
00753     } else if (score >= least_good_enough || raw_score >= least_raw_enough) {
00754       tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n",
00755            font_names[i].c_str(),
00756            100.0 * score / most_ok_chars,
00757            raw_score, 100.0 * raw_score / best_raw_score);
00758     }
00759   }
00760   return font_list;
00761 }
00762 
00763 /* static */
00764 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
00765                            string* font_name, vector<string>* graphemes) {
00766   return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name,
00767                     graphemes);
00768 }
00769 
00770 /* static */
00771 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
00772                            const vector<string>& all_fonts,
00773                            string* font_name, vector<string>* graphemes) {
00774   if (font_name) font_name->clear();
00775   if (graphemes) graphemes->clear();
00776   for (int i = 0; i < all_fonts.size(); ++i) {
00777     PangoFontInfo font;
00778     vector<string> found_graphemes;
00779     ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]),
00780                     "Could not parse font desc name %s\n",
00781                     all_fonts[i].c_str());
00782     if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {
00783       if (graphemes) graphemes->swap(found_graphemes);
00784       if (font_name) *font_name = all_fonts[i];
00785       return true;
00786     }
00787   }
00788   return false;
00789 }
00790 
00791 // PangoFontInfo is reinitialized, so clear the static list of fonts.
00792 /* static */
00793 void FontUtils::ReInit() { available_fonts_.clear(); }
00794 
00795 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines