|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: pango_font_info.cpp 00003 * Description: Font-related objects and helper functions 00004 * Author: Ranjith Unnikrishnan 00005 * Created: Mon Nov 18 2013 00006 * 00007 * (C) Copyright 2013, Google Inc. 00008 * Licensed under the Apache License, Version 2.0 (the "License"); 00009 * you may not use this file except in compliance with the License. 00010 * You may obtain a copy of the License at 00011 * http://www.apache.org/licenses/LICENSE-2.0 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 // Include automatically generated configuration file if running autoconf. 00021 #ifdef HAVE_CONFIG_H 00022 #include "config_auto.h" 00023 #endif 00024 00025 #if (defined MINGW) || (defined __CYGWIN__) 00026 // workaround for stdlib.h and putenv 00027 #undef __STRICT_ANSI__ 00028 #include "strcasestr.h" 00029 #endif // MINGW/Cygwin 00030 #include <stdlib.h> 00031 #include <stdio.h> 00032 #include <string.h> 00033 #include <sys/param.h> 00034 #include <algorithm> 00035 00036 #include "pango_font_info.h" 00037 #include "commandlineflags.h" 00038 #include "fileio.h" 00039 #include "normstrngs.h" 00040 #include "tlog.h" 00041 #include "unichar.h" 00042 #include "util.h" 00043 #include "pango/pango.h" 00044 #include "pango/pangocairo.h" 00045 #include "pango/pangofc-font.h" 00046 00047 STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts", 00048 "Overrides system default font location"); 00049 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", 00050 "Overrides fontconfig default temporary dir"); 00051 BOOL_PARAM_FLAG(fontconfig_refresh_cache, false, 00052 "Does a one-time deletion of cache files from the " 00053 "fontconfig_tmpdir before initializing fontconfig."); 00054 BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true, 00055 "Does a one-time reset of the fontconfig config file to point" 00056 " to fonts_dir before initializing fontconfig. Set to true" 00057 " if fontconfig_refresh_cache is true. Set it to false to use" 00058 " multiple instances in separate processes without having to" 00059 " rescan the fonts_dir, using a previously setup font cache"); 00060 00061 #ifndef USE_STD_NAMESPACE 00062 #include "ocr/trainingdata/typesetting/legacy_fonts.h" 00063 BOOL_PARAM_FLAG(use_only_legacy_fonts, false, 00064 "Overrides --fonts_dir and sets the known universe of fonts to" 00065 "the list in legacy_fonts.h"); 00066 #else 00067 using std::pair; 00068 #endif 00069 00070 namespace tesseract { 00071 00072 // Default assumed output resolution. Required only for providing font metrics 00073 // in pixels. 00074 const int kDefaultResolution = 300; 00075 00076 bool PangoFontInfo::fontconfig_initialized_ = false; 00077 00078 PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) { 00079 Clear(); 00080 } 00081 00082 PangoFontInfo::PangoFontInfo(const string& desc) 00083 : desc_(NULL), resolution_(kDefaultResolution) { 00084 if (!ParseFontDescriptionName(desc)) { 00085 tprintf("ERROR: Could not parse %s\n", desc.c_str()); 00086 Clear(); 00087 } 00088 } 00089 00090 void PangoFontInfo::Clear() { 00091 font_size_ = 0; 00092 is_bold_ = false; 00093 is_italic_ = false; 00094 is_smallcaps_ = false; 00095 is_monospace_ = false; 00096 family_name_.clear(); 00097 font_type_ = UNKNOWN; 00098 if (desc_) { 00099 pango_font_description_free(desc_); 00100 desc_ = NULL; 00101 } 00102 } 00103 00104 string PangoFontInfo::DescriptionName() const { 00105 if (!desc_) return ""; 00106 char* desc_str = pango_font_description_to_string(desc_); 00107 string desc_name(desc_str); 00108 g_free(desc_str); 00109 return desc_name; 00110 } 00111 00112 // Initializes Fontconfig for use by writing a fake fonts.conf file into the 00113 // FLAGS_fontconfigs_tmpdir directory, that points to the supplied 00114 // fonts_dir, and then overrides the FONTCONFIG_PATH environment variable 00115 // to point to this fonts.conf file. If force_clear, the cache is refreshed 00116 // even if it has already been initialized. 00117 void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) { 00118 if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) { 00119 fontconfig_initialized_ = true; 00120 return; 00121 } 00122 if (FLAGS_fontconfig_refresh_cache || force_clear) { 00123 File::DeleteMatchingFiles(File::JoinPath( 00124 FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str()); 00125 } 00126 if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache || 00127 force_clear) { 00128 const int MAX_FONTCONF_FILESIZE = 1024; 00129 char fonts_conf_template[MAX_FONTCONF_FILESIZE]; 00130 snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE, 00131 "<?xml version=\"1.0\"?>\n" 00132 "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n" 00133 "<fontconfig>\n" 00134 "<dir>%s</dir>\n" 00135 "<cachedir>%s</cachedir>\n" 00136 "<config></config>\n" 00137 "</fontconfig>", fonts_dir.c_str(), 00138 FLAGS_fontconfig_tmpdir.c_str()); 00139 string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(), 00140 "fonts.conf"); 00141 File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file); 00142 } 00143 #ifdef _WIN32 00144 std::string env("FONTCONFIG_PATH="); 00145 env.append(FLAGS_fontconfig_tmpdir.c_str()); 00146 putenv(env.c_str()); 00147 putenv("LANG=en_US.utf8"); 00148 #else 00149 setenv("FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(), true); 00150 // Fix the locale so that the reported font names are consistent. 00151 setenv("LANG", "en_US.utf8", true); 00152 #endif // _WIN32 00153 if (!fontconfig_initialized_ || force_clear) { 00154 if (FcInitReinitialize() != FcTrue) { 00155 tprintf("FcInitiReinitialize failed!!\n"); 00156 } 00157 } 00158 fontconfig_initialized_ = true; 00159 FontUtils::ReInit(); 00160 } 00161 00162 static void ListFontFamilies(PangoFontFamily*** families, 00163 int* n_families) { 00164 PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str()); 00165 PangoFontMap* font_map = pango_cairo_font_map_get_default(); 00166 DISABLE_HEAP_LEAK_CHECK; 00167 pango_font_map_list_families(font_map, families, n_families); 00168 } 00169 00170 // Inspects whether a given font family is monospace. If the font is not 00171 // available, it cannot make a decision and returns false by default. 00172 static bool IsMonospaceFontFamily(const char* family_name) { 00173 PangoFontFamily** families = 0; 00174 int n_families = 0; 00175 bool is_monospace = false; 00176 ListFontFamilies(&families, &n_families); 00177 ASSERT_HOST(n_families > 0); 00178 bool found = false; 00179 for (int i = 0; i < n_families; ++i) { 00180 if (!strcasecmp(family_name, pango_font_family_get_name(families[i]))) { 00181 is_monospace = pango_font_family_is_monospace(families[i]); 00182 found = true; 00183 break; 00184 } 00185 } 00186 if (!found) { 00187 tlog(1, "Could not find monospace property of family %s\n", family_name); 00188 } 00189 g_free(families); 00190 return is_monospace; 00191 } 00192 00193 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) { 00194 Clear(); 00195 const char* family = pango_font_description_get_family(desc); 00196 if (!family) { 00197 char* desc_str = pango_font_description_to_string(desc); 00198 tprintf("WARNING: Could not parse family name from description: '%s'\n", 00199 desc_str); 00200 g_free(desc_str); 00201 return false; 00202 } 00203 family_name_ = string(family); 00204 desc_ = pango_font_description_copy(desc); 00205 is_monospace_ = IsMonospaceFontFamily(family); 00206 00207 // Set font size in points 00208 font_size_ = pango_font_description_get_size(desc); 00209 if (!pango_font_description_get_size_is_absolute(desc)) { 00210 font_size_ /= PANGO_SCALE; 00211 } 00212 00213 PangoStyle style = pango_font_description_get_style(desc); 00214 is_italic_ = (PANGO_STYLE_ITALIC == style || 00215 PANGO_STYLE_OBLIQUE == style); 00216 is_smallcaps_ = (pango_font_description_get_variant(desc) 00217 == PANGO_VARIANT_SMALL_CAPS); 00218 00219 is_bold_ = (pango_font_description_get_weight(desc) >= PANGO_WEIGHT_BOLD); 00220 // We don't have a way to detect whether a font is of type Fraktur. The fonts 00221 // we currently use all have "Fraktur" in their family name, so we do a 00222 // fragile but functional check for that here. 00223 is_fraktur_ = (strcasestr(family, "Fraktur") != NULL); 00224 return true; 00225 } 00226 00227 bool PangoFontInfo::ParseFontDescriptionName(const string& name) { 00228 PangoFontDescription *desc = pango_font_description_from_string(name.c_str()); 00229 bool success = ParseFontDescription(desc); 00230 pango_font_description_free(desc); 00231 return success; 00232 } 00233 00234 // Returns the PangoFont structure corresponding to the closest available font 00235 // in the font map. Note that if the font is wholly missing, this could 00236 // correspond to a completely different font family and face. 00237 PangoFont* PangoFontInfo::ToPangoFont() const { 00238 InitFontConfig(false, FLAGS_fonts_dir.c_str()); 00239 PangoFontMap* font_map = pango_cairo_font_map_get_default(); 00240 PangoContext* context = pango_context_new(); 00241 pango_cairo_context_set_resolution(context, resolution_); 00242 pango_context_set_font_map(context, font_map); 00243 PangoFont* font = NULL; 00244 { 00245 DISABLE_HEAP_LEAK_CHECK; 00246 font = pango_font_map_load_font(font_map, context, desc_); 00247 } 00248 g_object_unref(context); 00249 return font; 00250 } 00251 00252 bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const { 00253 PangoFont* font = ToPangoFont(); 00254 PangoCoverage* coverage = pango_font_get_coverage(font, NULL); 00255 for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length); 00256 it != UNICHAR::end(utf8_text, byte_length); 00257 ++it) { 00258 if (IsWhitespace(*it) || pango_is_zero_width(*it)) 00259 continue; 00260 if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) { 00261 char tmp[5]; 00262 int len = it.get_utf8(tmp); 00263 tmp[len] = '\0'; 00264 tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it); 00265 return false; 00266 } 00267 } 00268 return true; 00269 } 00270 00271 // This variant of strncpy permits src and dest to overlap. It will copy the 00272 // first byte first. 00273 static char* my_strnmove(char* dest, const char* src, size_t n) { 00274 char* ret = dest; 00275 00276 // Copy characters until n reaches zero or the src byte is a nul. 00277 do { 00278 *dest = *src; 00279 --n; 00280 ++dest; 00281 ++src; 00282 } while (n && src[0]); 00283 00284 // If we reached a nul byte and there are more 'n' left, zero them out. 00285 while (n) { 00286 *dest = '\0'; 00287 --n; 00288 ++dest; 00289 } 00290 return ret; 00291 } 00292 00293 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const { 00294 PangoFont* font = ToPangoFont(); 00295 PangoCoverage* coverage = pango_font_get_coverage(font, NULL); 00296 int num_dropped_chars = 0; 00297 // Maintain two iterators that point into the string. For space efficiency, we 00298 // will repeatedly copy one covered UTF8 character from one to the other, and 00299 // at the end resize the string to the right length. 00300 char* out = const_cast<char*>(utf8_text->c_str()); 00301 const UNICHAR::const_iterator it_begin = 00302 UNICHAR::begin(utf8_text->c_str(), utf8_text->length()); 00303 const UNICHAR::const_iterator it_end = 00304 UNICHAR::end(utf8_text->c_str(), utf8_text->length()); 00305 for (UNICHAR::const_iterator it = it_begin; it != it_end;) { 00306 // Skip bad utf-8. 00307 if (!it.is_legal()) { 00308 ++it; // One suitable error message will still be issued. 00309 continue; 00310 } 00311 int unicode = *it; 00312 int utf8_len = it.utf8_len(); 00313 const char* utf8_char = it.utf8_data(); 00314 // Move it forward before the data gets modified. 00315 ++it; 00316 if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) && 00317 pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) { 00318 if (TLOG_IS_ON(2)) { 00319 UNICHAR unichar(unicode); 00320 char* str = unichar.utf8_str(); 00321 tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode); 00322 delete[] str; 00323 } 00324 ++num_dropped_chars; 00325 continue; 00326 } 00327 my_strnmove(out, utf8_char, utf8_len); 00328 out += utf8_len; 00329 } 00330 utf8_text->resize(out - utf8_text->c_str()); 00331 return num_dropped_chars; 00332 } 00333 00334 bool PangoFontInfo::GetSpacingProperties(const string& utf8_char, 00335 int* x_bearing, int* x_advance) const { 00336 // Convert to equivalent PangoFont structure 00337 PangoFont* font = ToPangoFont(); 00338 // Find the glyph index in the font for the supplied utf8 character. 00339 int total_advance = 0; 00340 int min_bearing = 0; 00341 // Handle multi-unicode strings by reporting the left-most position of the 00342 // x-bearing, and right-most position of the x-advance if the string were to 00343 // be rendered. 00344 const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(), 00345 utf8_char.length()); 00346 const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(), 00347 utf8_char.length()); 00348 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) { 00349 PangoGlyph glyph_index = pango_fc_font_get_glyph( 00350 reinterpret_cast<PangoFcFont*>(font), *it); 00351 if (!glyph_index) { 00352 // Glyph for given unicode character doesn't exist in font. 00353 return false; 00354 } 00355 // Find the ink glyph extents for the glyph 00356 PangoRectangle ink_rect, logical_rect; 00357 pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect); 00358 pango_extents_to_pixels(&ink_rect, NULL); 00359 pango_extents_to_pixels(&logical_rect, NULL); 00360 00361 int bearing = total_advance + PANGO_LBEARING(ink_rect); 00362 if (it == it_begin || bearing < min_bearing) { 00363 min_bearing = bearing; 00364 } 00365 total_advance += PANGO_RBEARING(logical_rect); 00366 } 00367 *x_bearing = min_bearing; 00368 *x_advance = total_advance; 00369 return true; 00370 } 00371 00372 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const { 00373 vector<string> graphemes; 00374 return CanRenderString(utf8_word, len, &graphemes); 00375 } 00376 00377 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len, 00378 vector<string>* graphemes) const { 00379 if (graphemes) graphemes->clear(); 00380 // We check for font coverage of the text first, as otherwise Pango could 00381 // (undesirably) fall back to another font that does have the required 00382 // coverage. 00383 if (!CoversUTF8Text(utf8_word, len)) { 00384 return false; 00385 } 00386 // U+25CC dotted circle character that often (but not always) gets rendered 00387 // when there is an illegal grapheme sequence. 00388 const char32 kDottedCircleGlyph = 9676; 00389 bool bad_glyph = false; 00390 PangoFontMap* font_map = pango_cairo_font_map_get_default(); 00391 PangoContext* context = pango_context_new(); 00392 pango_context_set_font_map(context, font_map); 00393 PangoLayout* layout; 00394 { 00395 // Pango is not relasing the cached layout. 00396 DISABLE_HEAP_LEAK_CHECK; 00397 layout = pango_layout_new(context); 00398 } 00399 if (desc_) { 00400 pango_layout_set_font_description(layout, desc_); 00401 } else { 00402 PangoFontDescription *desc = pango_font_description_from_string( 00403 DescriptionName().c_str()); 00404 pango_layout_set_font_description(layout, desc); 00405 pango_font_description_free(desc); 00406 } 00407 pango_layout_set_text(layout, utf8_word, len); 00408 PangoLayoutIter* run_iter = NULL; 00409 { // Fontconfig caches some information here that is not freed before exit. 00410 DISABLE_HEAP_LEAK_CHECK; 00411 run_iter = pango_layout_get_iter(layout); 00412 } 00413 do { 00414 PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter); 00415 if (!run) { 00416 tlog(2, "Found end of line NULL run marker\n"); 00417 continue; 00418 } 00419 PangoGlyph dotted_circle_glyph; 00420 PangoFont* font = run->item->analysis.font; 00421 dotted_circle_glyph = pango_fc_font_get_glyph( 00422 reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph); 00423 if (TLOG_IS_ON(2)) { 00424 PangoFontDescription* desc = pango_font_describe(font); 00425 char* desc_str = pango_font_description_to_string(desc); 00426 tlog(2, "Desc of font in run: %s\n", desc_str); 00427 g_free(desc_str); 00428 pango_font_description_free(desc); 00429 } 00430 00431 PangoGlyphItemIter cluster_iter; 00432 gboolean have_cluster; 00433 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter, 00434 run, utf8_word); 00435 have_cluster && !bad_glyph; 00436 have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) { 00437 const int start_byte_index = cluster_iter.start_index; 00438 const int end_byte_index = cluster_iter.end_index; 00439 int start_glyph_index = cluster_iter.start_glyph; 00440 int end_glyph_index = cluster_iter.end_glyph; 00441 string cluster_text = string(utf8_word + start_byte_index, 00442 end_byte_index - start_byte_index); 00443 if (graphemes) graphemes->push_back(cluster_text); 00444 if (IsUTF8Whitespace(cluster_text.c_str())) { 00445 tlog(2, "Skipping whitespace\n"); 00446 continue; 00447 } 00448 if (TLOG_IS_ON(2)) { 00449 printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ", 00450 start_byte_index, end_byte_index, 00451 start_glyph_index, end_glyph_index); 00452 } 00453 for (int i = start_glyph_index, 00454 step = (end_glyph_index > start_glyph_index) ? 1 : -1; 00455 !bad_glyph && i != end_glyph_index; i+= step) { 00456 const bool unknown_glyph = 00457 (cluster_iter.glyph_item->glyphs->glyphs[i].glyph & 00458 PANGO_GLYPH_UNKNOWN_FLAG); 00459 const bool illegal_glyph = 00460 (cluster_iter.glyph_item->glyphs->glyphs[i].glyph == 00461 dotted_circle_glyph); 00462 bad_glyph = unknown_glyph || illegal_glyph; 00463 if (TLOG_IS_ON(2)) { 00464 printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph, 00465 bad_glyph ? 1 : 0); 00466 } 00467 } 00468 if (TLOG_IS_ON(2)) { 00469 printf(" '%s'\n", cluster_text.c_str()); 00470 } 00471 if (bad_glyph) 00472 tlog(1, "Found illegal glyph!\n"); 00473 } 00474 } while (!bad_glyph && pango_layout_iter_next_run(run_iter)); 00475 00476 pango_layout_iter_free(run_iter); 00477 g_object_unref(context); 00478 g_object_unref(layout); 00479 if (bad_glyph && graphemes) graphemes->clear(); 00480 return !bad_glyph; 00481 } 00482 00483 00484 // ------------------------ FontUtils ------------------------------------ 00485 vector<string> FontUtils::available_fonts_; // cache list 00486 00487 // Returns whether the specified font description is available in the fonts 00488 // directory. 00489 // 00490 // The generated list of font families and faces includes "synthesized" font 00491 // faces that are not truly loadable. Pango versions >=1.18 have a 00492 // pango_font_face_is_synthesized method that can be used to prune the list. 00493 // Until then, we are restricted to using a hack where we try to load the font 00494 // from the font_map, and then check what we loaded to see if it has the 00495 // description we expected. If it is not, then the font is deemed unavailable. 00496 /* static */ 00497 bool FontUtils::IsAvailableFont(const char* input_query_desc, 00498 string* best_match) { 00499 string query_desc(input_query_desc); 00500 if (PANGO_VERSION <= 12005) { 00501 // Strip commas and any ' Medium' substring in the name. 00502 query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','), 00503 query_desc.end()); 00504 const string kMediumStr = " Medium"; 00505 std::size_t found = query_desc.find(kMediumStr); 00506 if (found != std::string::npos) { 00507 query_desc.erase(found, kMediumStr.length()); 00508 } 00509 } 00510 00511 PangoFontDescription *desc = pango_font_description_from_string( 00512 query_desc.c_str()); 00513 PangoFont* selected_font = NULL; 00514 { 00515 PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str()); 00516 PangoFontMap* font_map = pango_cairo_font_map_get_default(); 00517 PangoContext* context = pango_context_new(); 00518 pango_context_set_font_map(context, font_map); 00519 { 00520 DISABLE_HEAP_LEAK_CHECK; 00521 selected_font = pango_font_map_load_font(font_map, context, desc); 00522 } 00523 g_object_unref(context); 00524 } 00525 if (selected_font == NULL) { 00526 pango_font_description_free(desc); 00527 return false; 00528 } 00529 PangoFontDescription* selected_desc = pango_font_describe(selected_font); 00530 00531 bool equal = pango_font_description_equal(desc, selected_desc); 00532 tlog(3, "query weight = %d \t selected weight =%d\n", 00533 pango_font_description_get_weight(desc), 00534 pango_font_description_get_weight(selected_desc)); 00535 00536 char* selected_desc_str = pango_font_description_to_string(selected_desc); 00537 tlog(2, "query_desc: '%s' Selected: 's'\n", query_desc.c_str(), 00538 selected_desc_str); 00539 if (!equal && best_match != NULL) { 00540 *best_match = selected_desc_str; 00541 // Clip the ending ' 0' if there is one. It seems that, if there is no 00542 // point size on the end of the fontname, then Pango always appends ' 0'. 00543 int len = best_match->size(); 00544 if (len > 2 && best_match->at(len - 1) == '0' && 00545 best_match->at(len - 2) == ' ') { 00546 *best_match = best_match->substr(0, len - 2); 00547 } 00548 } 00549 g_free(selected_desc_str); 00550 pango_font_description_free(selected_desc); 00551 g_object_unref(selected_font); 00552 pango_font_description_free(desc); 00553 return equal; 00554 } 00555 00556 static bool ShouldIgnoreFontFamilyName(const char* query) { 00557 static const char* kIgnoredFamilyNames[] 00558 = { "Sans", "Serif", "Monospace", NULL }; 00559 const char** list = kIgnoredFamilyNames; 00560 for (; *list != NULL; ++list) { 00561 if (!strcmp(*list, query)) 00562 return true; 00563 } 00564 return false; 00565 } 00566 00567 // Outputs description names of available fonts. 00568 /* static */ 00569 const vector<string>& FontUtils::ListAvailableFonts() { 00570 if (available_fonts_.size()) { 00571 return available_fonts_; 00572 } 00573 #ifndef USE_STD_NAMESPACE 00574 if (FLAGS_use_only_legacy_fonts) { 00575 // Restrict view to list of fonts in legacy_fonts.h 00576 tprintf("Using list of legacy fonts only\n"); 00577 const int kNumFontLists = 4; 00578 for (int i = 0; i < kNumFontLists; ++i) { 00579 for (int j = 0; kFontlists[i][j] != NULL; ++j) { 00580 available_fonts_.push_back(kFontlists[i][j]); 00581 } 00582 } 00583 return available_fonts_; 00584 } 00585 #endif 00586 00587 PangoFontFamily** families = 0; 00588 int n_families = 0; 00589 ListFontFamilies(&families, &n_families); 00590 for (int i = 0; i < n_families; ++i) { 00591 const char* family_name = pango_font_family_get_name(families[i]); 00592 tlog(2, "Listing family %s\n", family_name); 00593 if (ShouldIgnoreFontFamilyName(family_name)) { 00594 continue; 00595 } 00596 00597 int n_faces; 00598 PangoFontFace** faces = NULL; 00599 pango_font_family_list_faces(families[i], &faces, &n_faces); 00600 for (int j = 0; j < n_faces; ++j) { 00601 PangoFontDescription* desc = pango_font_face_describe(faces[j]); 00602 char* desc_str = pango_font_description_to_string(desc); 00603 if (IsAvailableFont(desc_str)) { 00604 available_fonts_.push_back(desc_str); 00605 } 00606 pango_font_description_free(desc); 00607 g_free(desc_str); 00608 } 00609 g_free(faces); 00610 } 00611 g_free(families); 00612 sort(available_fonts_.begin(), available_fonts_.end()); 00613 return available_fonts_; 00614 } 00615 00616 00617 static void CharCoverageMapToBitmap(PangoCoverage* coverage, 00618 vector<bool>* unichar_bitmap) { 00619 const int kMinUnicodeValue = 33; 00620 const int kMaxUnicodeValue = 0x10FFFF; 00621 unichar_bitmap->resize(kMaxUnicodeValue + 1, false); 00622 // Mark off characters that the font can render. 00623 for (int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) { 00624 if (IsInterchangeValid(i)) { 00625 (*unichar_bitmap)[i] 00626 = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT); 00627 } 00628 } 00629 } 00630 00631 /* static */ 00632 void FontUtils::GetAllRenderableCharacters(vector<bool>* unichar_bitmap) { 00633 const vector<string>& all_fonts = ListAvailableFonts(); 00634 return GetAllRenderableCharacters(all_fonts, unichar_bitmap); 00635 } 00636 00637 /* static */ 00638 void FontUtils::GetAllRenderableCharacters(const string& font_name, 00639 vector<bool>* unichar_bitmap) { 00640 PangoFontInfo font_info(font_name); 00641 PangoCoverage* coverage = pango_font_get_coverage( 00642 font_info.ToPangoFont(), NULL); 00643 CharCoverageMapToBitmap(coverage, unichar_bitmap); 00644 } 00645 00646 /* static */ 00647 void FontUtils::GetAllRenderableCharacters(const vector<string>& fonts, 00648 vector<bool>* unichar_bitmap) { 00649 // Form the union of coverage maps from the fonts 00650 PangoCoverage* all_coverage = pango_coverage_new(); 00651 tlog(1, "Processing %d fonts\n", fonts.size()); 00652 for (int i = 0; i < fonts.size(); ++i) { 00653 PangoFontInfo font_info(fonts[i]); 00654 PangoCoverage* coverage = pango_font_get_coverage( 00655 font_info.ToPangoFont(), NULL); 00656 // Mark off characters that any font can render. 00657 pango_coverage_max(all_coverage, coverage); 00658 } 00659 CharCoverageMapToBitmap(all_coverage, unichar_bitmap); 00660 pango_coverage_unref(all_coverage); 00661 } 00662 00663 00664 // Utilities written to be backward compatible with StringRender 00665 00666 /* static */ 00667 int FontUtils::FontScore(const unordered_map<char32, inT64>& ch_map, 00668 const string& fontname, 00669 int* raw_score, 00670 vector<bool>* ch_flags) { 00671 PangoFontInfo font_info; 00672 if (!font_info.ParseFontDescriptionName(fontname)) { 00673 tprintf("ERROR: Could not parse %s\n", fontname.c_str()); 00674 } 00675 PangoFont* font = font_info.ToPangoFont(); 00676 PangoCoverage* coverage = pango_font_get_coverage(font, NULL); 00677 00678 if (ch_flags) { 00679 ch_flags->clear(); 00680 ch_flags->reserve(ch_map.size()); 00681 } 00682 *raw_score = 0; 00683 int ok_chars = 0; 00684 for (unordered_map<char32, inT64>::const_iterator it = ch_map.begin(); 00685 it != ch_map.end(); ++it) { 00686 bool covered = (IsWhitespace(it->first) || 00687 (pango_coverage_get(coverage, it->first) 00688 == PANGO_COVERAGE_EXACT)); 00689 if (covered) { 00690 ++(*raw_score); 00691 ok_chars += it->second; 00692 } 00693 if (ch_flags) { 00694 ch_flags->push_back(covered); 00695 } 00696 } 00697 return ok_chars; 00698 } 00699 00700 00701 /* static */ 00702 string FontUtils::BestFonts(const unordered_map<char32, inT64>& ch_map, 00703 vector<pair<const char*, vector<bool> > >* fonts) { 00704 const double kMinOKFraction = 0.99; 00705 // Weighted fraction of characters that must be renderable in a font to make 00706 // it OK even if the raw count is not good. 00707 const double kMinWeightedFraction = 0.99995; 00708 00709 fonts->clear(); 00710 vector<vector<bool> > font_flags; 00711 vector<int> font_scores; 00712 vector<int> raw_scores; 00713 int most_ok_chars = 0; 00714 int best_raw_score = 0; 00715 const vector<string>& font_names = FontUtils::ListAvailableFonts(); 00716 for (int i = 0; i < font_names.size(); ++i) { 00717 vector<bool> ch_flags; 00718 int raw_score = 0; 00719 int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags); 00720 most_ok_chars = MAX(ok_chars, most_ok_chars); 00721 best_raw_score = MAX(raw_score, best_raw_score); 00722 00723 font_flags.push_back(ch_flags); 00724 font_scores.push_back(ok_chars); 00725 raw_scores.push_back(raw_score); 00726 } 00727 00728 // Now select the fonts with a score above a threshold fraction 00729 // of both the raw and weighted best scores. To prevent bogus fonts being 00730 // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of 00731 // BOTH weighted and raw scores. 00732 // In low character-count scripts, the issue is more getting enough fonts, 00733 // when only 1 or 2 might have all those rare dingbats etc in them, so we 00734 // allow a font with a very high weighted (coverage) score 00735 // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor. 00736 int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction); 00737 int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction); 00738 int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction); 00739 00740 string font_list; 00741 for (int i = 0; i < font_names.size(); ++i) { 00742 int score = font_scores[i]; 00743 int raw_score = raw_scores[i]; 00744 if ((score >= least_good_enough && raw_score >= least_raw_enough) || 00745 score >= override_enough) { 00746 fonts->push_back(make_pair(font_names[i].c_str(), font_flags[i])); 00747 tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n", 00748 font_names[i].c_str(), 00749 100.0 * score / most_ok_chars, 00750 raw_score, 100.0 * raw_score / best_raw_score); 00751 font_list += font_names[i]; 00752 font_list += "\n"; 00753 } else if (score >= least_good_enough || raw_score >= least_raw_enough) { 00754 tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n", 00755 font_names[i].c_str(), 00756 100.0 * score / most_ok_chars, 00757 raw_score, 100.0 * raw_score / best_raw_score); 00758 } 00759 } 00760 return font_list; 00761 } 00762 00763 /* static */ 00764 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len, 00765 string* font_name, vector<string>* graphemes) { 00766 return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name, 00767 graphemes); 00768 } 00769 00770 /* static */ 00771 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len, 00772 const vector<string>& all_fonts, 00773 string* font_name, vector<string>* graphemes) { 00774 if (font_name) font_name->clear(); 00775 if (graphemes) graphemes->clear(); 00776 for (int i = 0; i < all_fonts.size(); ++i) { 00777 PangoFontInfo font; 00778 vector<string> found_graphemes; 00779 ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]), 00780 "Could not parse font desc name %s\n", 00781 all_fonts[i].c_str()); 00782 if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) { 00783 if (graphemes) graphemes->swap(found_graphemes); 00784 if (font_name) *font_name = all_fonts[i]; 00785 return true; 00786 } 00787 } 00788 return false; 00789 } 00790 00791 // PangoFontInfo is reinitialized, so clear the static list of fonts. 00792 /* static */ 00793 void FontUtils::ReInit() { available_fonts_.clear(); } 00794 00795 } // namespace tesseract