|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: stringrenderer.cpp 00003 * Description: Class for rendering UTF-8 text to an image, and retrieving 00004 * bounding boxes around each grapheme cluster. 00005 * Author: Ranjith Unnikrishnan 00006 * Created: Mon Nov 18 2013 00007 * 00008 * (C) Copyright 2013, Google Inc. 00009 * Licensed under the Apache License, Version 2.0 (the "License"); 00010 * you may not use this file except in compliance with the License. 00011 * You may obtain a copy of the License at 00012 * http://www.apache.org/licenses/LICENSE-2.0 00013 * Unless required by applicable law or agreed to in writing, software 00014 * distributed under the License is distributed on an "AS IS" BASIS, 00015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 * See the License for the specific language governing permissions and 00017 * limitations under the License. 00018 * 00019 **********************************************************************/ 00020 00021 #include "stringrenderer.h" 00022 00023 #include <stdio.h> 00024 #include <string.h> 00025 #include <algorithm> 00026 #include <map> 00027 #include <utility> 00028 #include <vector> 00029 00030 #include "allheaders.h" // from leptonica 00031 #include "boxchar.h" 00032 #include "ligature_table.h" 00033 #include "normstrngs.h" 00034 #include "pango/pango-font.h" 00035 #include "pango/pango-glyph-item.h" 00036 #include "tlog.h" 00037 #include "unichar.h" 00038 #include "unicode/uchar.h" // from libicu 00039 #include "util.h" 00040 00041 #ifdef USE_STD_NAMESPACE 00042 using std::map; 00043 using std::max; 00044 using std::min; 00045 using std::swap; 00046 #endif 00047 00048 namespace tesseract { 00049 00050 static const int kDefaultOutputResolution = 300; 00051 00052 // Word joiner (U+2060) inserted after letters in ngram mode, as per 00053 // recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at 00054 // hyphens and other non-alpha characters. 00055 static const char* kWordJoinerUTF8 = "\u2060"; 00056 static const char32 kWordJoiner = 0x2060; 00057 00058 static bool IsCombiner(int ch) { 00059 const int char_type = u_charType(ch); 00060 return ((char_type == U_NON_SPACING_MARK) || 00061 (char_type == U_ENCLOSING_MARK) || 00062 (char_type == U_COMBINING_SPACING_MARK)); 00063 } 00064 00065 static string EncodeAsUTF8(const char32 ch32) { 00066 UNICHAR uni_ch(ch32); 00067 return string(uni_ch.utf8(), uni_ch.utf8_len()); 00068 } 00069 00070 // Returns true with probability 'prob'. 00071 static bool RandBool(const double prob, TRand* rand) { 00072 if (prob == 1.0) return true; 00073 if (prob == 0.0) return false; 00074 return rand->UnsignedRand(1.0) < prob; 00075 } 00076 00077 /* static */ 00078 Pix* CairoARGB32ToPixFormat(cairo_surface_t *surface) { 00079 if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) { 00080 printf("Unexpected surface format %d\n", 00081 cairo_image_surface_get_format(surface)); 00082 return NULL; 00083 } 00084 const int width = cairo_image_surface_get_width(surface); 00085 const int height = cairo_image_surface_get_height(surface); 00086 Pix* pix = pixCreate(width, height, 32); 00087 int byte_stride = cairo_image_surface_get_stride(surface); 00088 00089 for (int i = 0; i < height; ++i) { 00090 memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1, 00091 cairo_image_surface_get_data(surface) + i * byte_stride, 00092 byte_stride - ((i == height - 1) ? 1 : 0)); 00093 } 00094 return pix; 00095 } 00096 00097 StringRenderer::StringRenderer(const string& font_desc, int page_width, 00098 int page_height) 00099 : page_width_(page_width), 00100 page_height_(page_height), 00101 h_margin_(50), 00102 v_margin_(50), 00103 char_spacing_(0), 00104 leading_(0), 00105 vertical_text_(false), 00106 gravity_hint_strong_(false), 00107 render_fullwidth_latin_(false), 00108 underline_start_prob_(0), 00109 underline_continuation_prob_(0), 00110 underline_style_(PANGO_UNDERLINE_SINGLE), 00111 drop_uncovered_chars_(true), 00112 strip_unrenderable_words_(false), 00113 add_ligatures_(false), 00114 output_word_boxes_(false), 00115 surface_(NULL), 00116 cr_(NULL), 00117 layout_(NULL), 00118 start_box_(0), 00119 page_(0), 00120 box_padding_(0), 00121 total_chars_(0), 00122 font_index_(0), 00123 last_offset_(0) { 00124 pen_color_[0] = 0.0; 00125 pen_color_[1] = 0.0; 00126 pen_color_[2] = 0.0; 00127 set_font(font_desc); 00128 set_resolution(kDefaultOutputResolution); 00129 page_boxes_ = NULL; 00130 } 00131 00132 bool StringRenderer::set_font(const string& desc) { 00133 bool success = font_.ParseFontDescriptionName(desc); 00134 font_.set_resolution(resolution_); 00135 return success; 00136 } 00137 00138 void StringRenderer::set_resolution(const int resolution) { 00139 resolution_ = resolution; 00140 font_.set_resolution(resolution); 00141 } 00142 00143 void StringRenderer::set_underline_start_prob(const double frac) { 00144 underline_start_prob_ = min(max(frac, 0.0), 1.0); 00145 } 00146 00147 void StringRenderer::set_underline_continuation_prob(const double frac) { 00148 underline_continuation_prob_ = min(max(frac, 0.0), 1.0); 00149 } 00150 00151 StringRenderer::~StringRenderer() { 00152 ClearBoxes(); 00153 FreePangoCairo(); 00154 } 00155 00156 void StringRenderer::InitPangoCairo() { 00157 FreePangoCairo(); 00158 surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_, 00159 page_height_); 00160 cr_ = cairo_create(surface_); 00161 { 00162 DISABLE_HEAP_LEAK_CHECK; 00163 layout_ = pango_cairo_create_layout(cr_); 00164 } 00165 00166 if (vertical_text_) { 00167 PangoContext* context = pango_layout_get_context(layout_); 00168 pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST); 00169 if (gravity_hint_strong_) { 00170 pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG); 00171 } 00172 pango_layout_context_changed(layout_); 00173 } 00174 00175 SetLayoutProperties(); 00176 } 00177 00178 void StringRenderer::SetLayoutProperties() { 00179 string font_desc = font_.DescriptionName(); 00180 // Specify the font via a description name 00181 PangoFontDescription *desc = 00182 pango_font_description_from_string(font_desc.c_str()); 00183 // Assign the font description to the layout 00184 pango_layout_set_font_description(layout_, desc); 00185 pango_font_description_free(desc); // free the description 00186 pango_cairo_context_set_resolution(pango_layout_get_context(layout_), 00187 resolution_); 00188 00189 int max_width = page_width_ - 2 * h_margin_; 00190 int max_height = page_height_ - 2 * v_margin_; 00191 tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height); 00192 if (vertical_text_) { 00193 swap(max_width, max_height); 00194 } 00195 pango_layout_set_width(layout_, max_width * PANGO_SCALE); 00196 pango_layout_set_wrap(layout_, PANGO_WRAP_WORD); 00197 00198 // Adjust character spacing 00199 PangoAttrList* attr_list = pango_attr_list_new(); 00200 if (char_spacing_) { 00201 PangoAttribute* spacing_attr = pango_attr_letter_spacing_new( 00202 static_cast<int>(char_spacing_ * PANGO_SCALE + 0.5)); 00203 spacing_attr->start_index = 0; 00204 spacing_attr->end_index = static_cast<guint>(-1); 00205 pango_attr_list_change(attr_list, spacing_attr); 00206 } 00207 pango_layout_set_attributes(layout_, attr_list); 00208 pango_attr_list_unref(attr_list); 00209 // Adjust line spacing 00210 if (leading_) { 00211 pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE); 00212 } 00213 } 00214 00215 void StringRenderer::FreePangoCairo() { 00216 if (layout_) { 00217 g_object_unref(layout_); 00218 layout_ = NULL; 00219 } 00220 if (cr_) { 00221 cairo_destroy(cr_); 00222 cr_ = NULL; 00223 } 00224 if (surface_) { 00225 cairo_surface_destroy(surface_); 00226 surface_ = NULL; 00227 } 00228 } 00229 00230 void StringRenderer::SetWordUnderlineAttributes(const string& page_text) { 00231 if (underline_start_prob_ == 0) return; 00232 PangoAttrList* attr_list = pango_layout_get_attributes(layout_); 00233 00234 const char* text = page_text.c_str(); 00235 int offset = 0; 00236 TRand rand; 00237 bool started_underline = false; 00238 PangoAttribute* und_attr = nullptr; 00239 00240 while (offset < page_text.length()) { 00241 offset += SpanUTF8Whitespace(text + offset); 00242 if (offset == page_text.length()) break; 00243 00244 int word_start = offset; 00245 int word_len = SpanUTF8NotWhitespace(text + offset); 00246 offset += word_len; 00247 if (started_underline) { 00248 // Should we continue the underline to the next word? 00249 if (RandBool(underline_continuation_prob_, &rand)) { 00250 // Continue the current underline to this word. 00251 und_attr->end_index = word_start + word_len; 00252 } else { 00253 // Otherwise end the current underline attribute at the end of the 00254 // previous word. 00255 pango_attr_list_insert(attr_list, und_attr); 00256 started_underline = false; 00257 und_attr = nullptr; 00258 } 00259 } 00260 if (!started_underline && RandBool(underline_start_prob_, &rand)) { 00261 // Start a new underline attribute 00262 und_attr = pango_attr_underline_new(underline_style_); 00263 und_attr->start_index = word_start; 00264 und_attr->end_index = word_start + word_len; 00265 started_underline = true; 00266 } 00267 } 00268 // Finish the current underline attribute at the end of the page. 00269 if (started_underline) { 00270 und_attr->end_index = page_text.length(); 00271 pango_attr_list_insert(attr_list, und_attr); 00272 } 00273 } 00274 00275 // Returns offset in utf8 bytes to first page. 00276 int StringRenderer::FindFirstPageBreakOffset(const char* text, 00277 int text_length) { 00278 if (!text_length) return 0; 00279 const int max_height = (page_height_ - 2 * v_margin_); 00280 const int max_width = (page_width_ - 2 * h_margin_); 00281 const int max_layout_height = vertical_text_ ? max_width : max_height; 00282 00283 UNICHAR::const_iterator it = UNICHAR::begin(text, text_length); 00284 const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length); 00285 const int kMaxUnicodeBufLength = 15000; 00286 for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i); 00287 int buf_length = it.utf8_data() - text; 00288 tlog(1, "len = %d buf_len = %d\n", text_length, buf_length); 00289 pango_layout_set_text(layout_, text, buf_length); 00290 00291 PangoLayoutIter* line_iter = NULL; 00292 { // Fontconfig caches some info here that is not freed before exit. 00293 DISABLE_HEAP_LEAK_CHECK; 00294 line_iter = pango_layout_get_iter(layout_); 00295 } 00296 bool first_page = true; 00297 int page_top = 0; 00298 int offset = buf_length; 00299 do { 00300 // Get bounding box of the current line 00301 PangoRectangle line_ink_rect; 00302 pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, NULL); 00303 pango_extents_to_pixels(&line_ink_rect, NULL); 00304 PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter); 00305 if (first_page) { 00306 page_top = line_ink_rect.y; 00307 first_page = false; 00308 } 00309 int line_bottom = line_ink_rect.y + line_ink_rect.height; 00310 if (line_bottom - page_top > max_layout_height) { 00311 offset = line->start_index; 00312 tlog(1, "Found offset = %d\n", offset); 00313 break; 00314 } 00315 } while (pango_layout_iter_next_line(line_iter)); 00316 pango_layout_iter_free(line_iter); 00317 return offset; 00318 } 00319 00320 const vector<BoxChar*>& StringRenderer::GetBoxes() const { 00321 return boxchars_; 00322 } 00323 00324 Boxa* StringRenderer::GetPageBoxes() const { 00325 return page_boxes_; 00326 } 00327 00328 void StringRenderer::RotatePageBoxes(float rotation) { 00329 BoxChar::RotateBoxes(rotation, page_width_ / 2, page_height_ / 2, 00330 start_box_, boxchars_.size(), &boxchars_); 00331 } 00332 00333 00334 void StringRenderer::ClearBoxes() { 00335 for (int i = 0; i < boxchars_.size(); ++i) 00336 delete boxchars_[i]; 00337 boxchars_.clear(); 00338 boxaDestroy(&page_boxes_); 00339 } 00340 00341 void StringRenderer::WriteAllBoxes(const string& filename) { 00342 BoxChar::PrepareToWrite(&boxchars_); 00343 BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_); 00344 } 00345 00346 // Returns cluster strings in logical order. 00347 bool StringRenderer::GetClusterStrings(vector<string>* cluster_text) { 00348 map<int, string> start_byte_to_text; 00349 PangoLayoutIter* run_iter = pango_layout_get_iter(layout_); 00350 const char* full_text = pango_layout_get_text(layout_); 00351 do { 00352 PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter); 00353 if (!run) { 00354 // End of line NULL run marker 00355 tlog(2, "Found end of line marker\n"); 00356 continue; 00357 } 00358 PangoGlyphItemIter cluster_iter; 00359 gboolean have_cluster; 00360 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter, 00361 run, full_text); 00362 have_cluster; 00363 have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) { 00364 const int start_byte_index = cluster_iter.start_index; 00365 const int end_byte_index = cluster_iter.end_index; 00366 string text = string(full_text + start_byte_index, 00367 end_byte_index - start_byte_index); 00368 if (IsUTF8Whitespace(text.c_str())) { 00369 tlog(2, "Found whitespace\n"); 00370 text = " "; 00371 } 00372 tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index, 00373 end_byte_index, text.c_str()); 00374 if (add_ligatures_) { 00375 // Make sure the output box files have ligatured text in case the font 00376 // decided to use an unmapped glyph. 00377 text = LigatureTable::Get()->AddLigatures(text, NULL); 00378 } 00379 start_byte_to_text[start_byte_index] = text; 00380 } 00381 } while (pango_layout_iter_next_run(run_iter)); 00382 pango_layout_iter_free(run_iter); 00383 00384 cluster_text->clear(); 00385 for (map<int, string>::const_iterator it = start_byte_to_text.begin(); 00386 it != start_byte_to_text.end(); ++it) { 00387 cluster_text->push_back(it->second); 00388 } 00389 return cluster_text->size(); 00390 } 00391 00392 // Merges an array of BoxChars into words based on the identification of 00393 // BoxChars containing the space character as inter-word separators. 00394 // 00395 // Sometime two adjacent characters in the sequence may be detected as lying on 00396 // different lines based on their spatial positions. This may be the result of a 00397 // newline character at end of the last word on a line in the source text, or of 00398 // a discretionary line-break created by Pango at intra-word locations like 00399 // hyphens. When this is detected the word is split at that location into 00400 // multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and 00401 // its bounding box. 00402 static void MergeBoxCharsToWords(vector<BoxChar*>* boxchars) { 00403 vector<BoxChar*> result; 00404 bool started_word = false; 00405 for (int i = 0; i < boxchars->size(); ++i) { 00406 if (boxchars->at(i)->ch() == " " || 00407 boxchars->at(i)->box() == NULL) { 00408 result.push_back(boxchars->at(i)); 00409 boxchars->at(i) = NULL; 00410 started_word = false; 00411 continue; 00412 } 00413 00414 if (!started_word) { 00415 // Begin new word 00416 started_word = true; 00417 result.push_back(boxchars->at(i)); 00418 boxchars->at(i) = NULL; 00419 } else { 00420 BoxChar* last_boxchar = result.back(); 00421 // Compute bounding box union 00422 const Box* box = boxchars->at(i)->box(); 00423 Box* last_box = last_boxchar->mutable_box(); 00424 int left = min(last_box->x, box->x); 00425 int right = max(last_box->x + last_box->w, box->x + box->w); 00426 int top = min(last_box->y, box->y); 00427 int bottom = max(last_box->y + last_box->h, box->y + box->h); 00428 // Conclude that the word was broken to span multiple lines based on the 00429 // size of the merged bounding box in relation to those of the individual 00430 // characters seen so far. 00431 if (right - left > last_box->w + 5 * box->w) { 00432 tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str()); 00433 // Insert a fake interword space and start a new word with the current 00434 // boxchar. 00435 result.push_back(new BoxChar(" ", 1)); 00436 result.push_back(boxchars->at(i)); 00437 boxchars->at(i) = NULL; 00438 continue; 00439 } 00440 // Append to last word 00441 last_boxchar->mutable_ch()->append(boxchars->at(i)->ch()); 00442 last_box->x = left; 00443 last_box->w = right - left; 00444 last_box->y = top; 00445 last_box->h = bottom - top; 00446 delete boxchars->at(i); 00447 boxchars->at(i) = NULL; 00448 } 00449 } 00450 boxchars->swap(result); 00451 } 00452 00453 00454 void StringRenderer::ComputeClusterBoxes() { 00455 const char* text = pango_layout_get_text(layout_); 00456 PangoLayoutIter* cluster_iter = pango_layout_get_iter(layout_); 00457 00458 // Do a first pass to store cluster start indexes. 00459 vector<int> cluster_start_indices; 00460 do { 00461 cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter)); 00462 tlog(3, "Added %d\n", cluster_start_indices.back()); 00463 } while (pango_layout_iter_next_cluster(cluster_iter)); 00464 pango_layout_iter_free(cluster_iter); 00465 cluster_start_indices.push_back(strlen(text)); 00466 tlog(3, "Added last index %d\n", cluster_start_indices.back()); 00467 // Sort the indices and create a map from start to end indices. 00468 sort(cluster_start_indices.begin(), cluster_start_indices.end()); 00469 map<int, int> cluster_start_to_end_index; 00470 for (int i = 0; i < cluster_start_indices.size() - 1; ++i) { 00471 cluster_start_to_end_index[cluster_start_indices[i]] 00472 = cluster_start_indices[i + 1]; 00473 } 00474 00475 // Iterate again to compute cluster boxes and their text with the obtained 00476 // cluster extent information. 00477 cluster_iter = pango_layout_get_iter(layout_); 00478 // Store BoxChars* sorted by their byte start positions 00479 map<int, BoxChar*> start_byte_to_box; 00480 do { 00481 PangoRectangle cluster_rect; 00482 pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, 00483 NULL); 00484 pango_extents_to_pixels(&cluster_rect, NULL); 00485 const int start_byte_index = pango_layout_iter_get_index(cluster_iter); 00486 const int end_byte_index = cluster_start_to_end_index[start_byte_index]; 00487 string cluster_text = string(text + start_byte_index, 00488 end_byte_index - start_byte_index); 00489 if (cluster_text.size() && cluster_text[0] == '\n') { 00490 tlog(2, "Skipping newlines at start of text.\n"); 00491 continue; 00492 } 00493 if (!cluster_rect.width || !cluster_rect.height || 00494 IsUTF8Whitespace(cluster_text.c_str())) { 00495 tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n", 00496 cluster_rect.width, cluster_rect.height, cluster_text.c_str()); 00497 BoxChar* boxchar = new BoxChar(" ", 1); 00498 boxchar->set_page(page_); 00499 start_byte_to_box[start_byte_index] = boxchar; 00500 continue; 00501 } 00502 // Prepare a boxchar for addition at this byte position. 00503 tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n", 00504 cluster_rect.x, cluster_rect.y, 00505 cluster_rect.width, cluster_rect.height, 00506 start_byte_index, end_byte_index, 00507 cluster_text.c_str()); 00508 ASSERT_HOST_MSG(cluster_rect.width, 00509 "cluster_text:%s start_byte_index:%d\n", 00510 cluster_text.c_str(), start_byte_index); 00511 ASSERT_HOST_MSG(cluster_rect.height, 00512 "cluster_text:%s start_byte_index:%d\n", 00513 cluster_text.c_str(), start_byte_index); 00514 if (box_padding_) { 00515 cluster_rect.x = max(0, cluster_rect.x - box_padding_); 00516 cluster_rect.width += 2 * box_padding_; 00517 cluster_rect.y = max(0, cluster_rect.y - box_padding_); 00518 cluster_rect.height += 2 * box_padding_; 00519 } 00520 if (add_ligatures_) { 00521 // Make sure the output box files have ligatured text in case the font 00522 // decided to use an unmapped glyph. 00523 cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, NULL); 00524 } 00525 BoxChar* boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size()); 00526 boxchar->set_page(page_); 00527 boxchar->AddBox(cluster_rect.x, cluster_rect.y, 00528 cluster_rect.width, cluster_rect.height); 00529 start_byte_to_box[start_byte_index] = boxchar; 00530 } while (pango_layout_iter_next_cluster(cluster_iter)); 00531 pango_layout_iter_free(cluster_iter); 00532 00533 // There is a subtle bug in the cluster text reported by the PangoLayoutIter 00534 // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work 00535 // around this, we use text reported using the PangoGlyphIter which is 00536 // accurate. 00537 // TODO(ranjith): Revisit whether this is still needed in newer versions of 00538 // pango. 00539 vector<string> cluster_text; 00540 if (GetClusterStrings(&cluster_text)) { 00541 ASSERT_HOST(cluster_text.size() == start_byte_to_box.size()); 00542 int ind = 0; 00543 for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin(); 00544 it != start_byte_to_box.end(); ++it, ++ind) { 00545 it->second->mutable_ch()->swap(cluster_text[ind]); 00546 } 00547 } 00548 00549 // Append to the boxchars list in byte order. 00550 vector<BoxChar*> page_boxchars; 00551 page_boxchars.reserve(start_byte_to_box.size()); 00552 string last_ch; 00553 for (map<int, BoxChar*>::const_iterator it = start_byte_to_box.begin(); 00554 it != start_byte_to_box.end(); ++it) { 00555 if (it->second->ch() == kWordJoinerUTF8) { 00556 // Skip zero-width joiner characters (ZWJs) here. 00557 delete it->second; 00558 } else { 00559 page_boxchars.push_back(it->second); 00560 } 00561 } 00562 CorrectBoxPositionsToLayout(&page_boxchars); 00563 00564 if (render_fullwidth_latin_) { 00565 for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin(); 00566 it != start_byte_to_box.end(); ++it) { 00567 // Convert fullwidth Latin characters to their halfwidth forms. 00568 string half(ConvertFullwidthLatinToBasicLatin(it->second->ch())); 00569 it->second->mutable_ch()->swap(half); 00570 } 00571 } 00572 00573 // Merge the character boxes into word boxes if we are rendering n-grams. 00574 if (output_word_boxes_) { 00575 MergeBoxCharsToWords(&page_boxchars); 00576 } 00577 00578 boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end()); 00579 00580 // Compute the page bounding box 00581 Box* page_box = NULL; 00582 Boxa* all_boxes = NULL; 00583 for (int i = 0; i < page_boxchars.size(); ++i) { 00584 if (page_boxchars[i]->box() == NULL) continue; 00585 if (all_boxes == NULL) 00586 all_boxes = boxaCreate(0); 00587 boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE); 00588 } 00589 boxaGetExtent(all_boxes, NULL, NULL, &page_box); 00590 boxaDestroy(&all_boxes); 00591 if (page_boxes_ == NULL) 00592 page_boxes_ = boxaCreate(0); 00593 boxaAddBox(page_boxes_, page_box, L_INSERT); 00594 } 00595 00596 00597 void StringRenderer::CorrectBoxPositionsToLayout(vector<BoxChar*>* boxchars) { 00598 if (vertical_text_) { 00599 const double rotation = - pango_gravity_to_rotation( 00600 pango_context_get_base_gravity(pango_layout_get_context(layout_))); 00601 BoxChar::TranslateBoxes(page_width_ - h_margin_, v_margin_, boxchars); 00602 BoxChar::RotateBoxes(rotation, page_width_ - h_margin_, v_margin_, 00603 0, boxchars->size(), boxchars); 00604 } else { 00605 BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars); 00606 } 00607 } 00608 00609 int StringRenderer::StripUnrenderableWords(string* utf8_text) const { 00610 string output_text; 00611 const char* text = utf8_text->c_str(); 00612 int offset = 0; 00613 int num_dropped = 0; 00614 while (offset < utf8_text->length()) { 00615 int space_len = SpanUTF8Whitespace(text + offset); 00616 output_text.append(text + offset, space_len); 00617 offset += space_len; 00618 if (offset == utf8_text->length()) break; 00619 00620 int word_len = SpanUTF8NotWhitespace(text + offset); 00621 if (font_.CanRenderString(text + offset, word_len)) { 00622 output_text.append(text + offset, word_len); 00623 } else { 00624 ++num_dropped; 00625 } 00626 offset += word_len; 00627 } 00628 utf8_text->swap(output_text); 00629 00630 if (num_dropped > 0) { 00631 tprintf("Stripped %d unrenderable words\n", num_dropped); 00632 } 00633 return num_dropped; 00634 } 00635 00636 int StringRenderer::RenderToGrayscaleImage(const char* text, int text_length, 00637 Pix** pix) { 00638 Pix *orig_pix = NULL; 00639 int offset = RenderToImage(text, text_length, &orig_pix); 00640 if (orig_pix) { 00641 *pix = pixConvertTo8(orig_pix, false); 00642 pixDestroy(&orig_pix); 00643 } 00644 return offset; 00645 } 00646 00647 int StringRenderer::RenderToBinaryImage(const char* text, int text_length, 00648 int threshold, Pix** pix) { 00649 Pix *orig_pix = NULL; 00650 int offset = RenderToImage(text, text_length, &orig_pix); 00651 if (orig_pix) { 00652 Pix* gray_pix = pixConvertTo8(orig_pix, false); 00653 pixDestroy(&orig_pix); 00654 *pix = pixThresholdToBinary(gray_pix, threshold); 00655 pixDestroy(&gray_pix); 00656 } else { 00657 *pix = orig_pix; 00658 } 00659 return offset; 00660 } 00661 00662 // Add word joiner (WJ) characters between adjacent non-space characters except 00663 // immediately before a combiner. 00664 /* static */ 00665 string StringRenderer::InsertWordJoiners(const string& text) { 00666 string out_str; 00667 const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(), 00668 text.length()); 00669 for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length()); 00670 it != it_end; ++it) { 00671 // Add the symbol to the output string. 00672 out_str.append(it.utf8_data(), it.utf8_len()); 00673 // Check the next symbol. 00674 UNICHAR::const_iterator next_it = it; 00675 ++next_it; 00676 bool next_char_is_boundary = (next_it == it_end || *next_it == ' '); 00677 bool next_char_is_combiner = (next_it == it_end) ? 00678 false : IsCombiner(*next_it); 00679 if (*it != ' ' && *it != '\n' && !next_char_is_boundary && 00680 !next_char_is_combiner) { 00681 out_str += kWordJoinerUTF8; 00682 } 00683 } 00684 return out_str; 00685 } 00686 00687 // Convert halfwidth Basic Latin characters to their fullwidth forms. 00688 string StringRenderer::ConvertBasicLatinToFullwidthLatin(const string& str) { 00689 string full_str; 00690 const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), 00691 str.length()); 00692 for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length()); 00693 it != it_end; ++it) { 00694 // Convert printable and non-space 7-bit ASCII characters to 00695 // their fullwidth forms. 00696 if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) { 00697 // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII. 00698 char32 full_char = *it + 0xFEE0; 00699 full_str.append(EncodeAsUTF8(full_char)); 00700 } else { 00701 full_str.append(it.utf8_data(), it.utf8_len()); 00702 } 00703 } 00704 return full_str; 00705 } 00706 00707 // Convert fullwidth Latin characters to their halfwidth forms. 00708 string StringRenderer::ConvertFullwidthLatinToBasicLatin(const string& str) { 00709 string half_str; 00710 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length()); 00711 for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length()); 00712 it != it_end; ++it) { 00713 char32 half_char = FullwidthToHalfwidth(*it); 00714 // Convert fullwidth Latin characters to their halfwidth forms 00715 // only if halfwidth forms are printable and non-space 7-bit ASCII. 00716 if (IsInterchangeValid7BitAscii(half_char) && 00717 isprint(half_char) && !isspace(half_char)) { 00718 half_str.append(EncodeAsUTF8(half_char)); 00719 } else { 00720 half_str.append(it.utf8_data(), it.utf8_len()); 00721 } 00722 } 00723 return half_str; 00724 } 00725 00726 // Returns offset to end of text substring rendered in this method. 00727 int StringRenderer::RenderToImage(const char* text, int text_length, 00728 Pix** pix) { 00729 if (pix && *pix) pixDestroy(pix); 00730 InitPangoCairo(); 00731 00732 const int page_offset = FindFirstPageBreakOffset(text, text_length); 00733 if (!page_offset) { 00734 return 0; 00735 } 00736 start_box_ = boxchars_.size(); 00737 00738 if (!vertical_text_) { 00739 // Translate by the specified margin 00740 cairo_translate(cr_, h_margin_, v_margin_); 00741 } else { 00742 // Vertical text rendering is achieved by a two-step process of first 00743 // performing regular horizontal layout with character orientation set to 00744 // EAST, and then translating and rotating the layout before rendering onto 00745 // the desired image surface. The settings required for the former step are 00746 // done within InitPangoCairo(). 00747 // 00748 // Translate to the top-right margin of page 00749 cairo_translate(cr_, page_width_ - h_margin_, v_margin_); 00750 // Rotate the layout 00751 double rotation = - pango_gravity_to_rotation( 00752 pango_context_get_base_gravity(pango_layout_get_context(layout_))); 00753 tlog(2, "Rotating by %f radians\n", rotation); 00754 cairo_rotate(cr_, rotation); 00755 pango_cairo_update_layout(cr_, layout_); 00756 } 00757 string page_text(text, page_offset); 00758 if (render_fullwidth_latin_) { 00759 // Convert Basic Latin to their fullwidth forms. 00760 page_text = ConvertBasicLatinToFullwidthLatin(page_text); 00761 } 00762 if (strip_unrenderable_words_) { 00763 StripUnrenderableWords(&page_text); 00764 } 00765 if (drop_uncovered_chars_ && 00766 !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) { 00767 int num_dropped = font_.DropUncoveredChars(&page_text); 00768 if (num_dropped) { 00769 tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped); 00770 } 00771 } 00772 if (add_ligatures_) { 00773 // Add ligatures wherever possible, including custom ligatures. 00774 page_text = LigatureTable::Get()->AddLigatures(page_text, &font_); 00775 } 00776 if (underline_start_prob_ > 0) { 00777 SetWordUnderlineAttributes(page_text); 00778 } 00779 00780 pango_layout_set_text(layout_, page_text.c_str(), page_text.length()); 00781 00782 if (pix) { 00783 // Set a white background for the target image surface. 00784 cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0); // sets drawing colour to white 00785 // Fill the surface with the active colour (if you don't do this, you will 00786 // be given a surface with a transparent background to draw on) 00787 cairo_paint(cr_); 00788 // Set the ink color to black 00789 cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]); 00790 // If the target surface or transformation properties of the cairo instance 00791 // have changed, update the pango layout to reflect this 00792 pango_cairo_update_layout(cr_, layout_); 00793 { 00794 DISABLE_HEAP_LEAK_CHECK; // for Fontconfig 00795 // Draw the pango layout onto the cairo surface 00796 pango_cairo_show_layout(cr_, layout_); 00797 } 00798 *pix = CairoARGB32ToPixFormat(surface_); 00799 } 00800 ComputeClusterBoxes(); 00801 FreePangoCairo(); 00802 // Update internal state variables. 00803 ++page_; 00804 return page_offset; 00805 } 00806 00807 // Render a string to an image, returning it as an 8 bit pix. Behaves as 00808 // RenderString, except that it ignores the font set at construction and works 00809 // through all the fonts, returning 0 until they are exhausted, at which point 00810 // it returns the value it should have returned all along, but no pix this time. 00811 // Fonts that don't contain a given proportion of the characters in the string 00812 // get skipped. 00813 // Fonts that work each get rendered and the font name gets added 00814 // to the image. 00815 // NOTE that no boxes are produced by this function. 00816 // 00817 // Example usage: To render a null terminated char-array "txt" 00818 // 00819 // int offset = 0; 00820 // do { 00821 // Pix *pix; 00822 // offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset, 00823 // strlen(txt + offset), NULL, &pix); 00824 // ... 00825 // } while (offset < strlen(text)); 00826 // 00827 int StringRenderer::RenderAllFontsToImage(double min_coverage, 00828 const char* text, int text_length, 00829 string* font_used, Pix** image) { 00830 *image = NULL; 00831 // Select a suitable font to render the title with. 00832 const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%"; 00833 string title_font; 00834 if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate), 00835 &title_font, NULL)) { 00836 tprintf("WARNING: Could not find a font to render image title with!\n"); 00837 title_font = "Arial"; 00838 } 00839 title_font += " 8"; 00840 tlog(1, "Selected title font: %s\n", title_font.c_str()); 00841 if (font_used) font_used->clear(); 00842 00843 string orig_font = font_.DescriptionName(); 00844 if (char_map_.empty()) { 00845 total_chars_ = 0; 00846 // Fill the hash table and use that for computing which fonts to use. 00847 for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length); 00848 it != UNICHAR::end(text, text_length); ++it) { 00849 ++total_chars_; 00850 ++char_map_[*it]; 00851 } 00852 tprintf("Total chars = %d\n", total_chars_); 00853 } 00854 const vector<string>& all_fonts = FontUtils::ListAvailableFonts(); 00855 for (int i = font_index_; i < all_fonts.size(); ++i) { 00856 ++font_index_; 00857 int raw_score = 0; 00858 int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, 00859 NULL); 00860 if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) { 00861 set_font(all_fonts[i]); 00862 int offset = RenderToBinaryImage(text, text_length, 128, image); 00863 ClearBoxes(); // Get rid of them as they are garbage. 00864 const int kMaxTitleLength = 1024; 00865 char title[kMaxTitleLength]; 00866 snprintf(title, kMaxTitleLength, kTitleTemplate, 00867 all_fonts[i].c_str(), ok_chars, 00868 100.0 * ok_chars / total_chars_, raw_score, 00869 100.0 * raw_score / char_map_.size()); 00870 tprintf("%s\n", title); 00871 // This is a good font! Store the offset to return once we've tried all 00872 // the fonts. 00873 if (offset) { 00874 last_offset_ = offset; 00875 if (font_used) *font_used = all_fonts[i]; 00876 } 00877 // Add the font to the image. 00878 set_font(title_font); 00879 v_margin_ /= 8; 00880 Pix* title_image = NULL; 00881 RenderToBinaryImage(title, strlen(title), 128, &title_image); 00882 pixOr(*image, *image, title_image); 00883 pixDestroy(&title_image); 00884 00885 v_margin_ *= 8; 00886 set_font(orig_font); 00887 // We return the real offset only after cycling through the list of fonts. 00888 return 0; 00889 } else { 00890 tprintf("Font %s failed with %d hits = %.2f%%\n", 00891 all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_); 00892 } 00893 } 00894 font_index_ = 0; 00895 char_map_.clear(); 00896 return last_offset_ == 0 ? -1 : last_offset_; 00897 } 00898 00899 } // namespace tesseract