tesseract 3.04.01

training/stringrenderer.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        stringrenderer.cpp
00003  * Description: Class for rendering UTF-8 text to an image, and retrieving
00004  *              bounding boxes around each grapheme cluster.
00005  * Author:      Ranjith Unnikrishnan
00006  * Created:     Mon Nov 18 2013
00007  *
00008  * (C) Copyright 2013, Google Inc.
00009  * Licensed under the Apache License, Version 2.0 (the "License");
00010  * you may not use this file except in compliance with the License.
00011  * You may obtain a copy of the License at
00012  * http://www.apache.org/licenses/LICENSE-2.0
00013  * Unless required by applicable law or agreed to in writing, software
00014  * distributed under the License is distributed on an "AS IS" BASIS,
00015  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016  * See the License for the specific language governing permissions and
00017  * limitations under the License.
00018  *
00019  **********************************************************************/
00020 
00021 #include "stringrenderer.h"
00022 
00023 #include <stdio.h>
00024 #include <string.h>
00025 #include <algorithm>
00026 #include <map>
00027 #include <utility>
00028 #include <vector>
00029 
00030 #include "allheaders.h"     // from leptonica
00031 #include "boxchar.h"
00032 #include "ligature_table.h"
00033 #include "normstrngs.h"
00034 #include "pango/pango-font.h"
00035 #include "pango/pango-glyph-item.h"
00036 #include "tlog.h"
00037 #include "unichar.h"
00038 #include "unicode/uchar.h"  // from libicu
00039 #include "util.h"
00040 
00041 #ifdef USE_STD_NAMESPACE
00042 using std::map;
00043 using std::max;
00044 using std::min;
00045 using std::swap;
00046 #endif
00047 
00048 namespace tesseract {
00049 
00050 static const int kDefaultOutputResolution = 300;
00051 
00052 // Word joiner (U+2060) inserted after letters in ngram mode, as per
00053 // recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at
00054 // hyphens and other non-alpha characters.
00055 static const char* kWordJoinerUTF8 = "\u2060";
00056 static const char32 kWordJoiner = 0x2060;
00057 
00058 static bool IsCombiner(int ch) {
00059   const int char_type = u_charType(ch);
00060   return ((char_type == U_NON_SPACING_MARK) ||
00061           (char_type == U_ENCLOSING_MARK) ||
00062           (char_type == U_COMBINING_SPACING_MARK));
00063 }
00064 
00065 static string EncodeAsUTF8(const char32 ch32) {
00066   UNICHAR uni_ch(ch32);
00067   return string(uni_ch.utf8(), uni_ch.utf8_len());
00068 }
00069 
00070 // Returns true with probability 'prob'.
00071 static bool RandBool(const double prob, TRand* rand) {
00072   if (prob == 1.0) return true;
00073   if (prob == 0.0) return false;
00074   return rand->UnsignedRand(1.0) < prob;
00075 }
00076 
00077 /* static */
00078 Pix* CairoARGB32ToPixFormat(cairo_surface_t *surface) {
00079   if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
00080     printf("Unexpected surface format %d\n",
00081            cairo_image_surface_get_format(surface));
00082     return NULL;
00083   }
00084   const int width = cairo_image_surface_get_width(surface);
00085   const int height = cairo_image_surface_get_height(surface);
00086   Pix* pix = pixCreate(width, height, 32);
00087   int byte_stride = cairo_image_surface_get_stride(surface);
00088 
00089   for (int i = 0; i < height; ++i) {
00090     memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
00091            cairo_image_surface_get_data(surface) + i * byte_stride,
00092            byte_stride - ((i == height - 1) ? 1 : 0));
00093   }
00094   return pix;
00095 }
00096 
00097 StringRenderer::StringRenderer(const string& font_desc, int page_width,
00098                                int page_height)
00099     : page_width_(page_width),
00100       page_height_(page_height),
00101       h_margin_(50),
00102       v_margin_(50),
00103       char_spacing_(0),
00104       leading_(0),
00105       vertical_text_(false),
00106       gravity_hint_strong_(false),
00107       render_fullwidth_latin_(false),
00108       underline_start_prob_(0),
00109       underline_continuation_prob_(0),
00110       underline_style_(PANGO_UNDERLINE_SINGLE),
00111       drop_uncovered_chars_(true),
00112       strip_unrenderable_words_(false),
00113       add_ligatures_(false),
00114       output_word_boxes_(false),
00115       surface_(NULL),
00116       cr_(NULL),
00117       layout_(NULL),
00118       start_box_(0),
00119       page_(0),
00120       box_padding_(0),
00121       total_chars_(0),
00122       font_index_(0),
00123       last_offset_(0) {
00124   pen_color_[0] = 0.0;
00125   pen_color_[1] = 0.0;
00126   pen_color_[2] = 0.0;
00127   set_font(font_desc);
00128   set_resolution(kDefaultOutputResolution);
00129   page_boxes_ = NULL;
00130 }
00131 
00132 bool StringRenderer::set_font(const string& desc) {
00133   bool success = font_.ParseFontDescriptionName(desc);
00134   font_.set_resolution(resolution_);
00135   return success;
00136 }
00137 
00138 void StringRenderer::set_resolution(const int resolution) {
00139   resolution_ = resolution;
00140   font_.set_resolution(resolution);
00141 }
00142 
00143 void StringRenderer::set_underline_start_prob(const double frac) {
00144   underline_start_prob_ = min(max(frac, 0.0), 1.0);
00145 }
00146 
00147 void StringRenderer::set_underline_continuation_prob(const double frac) {
00148   underline_continuation_prob_ = min(max(frac, 0.0), 1.0);
00149 }
00150 
00151 StringRenderer::~StringRenderer() {
00152   ClearBoxes();
00153   FreePangoCairo();
00154 }
00155 
00156 void StringRenderer::InitPangoCairo() {
00157   FreePangoCairo();
00158   surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_,
00159                                         page_height_);
00160   cr_ = cairo_create(surface_);
00161   {
00162     DISABLE_HEAP_LEAK_CHECK;
00163     layout_ = pango_cairo_create_layout(cr_);
00164   }
00165 
00166   if (vertical_text_) {
00167     PangoContext* context = pango_layout_get_context(layout_);
00168     pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
00169     if (gravity_hint_strong_) {
00170       pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
00171     }
00172     pango_layout_context_changed(layout_);
00173   }
00174 
00175   SetLayoutProperties();
00176 }
00177 
00178 void StringRenderer::SetLayoutProperties() {
00179   string font_desc = font_.DescriptionName();
00180   // Specify the font via a description name
00181   PangoFontDescription *desc =
00182       pango_font_description_from_string(font_desc.c_str());
00183   // Assign the font description to the layout
00184   pango_layout_set_font_description(layout_, desc);
00185   pango_font_description_free(desc);  // free the description
00186   pango_cairo_context_set_resolution(pango_layout_get_context(layout_),
00187                                      resolution_);
00188 
00189   int max_width = page_width_ - 2 * h_margin_;
00190   int max_height = page_height_ - 2 * v_margin_;
00191   tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height);
00192   if (vertical_text_) {
00193     swap(max_width, max_height);
00194   }
00195   pango_layout_set_width(layout_, max_width * PANGO_SCALE);
00196   pango_layout_set_wrap(layout_, PANGO_WRAP_WORD);
00197 
00198   // Adjust character spacing
00199   PangoAttrList* attr_list = pango_attr_list_new();
00200   if (char_spacing_) {
00201     PangoAttribute* spacing_attr = pango_attr_letter_spacing_new(
00202         static_cast<int>(char_spacing_ * PANGO_SCALE + 0.5));
00203     spacing_attr->start_index = 0;
00204     spacing_attr->end_index = static_cast<guint>(-1);
00205     pango_attr_list_change(attr_list, spacing_attr);
00206   }
00207   pango_layout_set_attributes(layout_, attr_list);
00208   pango_attr_list_unref(attr_list);
00209   // Adjust line spacing
00210   if (leading_) {
00211     pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
00212   }
00213 }
00214 
00215 void StringRenderer::FreePangoCairo() {
00216   if (layout_) {
00217     g_object_unref(layout_);
00218     layout_ = NULL;
00219   }
00220   if (cr_) {
00221     cairo_destroy(cr_);
00222     cr_ = NULL;
00223   }
00224   if (surface_) {
00225     cairo_surface_destroy(surface_);
00226     surface_ = NULL;
00227   }
00228 }
00229 
00230 void StringRenderer::SetWordUnderlineAttributes(const string& page_text) {
00231   if (underline_start_prob_ == 0) return;
00232   PangoAttrList* attr_list = pango_layout_get_attributes(layout_);
00233 
00234   const char* text = page_text.c_str();
00235   int offset = 0;
00236   TRand rand;
00237   bool started_underline = false;
00238   PangoAttribute* und_attr = nullptr;
00239 
00240   while (offset < page_text.length()) {
00241     offset += SpanUTF8Whitespace(text + offset);
00242     if (offset == page_text.length()) break;
00243 
00244     int word_start = offset;
00245     int word_len = SpanUTF8NotWhitespace(text + offset);
00246     offset += word_len;
00247     if (started_underline) {
00248       // Should we continue the underline to the next word?
00249       if (RandBool(underline_continuation_prob_, &rand)) {
00250         // Continue the current underline to this word.
00251         und_attr->end_index = word_start + word_len;
00252       } else {
00253         // Otherwise end the current underline attribute at the end of the
00254         // previous word.
00255         pango_attr_list_insert(attr_list, und_attr);
00256         started_underline = false;
00257         und_attr = nullptr;
00258       }
00259     }
00260     if (!started_underline && RandBool(underline_start_prob_, &rand)) {
00261       // Start a new underline attribute
00262       und_attr = pango_attr_underline_new(underline_style_);
00263       und_attr->start_index = word_start;
00264       und_attr->end_index = word_start + word_len;
00265       started_underline = true;
00266     }
00267   }
00268   // Finish the current underline attribute at the end of the page.
00269   if (started_underline) {
00270     und_attr->end_index = page_text.length();
00271     pango_attr_list_insert(attr_list, und_attr);
00272   }
00273 }
00274 
00275 // Returns offset in utf8 bytes to first page.
00276 int StringRenderer::FindFirstPageBreakOffset(const char* text,
00277                                              int text_length) {
00278   if (!text_length) return 0;
00279   const int max_height = (page_height_ - 2 * v_margin_);
00280   const int max_width = (page_width_ - 2 * h_margin_);
00281   const int max_layout_height = vertical_text_ ? max_width : max_height;
00282 
00283   UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
00284   const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);
00285   const int kMaxUnicodeBufLength = 15000;
00286   for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i);
00287   int buf_length = it.utf8_data() - text;
00288   tlog(1, "len = %d  buf_len = %d\n", text_length, buf_length);
00289   pango_layout_set_text(layout_, text, buf_length);
00290 
00291   PangoLayoutIter* line_iter = NULL;
00292   { // Fontconfig caches some info here that is not freed before exit.
00293     DISABLE_HEAP_LEAK_CHECK;
00294     line_iter = pango_layout_get_iter(layout_);
00295   }
00296   bool first_page = true;
00297   int page_top = 0;
00298   int offset = buf_length;
00299   do {
00300     // Get bounding box of the current line
00301     PangoRectangle line_ink_rect;
00302     pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, NULL);
00303     pango_extents_to_pixels(&line_ink_rect, NULL);
00304     PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter);
00305     if (first_page) {
00306       page_top = line_ink_rect.y;
00307       first_page = false;
00308     }
00309     int line_bottom = line_ink_rect.y + line_ink_rect.height;
00310     if (line_bottom - page_top > max_layout_height) {
00311       offset = line->start_index;
00312       tlog(1, "Found offset = %d\n", offset);
00313       break;
00314     }
00315   } while (pango_layout_iter_next_line(line_iter));
00316   pango_layout_iter_free(line_iter);
00317   return offset;
00318 }
00319 
00320 const vector<BoxChar*>& StringRenderer::GetBoxes() const {
00321     return boxchars_;
00322 }
00323 
00324 Boxa* StringRenderer::GetPageBoxes() const {
00325     return page_boxes_;
00326 }
00327 
00328 void StringRenderer::RotatePageBoxes(float rotation) {
00329   BoxChar::RotateBoxes(rotation, page_width_ / 2, page_height_ / 2,
00330                        start_box_, boxchars_.size(), &boxchars_);
00331 }
00332 
00333 
00334 void StringRenderer::ClearBoxes() {
00335   for (int i = 0; i < boxchars_.size(); ++i)
00336     delete boxchars_[i];
00337   boxchars_.clear();
00338   boxaDestroy(&page_boxes_);
00339 }
00340 
00341 void StringRenderer::WriteAllBoxes(const string& filename) {
00342   BoxChar::PrepareToWrite(&boxchars_);
00343   BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
00344 }
00345 
00346 // Returns cluster strings in logical order.
00347 bool StringRenderer::GetClusterStrings(vector<string>* cluster_text) {
00348   map<int, string> start_byte_to_text;
00349   PangoLayoutIter* run_iter = pango_layout_get_iter(layout_);
00350   const char* full_text = pango_layout_get_text(layout_);
00351   do {
00352     PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
00353     if (!run) {
00354       // End of line NULL run marker
00355       tlog(2, "Found end of line marker\n");
00356       continue;
00357     }
00358     PangoGlyphItemIter cluster_iter;
00359     gboolean have_cluster;
00360     for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
00361                                                           run, full_text);
00362          have_cluster;
00363          have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
00364       const int start_byte_index = cluster_iter.start_index;
00365       const int end_byte_index = cluster_iter.end_index;
00366       string text = string(full_text + start_byte_index,
00367                            end_byte_index - start_byte_index);
00368       if (IsUTF8Whitespace(text.c_str())) {
00369         tlog(2, "Found whitespace\n");
00370         text = " ";
00371       }
00372       tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index,
00373            end_byte_index, text.c_str());
00374       if (add_ligatures_) {
00375         // Make sure the output box files have ligatured text in case the font
00376         // decided to use an unmapped glyph.
00377         text = LigatureTable::Get()->AddLigatures(text, NULL);
00378       }
00379       start_byte_to_text[start_byte_index] = text;
00380     }
00381   } while (pango_layout_iter_next_run(run_iter));
00382   pango_layout_iter_free(run_iter);
00383 
00384   cluster_text->clear();
00385   for (map<int, string>::const_iterator it = start_byte_to_text.begin();
00386        it != start_byte_to_text.end(); ++it) {
00387     cluster_text->push_back(it->second);
00388   }
00389   return cluster_text->size();
00390 }
00391 
00392 // Merges an array of BoxChars into words based on the identification of
00393 // BoxChars containing the space character as inter-word separators.
00394 //
00395 // Sometime two adjacent characters in the sequence may be detected as lying on
00396 // different lines based on their spatial positions. This may be the result of a
00397 // newline character at end of the last word on a line in the source text, or of
00398 // a discretionary line-break created by Pango at intra-word locations like
00399 // hyphens. When this is detected the word is split at that location into
00400 // multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and
00401 // its bounding box.
00402 static void MergeBoxCharsToWords(vector<BoxChar*>* boxchars) {
00403   vector<BoxChar*> result;
00404   bool started_word = false;
00405   for (int i = 0; i < boxchars->size(); ++i) {
00406     if (boxchars->at(i)->ch() == " " ||
00407         boxchars->at(i)->box() == NULL) {
00408       result.push_back(boxchars->at(i));
00409       boxchars->at(i) = NULL;
00410       started_word = false;
00411       continue;
00412     }
00413 
00414     if (!started_word) {
00415       // Begin new word
00416       started_word = true;
00417       result.push_back(boxchars->at(i));
00418       boxchars->at(i) = NULL;
00419     } else {
00420       BoxChar* last_boxchar = result.back();
00421       // Compute bounding box union
00422       const Box* box = boxchars->at(i)->box();
00423       Box* last_box = last_boxchar->mutable_box();
00424       int left = min(last_box->x, box->x);
00425       int right = max(last_box->x + last_box->w, box->x + box->w);
00426       int top = min(last_box->y, box->y);
00427       int bottom = max(last_box->y + last_box->h, box->y + box->h);
00428       // Conclude that the word was broken to span multiple lines based on the
00429       // size of the merged bounding box in relation to those of the individual
00430       // characters seen so far.
00431       if (right - left > last_box->w + 5 * box->w) {
00432         tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str());
00433         // Insert a fake interword space and start a new word with the current
00434         // boxchar.
00435         result.push_back(new BoxChar(" ", 1));
00436         result.push_back(boxchars->at(i));
00437         boxchars->at(i) = NULL;
00438         continue;
00439       }
00440       // Append to last word
00441       last_boxchar->mutable_ch()->append(boxchars->at(i)->ch());
00442       last_box->x = left;
00443       last_box->w = right - left;
00444       last_box->y = top;
00445       last_box->h = bottom - top;
00446       delete boxchars->at(i);
00447       boxchars->at(i) = NULL;
00448     }
00449   }
00450   boxchars->swap(result);
00451 }
00452 
00453 
00454 void StringRenderer::ComputeClusterBoxes() {
00455   const char* text = pango_layout_get_text(layout_);
00456   PangoLayoutIter* cluster_iter = pango_layout_get_iter(layout_);
00457 
00458   // Do a first pass to store cluster start indexes.
00459   vector<int> cluster_start_indices;
00460   do {
00461     cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
00462     tlog(3, "Added %d\n", cluster_start_indices.back());
00463   } while (pango_layout_iter_next_cluster(cluster_iter));
00464   pango_layout_iter_free(cluster_iter);
00465   cluster_start_indices.push_back(strlen(text));
00466   tlog(3, "Added last index %d\n", cluster_start_indices.back());
00467   // Sort the indices and create a map from start to end indices.
00468   sort(cluster_start_indices.begin(), cluster_start_indices.end());
00469   map<int, int> cluster_start_to_end_index;
00470   for (int i = 0; i < cluster_start_indices.size() - 1; ++i) {
00471     cluster_start_to_end_index[cluster_start_indices[i]]
00472         = cluster_start_indices[i + 1];
00473   }
00474 
00475   // Iterate again to compute cluster boxes and their text with the obtained
00476   // cluster extent information.
00477   cluster_iter = pango_layout_get_iter(layout_);
00478   // Store BoxChars* sorted by their byte start positions
00479   map<int, BoxChar*> start_byte_to_box;
00480   do {
00481     PangoRectangle cluster_rect;
00482     pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect,
00483                                           NULL);
00484     pango_extents_to_pixels(&cluster_rect, NULL);
00485     const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
00486     const int end_byte_index = cluster_start_to_end_index[start_byte_index];
00487     string cluster_text = string(text + start_byte_index,
00488                                  end_byte_index - start_byte_index);
00489     if (cluster_text.size() && cluster_text[0] == '\n') {
00490       tlog(2, "Skipping newlines at start of text.\n");
00491       continue;
00492     }
00493     if (!cluster_rect.width || !cluster_rect.height ||
00494         IsUTF8Whitespace(cluster_text.c_str())) {
00495       tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n",
00496            cluster_rect.width, cluster_rect.height, cluster_text.c_str());
00497       BoxChar* boxchar = new BoxChar(" ", 1);
00498       boxchar->set_page(page_);
00499       start_byte_to_box[start_byte_index] = boxchar;
00500       continue;
00501     }
00502     // Prepare a boxchar for addition at this byte position.
00503     tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n",
00504          cluster_rect.x, cluster_rect.y,
00505          cluster_rect.width, cluster_rect.height,
00506          start_byte_index, end_byte_index,
00507          cluster_text.c_str());
00508     ASSERT_HOST_MSG(cluster_rect.width,
00509                     "cluster_text:%s  start_byte_index:%d\n",
00510                     cluster_text.c_str(), start_byte_index);
00511     ASSERT_HOST_MSG(cluster_rect.height,
00512                     "cluster_text:%s  start_byte_index:%d\n",
00513                     cluster_text.c_str(), start_byte_index);
00514     if (box_padding_) {
00515       cluster_rect.x = max(0, cluster_rect.x - box_padding_);
00516       cluster_rect.width += 2 * box_padding_;
00517       cluster_rect.y = max(0, cluster_rect.y - box_padding_);
00518       cluster_rect.height += 2 * box_padding_;
00519     }
00520     if (add_ligatures_) {
00521       // Make sure the output box files have ligatured text in case the font
00522       // decided to use an unmapped glyph.
00523       cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, NULL);
00524     }
00525     BoxChar* boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());
00526     boxchar->set_page(page_);
00527     boxchar->AddBox(cluster_rect.x, cluster_rect.y,
00528                     cluster_rect.width, cluster_rect.height);
00529     start_byte_to_box[start_byte_index] = boxchar;
00530   } while (pango_layout_iter_next_cluster(cluster_iter));
00531   pango_layout_iter_free(cluster_iter);
00532 
00533   // There is a subtle bug in the cluster text reported by the PangoLayoutIter
00534   // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work
00535   // around this, we use text reported using the PangoGlyphIter which is
00536   // accurate.
00537   // TODO(ranjith): Revisit whether this is still needed in newer versions of
00538   // pango.
00539   vector<string> cluster_text;
00540   if (GetClusterStrings(&cluster_text)) {
00541     ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
00542     int ind = 0;
00543     for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
00544          it != start_byte_to_box.end(); ++it, ++ind) {
00545       it->second->mutable_ch()->swap(cluster_text[ind]);
00546     }
00547   }
00548 
00549   // Append to the boxchars list in byte order.
00550   vector<BoxChar*> page_boxchars;
00551   page_boxchars.reserve(start_byte_to_box.size());
00552   string last_ch;
00553   for (map<int, BoxChar*>::const_iterator it = start_byte_to_box.begin();
00554        it != start_byte_to_box.end(); ++it) {
00555     if (it->second->ch() == kWordJoinerUTF8) {
00556       // Skip zero-width joiner characters (ZWJs) here.
00557       delete it->second;
00558     } else {
00559       page_boxchars.push_back(it->second);
00560     }
00561   }
00562   CorrectBoxPositionsToLayout(&page_boxchars);
00563 
00564   if (render_fullwidth_latin_) {
00565     for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
00566          it != start_byte_to_box.end(); ++it) {
00567       // Convert fullwidth Latin characters to their halfwidth forms.
00568       string half(ConvertFullwidthLatinToBasicLatin(it->second->ch()));
00569       it->second->mutable_ch()->swap(half);
00570     }
00571   }
00572 
00573   // Merge the character boxes into word boxes if we are rendering n-grams.
00574   if (output_word_boxes_) {
00575     MergeBoxCharsToWords(&page_boxchars);
00576   }
00577 
00578   boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());
00579 
00580   // Compute the page bounding box
00581   Box* page_box = NULL;
00582   Boxa* all_boxes = NULL;
00583   for (int i = 0; i < page_boxchars.size(); ++i) {
00584     if (page_boxchars[i]->box() == NULL) continue;
00585     if (all_boxes == NULL)
00586       all_boxes = boxaCreate(0);
00587     boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE);
00588   }
00589   boxaGetExtent(all_boxes, NULL, NULL, &page_box);
00590   boxaDestroy(&all_boxes);
00591   if (page_boxes_ == NULL)
00592     page_boxes_ = boxaCreate(0);
00593   boxaAddBox(page_boxes_, page_box, L_INSERT);
00594 }
00595 
00596 
00597 void StringRenderer::CorrectBoxPositionsToLayout(vector<BoxChar*>* boxchars) {
00598   if (vertical_text_) {
00599     const double rotation = - pango_gravity_to_rotation(
00600         pango_context_get_base_gravity(pango_layout_get_context(layout_)));
00601     BoxChar::TranslateBoxes(page_width_ - h_margin_, v_margin_, boxchars);
00602     BoxChar::RotateBoxes(rotation, page_width_ - h_margin_, v_margin_,
00603                          0, boxchars->size(), boxchars);
00604   } else {
00605     BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars);
00606   }
00607 }
00608 
00609 int StringRenderer::StripUnrenderableWords(string* utf8_text) const {
00610   string output_text;
00611   const char* text = utf8_text->c_str();
00612   int offset = 0;
00613   int num_dropped = 0;
00614   while (offset < utf8_text->length()) {
00615     int space_len = SpanUTF8Whitespace(text + offset);
00616     output_text.append(text + offset, space_len);
00617     offset += space_len;
00618     if (offset == utf8_text->length()) break;
00619 
00620     int word_len = SpanUTF8NotWhitespace(text + offset);
00621     if (font_.CanRenderString(text + offset, word_len)) {
00622       output_text.append(text + offset, word_len);
00623     } else {
00624       ++num_dropped;
00625     }
00626     offset += word_len;
00627   }
00628   utf8_text->swap(output_text);
00629 
00630   if (num_dropped > 0) {
00631     tprintf("Stripped %d unrenderable words\n", num_dropped);
00632   }
00633   return num_dropped;
00634 }
00635 
00636 int StringRenderer::RenderToGrayscaleImage(const char* text, int text_length,
00637                                            Pix** pix) {
00638   Pix *orig_pix = NULL;
00639   int offset = RenderToImage(text, text_length, &orig_pix);
00640   if (orig_pix) {
00641     *pix = pixConvertTo8(orig_pix, false);
00642     pixDestroy(&orig_pix);
00643   }
00644   return offset;
00645 }
00646 
00647 int StringRenderer::RenderToBinaryImage(const char* text, int text_length,
00648                                         int threshold, Pix** pix) {
00649   Pix *orig_pix = NULL;
00650   int offset = RenderToImage(text, text_length, &orig_pix);
00651   if (orig_pix) {
00652     Pix* gray_pix = pixConvertTo8(orig_pix, false);
00653     pixDestroy(&orig_pix);
00654     *pix = pixThresholdToBinary(gray_pix, threshold);
00655     pixDestroy(&gray_pix);
00656   } else {
00657     *pix = orig_pix;
00658   }
00659   return offset;
00660 }
00661 
00662 // Add word joiner (WJ) characters between adjacent non-space characters except
00663 // immediately before a combiner.
00664 /* static */
00665 string StringRenderer::InsertWordJoiners(const string& text) {
00666   string out_str;
00667   const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(),
00668                                                       text.length());
00669   for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length());
00670        it != it_end; ++it) {
00671     // Add the symbol to the output string.
00672     out_str.append(it.utf8_data(), it.utf8_len());
00673     // Check the next symbol.
00674     UNICHAR::const_iterator next_it = it;
00675     ++next_it;
00676     bool next_char_is_boundary = (next_it == it_end || *next_it == ' ');
00677     bool next_char_is_combiner = (next_it == it_end) ?
00678         false : IsCombiner(*next_it);
00679     if (*it != ' ' && *it != '\n' && !next_char_is_boundary &&
00680         !next_char_is_combiner) {
00681       out_str += kWordJoinerUTF8;
00682     }
00683   }
00684   return out_str;
00685 }
00686 
00687 // Convert halfwidth Basic Latin characters to their fullwidth forms.
00688 string StringRenderer::ConvertBasicLatinToFullwidthLatin(const string& str) {
00689   string full_str;
00690   const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(),
00691                                                       str.length());
00692   for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
00693        it != it_end; ++it) {
00694     // Convert printable and non-space 7-bit ASCII characters to
00695     // their fullwidth forms.
00696     if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) {
00697       // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII.
00698       char32 full_char = *it + 0xFEE0;
00699       full_str.append(EncodeAsUTF8(full_char));
00700     } else {
00701       full_str.append(it.utf8_data(), it.utf8_len());
00702     }
00703   }
00704   return full_str;
00705 }
00706 
00707 // Convert fullwidth Latin characters to their halfwidth forms.
00708 string StringRenderer::ConvertFullwidthLatinToBasicLatin(const string& str) {
00709   string half_str;
00710   UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
00711   for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
00712        it != it_end; ++it) {
00713     char32 half_char = FullwidthToHalfwidth(*it);
00714     // Convert fullwidth Latin characters to their halfwidth forms
00715     // only if halfwidth forms are printable and non-space 7-bit ASCII.
00716     if (IsInterchangeValid7BitAscii(half_char) &&
00717         isprint(half_char) && !isspace(half_char)) {
00718       half_str.append(EncodeAsUTF8(half_char));
00719     } else {
00720       half_str.append(it.utf8_data(), it.utf8_len());
00721     }
00722   }
00723   return half_str;
00724 }
00725 
00726 // Returns offset to end of text substring rendered in this method.
00727 int StringRenderer::RenderToImage(const char* text, int text_length,
00728                                   Pix** pix) {
00729   if (pix && *pix) pixDestroy(pix);
00730   InitPangoCairo();
00731 
00732   const int page_offset = FindFirstPageBreakOffset(text, text_length);
00733   if (!page_offset) {
00734     return 0;
00735   }
00736   start_box_ = boxchars_.size();
00737 
00738   if (!vertical_text_) {
00739     // Translate by the specified margin
00740     cairo_translate(cr_, h_margin_, v_margin_);
00741   } else {
00742     // Vertical text rendering is achieved by a two-step process of first
00743     // performing regular horizontal layout with character orientation set to
00744     // EAST, and then translating and rotating the layout before rendering onto
00745     // the desired image surface. The settings required for the former step are
00746     // done within InitPangoCairo().
00747     //
00748     // Translate to the top-right margin of page
00749     cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
00750     // Rotate the layout
00751     double rotation = - pango_gravity_to_rotation(
00752         pango_context_get_base_gravity(pango_layout_get_context(layout_)));
00753     tlog(2, "Rotating by %f radians\n", rotation);
00754     cairo_rotate(cr_, rotation);
00755     pango_cairo_update_layout(cr_, layout_);
00756   }
00757   string page_text(text, page_offset);
00758   if (render_fullwidth_latin_) {
00759     // Convert Basic Latin to their fullwidth forms.
00760     page_text = ConvertBasicLatinToFullwidthLatin(page_text);
00761   }
00762   if (strip_unrenderable_words_) {
00763     StripUnrenderableWords(&page_text);
00764   }
00765   if (drop_uncovered_chars_ &&
00766       !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
00767     int num_dropped = font_.DropUncoveredChars(&page_text);
00768     if (num_dropped) {
00769       tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped);
00770     }
00771   }
00772   if (add_ligatures_) {
00773     // Add ligatures wherever possible, including custom ligatures.
00774     page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
00775   }
00776   if (underline_start_prob_ > 0) {
00777     SetWordUnderlineAttributes(page_text);
00778   }
00779 
00780   pango_layout_set_text(layout_, page_text.c_str(), page_text.length());
00781 
00782   if (pix) {
00783     // Set a white background for the target image surface.
00784     cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0);  // sets drawing colour to white
00785     // Fill the surface with the active colour (if you don't do this, you will
00786     // be given a surface with a transparent background to draw on)
00787     cairo_paint(cr_);
00788     // Set the ink color to black
00789     cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
00790     // If the target surface or transformation properties of the cairo instance
00791     // have changed, update the pango layout to reflect this
00792     pango_cairo_update_layout(cr_, layout_);
00793     {
00794       DISABLE_HEAP_LEAK_CHECK;  // for Fontconfig
00795       // Draw the pango layout onto the cairo surface
00796       pango_cairo_show_layout(cr_, layout_);
00797     }
00798     *pix = CairoARGB32ToPixFormat(surface_);
00799   }
00800   ComputeClusterBoxes();
00801   FreePangoCairo();
00802   // Update internal state variables.
00803   ++page_;
00804   return page_offset;
00805 }
00806 
00807 // Render a string to an image, returning it as an 8 bit pix.  Behaves as
00808 // RenderString, except that it ignores the font set at construction and works
00809 // through all the fonts, returning 0 until they are exhausted, at which point
00810 // it returns the value it should have returned all along, but no pix this time.
00811 // Fonts that don't contain a given proportion of the characters in the string
00812 // get skipped.
00813 // Fonts that work each get rendered and the font name gets added
00814 // to the image.
00815 // NOTE that no boxes are produced by this function.
00816 //
00817 // Example usage: To render a null terminated char-array "txt"
00818 //
00819 // int offset = 0;
00820 // do {
00821 //   Pix *pix;
00822 //   offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset,
00823 //                                            strlen(txt + offset), NULL, &pix);
00824 //   ...
00825 // } while (offset < strlen(text));
00826 //
00827 int StringRenderer::RenderAllFontsToImage(double min_coverage,
00828                                           const char* text, int text_length,
00829                                           string* font_used, Pix** image) {
00830   *image = NULL;
00831   // Select a suitable font to render the title with.
00832   const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
00833   string title_font;
00834   if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate),
00835                              &title_font, NULL)) {
00836     tprintf("WARNING: Could not find a font to render image title with!\n");
00837     title_font = "Arial";
00838   }
00839   title_font += " 8";
00840   tlog(1, "Selected title font: %s\n", title_font.c_str());
00841   if (font_used) font_used->clear();
00842 
00843   string orig_font = font_.DescriptionName();
00844   if (char_map_.empty()) {
00845     total_chars_ = 0;
00846     // Fill the hash table and use that for computing which fonts to use.
00847     for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
00848          it != UNICHAR::end(text, text_length); ++it) {
00849       ++total_chars_;
00850       ++char_map_[*it];
00851     }
00852     tprintf("Total chars = %d\n", total_chars_);
00853   }
00854   const vector<string>& all_fonts = FontUtils::ListAvailableFonts();
00855   for (int i = font_index_; i < all_fonts.size(); ++i) {
00856     ++font_index_;
00857     int raw_score = 0;
00858     int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score,
00859                                         NULL);
00860     if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) {
00861       set_font(all_fonts[i]);
00862       int offset = RenderToBinaryImage(text, text_length, 128, image);
00863       ClearBoxes();  // Get rid of them as they are garbage.
00864       const int kMaxTitleLength = 1024;
00865       char title[kMaxTitleLength];
00866       snprintf(title, kMaxTitleLength, kTitleTemplate,
00867                all_fonts[i].c_str(), ok_chars,
00868                100.0 * ok_chars / total_chars_, raw_score,
00869                100.0 * raw_score / char_map_.size());
00870       tprintf("%s\n", title);
00871       // This is a good font! Store the offset to return once we've tried all
00872       // the fonts.
00873       if (offset) {
00874         last_offset_ = offset;
00875         if (font_used) *font_used = all_fonts[i];
00876       }
00877       // Add the font to the image.
00878       set_font(title_font);
00879       v_margin_ /= 8;
00880       Pix* title_image = NULL;
00881       RenderToBinaryImage(title, strlen(title), 128, &title_image);
00882       pixOr(*image, *image, title_image);
00883       pixDestroy(&title_image);
00884 
00885       v_margin_ *= 8;
00886       set_font(orig_font);
00887       // We return the real offset only after cycling through the list of fonts.
00888       return 0;
00889     } else {
00890       tprintf("Font %s failed with %d hits = %.2f%%\n",
00891               all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
00892     }
00893   }
00894   font_index_ = 0;
00895   char_map_.clear();
00896   return last_offset_ == 0 ? -1 : last_offset_;
00897 }
00898 
00899 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines