|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: text2image.cpp 00003 * Description: Program to generate OCR training pages. Given a text file it 00004 * outputs an image with a given font and degradation. 00005 * 00006 * Note that since the results depend on the fonts available on 00007 * your system, running the code on a different machine, or 00008 * different OS, or even at a different time on the same machine, 00009 * may produce different fonts even if --font is given explicitly. 00010 * To see names of available fonts, use --list_available_fonts with 00011 * the appropriate --fonts_dir path. 00012 * Specifying --use_only_legacy_fonts will restrict the available 00013 * fonts to those listed in legacy_fonts.h 00014 * 00015 * Authors: Ranjith Unnikrishnan, Ray Smith 00016 * Created: Tue Nov 19 2013 00017 * 00018 * (C) Copyright 2013, Google Inc. 00019 * Licensed under the Apache License, Version 2.0 (the "License"); 00020 * you may not use this file except in compliance with the License. 00021 * You may obtain a copy of the License at 00022 * http://www.apache.org/licenses/LICENSE-2.0 00023 * Unless required by applicable law or agreed to in writing, software 00024 * distributed under the License is distributed on an "AS IS" BASIS, 00025 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00026 * See the License for the specific language governing permissions and 00027 * limitations under the License. 00028 * 00029 **********************************************************************/ 00030 00031 #include <stdlib.h> 00032 #include <string.h> 00033 #include <algorithm> 00034 #include <iostream> 00035 #include <map> 00036 #include <string> 00037 #include <utility> 00038 #include <vector> 00039 00040 #include "allheaders.h" // from leptonica 00041 #include "boxchar.h" 00042 #include "commandlineflags.h" 00043 #include "degradeimage.h" 00044 #include "errcode.h" 00045 #include "fileio.h" 00046 #include "helpers.h" 00047 #include "normstrngs.h" 00048 #include "stringrenderer.h" 00049 #include "tlog.h" 00050 #include "unicharset.h" 00051 #include "util.h" 00052 00053 #ifdef USE_STD_NAMESPACE 00054 using std::make_pair; 00055 using std::map; 00056 using std::pair; 00057 #endif 00058 00059 // A number with which to initialize the random number generator. 00060 const int kRandomSeed = 0x18273645; 00061 00062 // The text input file. 00063 STRING_PARAM_FLAG(text, "", "File name of text input to process"); 00064 00065 // The text output file. 00066 STRING_PARAM_FLAG(outputbase, "", "Basename for output image/box file"); 00067 00068 // Degrade the rendered image to mimic scanner quality. 00069 BOOL_PARAM_FLAG(degrade_image, true, 00070 "Degrade rendered image with speckle noise, dilation/erosion " 00071 "and rotation"); 00072 00073 // Degradation to apply to the image. 00074 INT_PARAM_FLAG(exposure, 0, "Exposure level in photocopier"); 00075 00076 // Output image resolution. 00077 INT_PARAM_FLAG(resolution, 300, "Pixels per inch"); 00078 00079 // Width of output image (in pixels). 00080 INT_PARAM_FLAG(xsize, 3600, "Width of output image"); 00081 00082 // Max height of output image (in pixels). 00083 INT_PARAM_FLAG(ysize, 4800, "Height of output image"); 00084 00085 // Margin around text (in pixels). 00086 INT_PARAM_FLAG(margin, 100, "Margin round edges of image"); 00087 00088 // Size of text (in points). 00089 INT_PARAM_FLAG(ptsize, 12, "Size of printed text"); 00090 00091 // Inter-character space (in ems). 00092 DOUBLE_PARAM_FLAG(char_spacing, 0, "Inter-character space in ems"); 00093 00094 // Sets the probability (value in [0, 1]) of starting to render a word with an 00095 // underline. Words are assumed to be space-delimited. 00096 DOUBLE_PARAM_FLAG(underline_start_prob, 0, 00097 "Fraction of words to underline (value in [0,1])"); 00098 // Set the probability (value in [0, 1]) of continuing a started underline to 00099 // the next word. 00100 DOUBLE_PARAM_FLAG(underline_continuation_prob, 0, 00101 "Fraction of words to underline (value in [0,1])"); 00102 00103 // Inter-line space (in pixels). 00104 INT_PARAM_FLAG(leading, 12, "Inter-line space (in pixels)"); 00105 00106 // Layout and glyph orientation on rendering. 00107 STRING_PARAM_FLAG(writing_mode, "horizontal", 00108 "Specify one of the following writing" 00109 " modes.\n" 00110 "'horizontal' : Render regular horizontal text. (default)\n" 00111 "'vertical' : Render vertical text. Glyph orientation is" 00112 " selected by Pango.\n" 00113 "'vertical-upright' : Render vertical text. Glyph " 00114 " orientation is set to be upright."); 00115 00116 INT_PARAM_FLAG(box_padding, 0, "Padding around produced bounding boxes"); 00117 00118 BOOL_PARAM_FLAG(strip_unrenderable_words, true, 00119 "Remove unrenderable words from source text"); 00120 00121 // Font name. 00122 STRING_PARAM_FLAG(font, "Arial", "Font description name to use"); 00123 00124 BOOL_PARAM_FLAG(ligatures, false, 00125 "Rebuild and render ligatures"); 00126 00127 BOOL_PARAM_FLAG(find_fonts, false, 00128 "Search for all fonts that can render the text"); 00129 BOOL_PARAM_FLAG(render_per_font, true, 00130 "If find_fonts==true, render each font to its own image. " 00131 "Image filenames are of the form output_name.font_name.tif"); 00132 DOUBLE_PARAM_FLAG(min_coverage, 1.0, 00133 "If find_fonts==true, the minimum coverage the font has of " 00134 "the characters in the text file to include it, between " 00135 "0 and 1."); 00136 00137 BOOL_PARAM_FLAG(list_available_fonts, false, "List available fonts and quit."); 00138 00139 BOOL_PARAM_FLAG(render_ngrams, false, "Put each space-separated entity from the" 00140 " input file into one bounding box. The ngrams in the input" 00141 " file will be randomly permuted before rendering (so that" 00142 " there is sufficient variety of characters on each line)."); 00143 00144 BOOL_PARAM_FLAG(output_word_boxes, false, 00145 "Output word bounding boxes instead of character boxes. " 00146 "This is used for Cube training, and implied by " 00147 "--render_ngrams."); 00148 00149 STRING_PARAM_FLAG(unicharset_file, "", 00150 "File with characters in the unicharset. If --render_ngrams" 00151 " is true and --unicharset_file is specified, ngrams with" 00152 " characters that are not in unicharset will be omitted"); 00153 00154 BOOL_PARAM_FLAG(bidirectional_rotation, false, 00155 "Rotate the generated characters both ways."); 00156 00157 BOOL_PARAM_FLAG(only_extract_font_properties, false, 00158 "Assumes that the input file contains a list of ngrams. Renders" 00159 " each ngram, extracts spacing properties and records them in" 00160 " output_base/[font_name].fontinfo file."); 00161 00162 // Use these flags to output zero-padded, square individual character images 00163 BOOL_PARAM_FLAG(output_individual_glyph_images, false, 00164 "If true also outputs individual character images"); 00165 INT_PARAM_FLAG(glyph_resized_size, 0, 00166 "Each glyph is square with this side length in pixels"); 00167 INT_PARAM_FLAG(glyph_num_border_pixels_to_pad, 0, 00168 "Final_size=glyph_resized_size+2*glyph_num_border_pixels_to_pad"); 00169 00170 namespace tesseract { 00171 00172 struct SpacingProperties { 00173 SpacingProperties() : x_gap_before(0), x_gap_after(0) {} 00174 SpacingProperties(int b, int a) : x_gap_before(b), x_gap_after(a) {} 00175 // These values are obtained from FT_Glyph_Metrics struct 00176 // used by the FreeType font engine. 00177 int x_gap_before; // horizontal x bearing 00178 int x_gap_after; // horizontal advance - x_gap_before - width 00179 map<string, int> kerned_x_gaps; 00180 }; 00181 00182 static bool IsWhitespaceBox(const BoxChar* boxchar) { 00183 return (boxchar->box() == NULL || 00184 SpanUTF8Whitespace(boxchar->ch().c_str())); 00185 } 00186 00187 static string StringReplace(const string& in, 00188 const string& oldsub, const string& newsub) { 00189 string out; 00190 int start_pos = 0; 00191 do { 00192 int pos = in.find(oldsub, start_pos); 00193 if (pos == string::npos) break; 00194 out.append(in.data() + start_pos, pos - start_pos); 00195 out.append(newsub.data(), newsub.length()); 00196 start_pos = pos + oldsub.length(); 00197 } while (true); 00198 out.append(in.data() + start_pos, in.length() - start_pos); 00199 return out; 00200 } 00201 00202 // Assumes that each word (whitespace-separated entity) in text is a bigram. 00203 // Renders the bigrams and calls FontInfo::GetSpacingProperties() to 00204 // obtain spacing information. Produces the output .fontinfo file with a line 00205 // per unichar of the form: 00206 // unichar space_before space_after kerned1 kerned_space1 kerned2 ... 00207 // Fox example, if unichar "A" has spacing of 0 pixels before and -1 pixels 00208 // after, is kerned with "V" resulting in spacing of "AV" to be -7 and kerned 00209 // with "T", such that "AT" has spacing of -5, the entry/line for unichar "A" 00210 // in .fontinfo file will be: 00211 // A 0 -1 T -5 V -7 00212 void ExtractFontProperties(const string &utf8_text, 00213 StringRenderer *render, 00214 const string &output_base) { 00215 map<string, SpacingProperties> spacing_map; 00216 map<string, SpacingProperties>::iterator spacing_map_it0; 00217 map<string, SpacingProperties>::iterator spacing_map_it1; 00218 int x_bearing, x_advance; 00219 int len = utf8_text.length(); 00220 int offset = 0; 00221 const char* text = utf8_text.c_str(); 00222 while (offset < len) { 00223 offset += render->RenderToImage(text + offset, strlen(text + offset), NULL); 00224 const vector<BoxChar*> &boxes = render->GetBoxes(); 00225 00226 // If the page break split a bigram, correct the offset so we try the bigram 00227 // on the next iteration. 00228 if (boxes.size() > 2 && !IsWhitespaceBox(boxes[boxes.size() - 1]) && 00229 IsWhitespaceBox(boxes[boxes.size() - 2])) { 00230 if (boxes.size() > 3) { 00231 tprintf("WARNING: Adjusting to bad page break after '%s%s'\n", 00232 boxes[boxes.size() - 4]->ch().c_str(), 00233 boxes[boxes.size() - 3]->ch().c_str()); 00234 } 00235 offset -= boxes[boxes.size() - 1]->ch().size(); 00236 } 00237 00238 for (int b = 0; b < boxes.size(); b += 2) { 00239 while (b < boxes.size() && IsWhitespaceBox(boxes[b])) ++b; 00240 if (b + 1 >= boxes.size()) break; 00241 const string &ch0 = boxes[b]->ch(); 00242 // We encountered a ligature. This happens in at least two scenarios: 00243 // One is when the rendered bigram forms a grapheme cluster (eg. the 00244 // second character in the bigram is a combining vowel), in which case we 00245 // correctly output only one bounding box. 00246 // A second far less frequent case is when caused some fonts like 'DejaVu 00247 // Sans Ultra-Light' force Pango to render a ligatured character even if 00248 // the input consists of the separated characters. NOTE(ranjith): As per 00249 // behdad@ this is not currently controllable at the level of the Pango 00250 // API. 00251 // Safeguard against these cases here by just skipping the bigram. 00252 if (IsWhitespaceBox(boxes[b+1])) { 00253 continue; 00254 } 00255 int xgap = (boxes[b+1]->box()->x - 00256 (boxes[b]->box()->x + boxes[b]->box()->w)); 00257 spacing_map_it0 = spacing_map.find(ch0); 00258 int ok_count = 0; 00259 if (spacing_map_it0 == spacing_map.end() && 00260 render->font().GetSpacingProperties(ch0, &x_bearing, &x_advance)) { 00261 spacing_map[ch0] = SpacingProperties( 00262 x_bearing, x_advance - x_bearing - boxes[b]->box()->w); 00263 spacing_map_it0 = spacing_map.find(ch0); 00264 ++ok_count; 00265 } 00266 const string &ch1 = boxes[b+1]->ch(); 00267 tlog(3, "%s%s\n", ch0.c_str(), ch1.c_str()); 00268 spacing_map_it1 = spacing_map.find(ch1); 00269 if (spacing_map_it1 == spacing_map.end() && 00270 render->font().GetSpacingProperties(ch1, &x_bearing, &x_advance)) { 00271 spacing_map[ch1] = SpacingProperties( 00272 x_bearing, x_advance - x_bearing - boxes[b+1]->box()->w); 00273 spacing_map_it1 = spacing_map.find(ch1); 00274 ++ok_count; 00275 } 00276 if (ok_count == 2 && xgap != (spacing_map_it0->second.x_gap_after + 00277 spacing_map_it1->second.x_gap_before)) { 00278 spacing_map_it0->second.kerned_x_gaps[ch1] = xgap; 00279 } 00280 } 00281 render->ClearBoxes(); 00282 } 00283 string output_string; 00284 const int kBufSize = 1024; 00285 char buf[kBufSize]; 00286 snprintf(buf, kBufSize, "%d\n", static_cast<int>(spacing_map.size())); 00287 output_string.append(buf); 00288 map<string, SpacingProperties>::const_iterator spacing_map_it; 00289 for (spacing_map_it = spacing_map.begin(); 00290 spacing_map_it != spacing_map.end(); ++spacing_map_it) { 00291 snprintf(buf, kBufSize, 00292 "%s %d %d %d", spacing_map_it->first.c_str(), 00293 spacing_map_it->second.x_gap_before, 00294 spacing_map_it->second.x_gap_after, 00295 static_cast<int>(spacing_map_it->second.kerned_x_gaps.size())); 00296 output_string.append(buf); 00297 map<string, int>::const_iterator kern_it; 00298 for (kern_it = spacing_map_it->second.kerned_x_gaps.begin(); 00299 kern_it != spacing_map_it->second.kerned_x_gaps.end(); ++kern_it) { 00300 snprintf(buf, kBufSize, 00301 " %s %d", kern_it->first.c_str(), kern_it->second); 00302 output_string.append(buf); 00303 } 00304 output_string.append("\n"); 00305 } 00306 File::WriteStringToFileOrDie(output_string, output_base + ".fontinfo"); 00307 } 00308 00309 bool MakeIndividualGlyphs(Pix* pix, 00310 const vector<BoxChar*>& vbox, 00311 const int input_tiff_page) { 00312 // If checks fail, return false without exiting text2image 00313 if (!pix) { 00314 tprintf("ERROR: MakeIndividualGlyphs(): Input Pix* is NULL\n"); 00315 return false; 00316 } else if (FLAGS_glyph_resized_size <= 0) { 00317 tprintf("ERROR: --glyph_resized_size must be positive\n"); 00318 return false; 00319 } else if (FLAGS_glyph_num_border_pixels_to_pad < 0) { 00320 tprintf("ERROR: --glyph_num_border_pixels_to_pad must be 0 or positive\n"); 00321 return false; 00322 } 00323 00324 const int n_boxes = vbox.size(); 00325 int n_boxes_saved = 0; 00326 int current_tiff_page = 0; 00327 int y_previous = 0; 00328 static int glyph_count = 0; 00329 for (int i = 0; i < n_boxes; i++) { 00330 // Get one bounding box 00331 Box* b = vbox[i]->mutable_box(); 00332 if (!b) continue; 00333 const int x = b->x; 00334 const int y = b->y; 00335 const int w = b->w; 00336 const int h = b->h; 00337 // Check present tiff page (for multipage tiff) 00338 if (y < y_previous-pixGetHeight(pix)/10) { 00339 tprintf("ERROR: Wrap-around encountered, at i=%d\n", i); 00340 current_tiff_page++; 00341 } 00342 if (current_tiff_page < input_tiff_page) continue; 00343 else if (current_tiff_page > input_tiff_page) break; 00344 // Check box validity 00345 if (x < 0 || y < 0 || 00346 (x+w-1) >= pixGetWidth(pix) || 00347 (y+h-1) >= pixGetHeight(pix)) { 00348 tprintf("ERROR: MakeIndividualGlyphs(): Index out of range, at i=%d" 00349 " (x=%d, y=%d, w=%d, h=%d\n)", i, x, y, w, h); 00350 continue; 00351 } else if (w < FLAGS_glyph_num_border_pixels_to_pad && 00352 h < FLAGS_glyph_num_border_pixels_to_pad) { 00353 tprintf("ERROR: Input image too small to be a character, at i=%d\n", i); 00354 continue; 00355 } 00356 // Crop the boxed character 00357 Pix* pix_glyph = pixClipRectangle(pix, b, NULL); 00358 if (!pix_glyph) { 00359 tprintf("ERROR: MakeIndividualGlyphs(): Failed to clip, at i=%d\n", i); 00360 continue; 00361 } 00362 // Resize to square 00363 Pix* pix_glyph_sq = pixScaleToSize(pix_glyph, 00364 FLAGS_glyph_resized_size, 00365 FLAGS_glyph_resized_size); 00366 if (!pix_glyph_sq) { 00367 tprintf("ERROR: MakeIndividualGlyphs(): Failed to resize, at i=%d\n", i); 00368 continue; 00369 } 00370 // Zero-pad 00371 Pix* pix_glyph_sq_pad = pixAddBorder(pix_glyph_sq, 00372 FLAGS_glyph_num_border_pixels_to_pad, 00373 0); 00374 if (!pix_glyph_sq_pad) { 00375 tprintf("ERROR: MakeIndividualGlyphs(): Failed to zero-pad, at i=%d\n", 00376 i); 00377 continue; 00378 } 00379 // Write out 00380 Pix* pix_glyph_sq_pad_8 = pixConvertTo8(pix_glyph_sq_pad, false); 00381 char filename[1024]; 00382 snprintf(filename, 1024, "%s_%d.jpg", FLAGS_outputbase.c_str(), 00383 glyph_count++); 00384 if (pixWriteJpeg(filename, pix_glyph_sq_pad_8, 100, 0)) { 00385 tprintf("ERROR: MakeIndividualGlyphs(): Failed to write JPEG to %s," 00386 " at i=%d\n", filename, i); 00387 continue; 00388 } 00389 00390 pixDestroy(&pix_glyph); 00391 pixDestroy(&pix_glyph_sq); 00392 pixDestroy(&pix_glyph_sq_pad); 00393 pixDestroy(&pix_glyph_sq_pad_8); 00394 n_boxes_saved++; 00395 y_previous = y; 00396 } 00397 if (n_boxes_saved == 0) { 00398 return false; 00399 } else { 00400 tprintf("Total number of characters saved = %d\n", n_boxes_saved); 00401 return true; 00402 } 00403 } 00404 } // namespace tesseract 00405 00406 using tesseract::DegradeImage; 00407 using tesseract::ExtractFontProperties; 00408 using tesseract::File; 00409 using tesseract::FontUtils; 00410 using tesseract::SpanUTF8NotWhitespace; 00411 using tesseract::SpanUTF8Whitespace; 00412 using tesseract::StringRenderer; 00413 00414 int main(int argc, char** argv) { 00415 tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); 00416 00417 if (FLAGS_list_available_fonts) { 00418 const vector<string>& all_fonts = FontUtils::ListAvailableFonts(); 00419 for (int i = 0; i < all_fonts.size(); ++i) { 00420 tprintf("%3d: %s\n", i, all_fonts[i].c_str()); 00421 ASSERT_HOST_MSG(FontUtils::IsAvailableFont(all_fonts[i].c_str()), 00422 "Font %s is unrecognized.\n", all_fonts[i].c_str()); 00423 } 00424 return EXIT_SUCCESS; 00425 } 00426 // Check validity of input flags. 00427 ASSERT_HOST_MSG(!FLAGS_text.empty(), "Text file missing!\n"); 00428 ASSERT_HOST_MSG(!FLAGS_outputbase.empty(), "Output file missing!\n"); 00429 ASSERT_HOST_MSG(FLAGS_render_ngrams || FLAGS_unicharset_file.empty(), 00430 "Use --unicharset_file only if --render_ngrams is set.\n"); 00431 00432 if (!FLAGS_find_fonts && !FontUtils::IsAvailableFont(FLAGS_font.c_str())) { 00433 string pango_name; 00434 if (!FontUtils::IsAvailableFont(FLAGS_font.c_str(), &pango_name)) { 00435 tprintf("Could not find font named %s. Pango suggested font %s\n", 00436 FLAGS_font.c_str(), pango_name.c_str()); 00437 TLOG_FATAL("Please correct --font arg."); 00438 } 00439 } 00440 00441 if (FLAGS_render_ngrams) 00442 FLAGS_output_word_boxes = true; 00443 00444 char font_desc_name[1024]; 00445 snprintf(font_desc_name, 1024, "%s %d", FLAGS_font.c_str(), 00446 static_cast<int>(FLAGS_ptsize)); 00447 StringRenderer render(font_desc_name, FLAGS_xsize, FLAGS_ysize); 00448 render.set_add_ligatures(FLAGS_ligatures); 00449 render.set_leading(FLAGS_leading); 00450 render.set_resolution(FLAGS_resolution); 00451 render.set_char_spacing(FLAGS_char_spacing * FLAGS_ptsize); 00452 render.set_h_margin(FLAGS_margin); 00453 render.set_v_margin(FLAGS_margin); 00454 render.set_output_word_boxes(FLAGS_output_word_boxes); 00455 render.set_box_padding(FLAGS_box_padding); 00456 render.set_strip_unrenderable_words(FLAGS_strip_unrenderable_words); 00457 render.set_underline_start_prob(FLAGS_underline_start_prob); 00458 render.set_underline_continuation_prob(FLAGS_underline_continuation_prob); 00459 00460 // Set text rendering orientation and their forms. 00461 if (FLAGS_writing_mode == "horizontal") { 00462 // Render regular horizontal text (default). 00463 render.set_vertical_text(false); 00464 render.set_gravity_hint_strong(false); 00465 render.set_render_fullwidth_latin(false); 00466 } else if (FLAGS_writing_mode == "vertical") { 00467 // Render vertical text. Glyph orientation is selected by Pango. 00468 render.set_vertical_text(true); 00469 render.set_gravity_hint_strong(false); 00470 render.set_render_fullwidth_latin(false); 00471 } else if (FLAGS_writing_mode == "vertical-upright") { 00472 // Render vertical text. Glyph orientation is set to be upright. 00473 // Also Basic Latin characters are converted to their fullwidth forms 00474 // on rendering, since fullwidth Latin characters are well designed to fit 00475 // vertical text lines, while .box files store halfwidth Basic Latin 00476 // unichars. 00477 render.set_vertical_text(true); 00478 render.set_gravity_hint_strong(true); 00479 render.set_render_fullwidth_latin(true); 00480 } else { 00481 TLOG_FATAL("Invalid writing mode : %s\n", FLAGS_writing_mode.c_str()); 00482 } 00483 00484 string src_utf8; 00485 // This c_str is NOT redundant! 00486 File::ReadFileToStringOrDie(FLAGS_text.c_str(), &src_utf8); 00487 00488 // Remove the unicode mark if present. 00489 if (strncmp(src_utf8.c_str(), "\xef\xbb\xbf", 3) == 0) { 00490 src_utf8.erase(0, 3); 00491 } 00492 tlog(1, "Render string of size %d\n", src_utf8.length()); 00493 00494 if (FLAGS_render_ngrams || FLAGS_only_extract_font_properties) { 00495 // Try to preserve behavior of old text2image by expanding inter-word 00496 // spaces by a factor of 4. 00497 const string kSeparator = FLAGS_render_ngrams ? " " : " "; 00498 // Also restrict the number of charactes per line to try and avoid 00499 // line-breaking in the middle of words like "-A", "R$" etc. which are 00500 // otherwise allowed by the standard unicode line-breaking rules. 00501 const int kCharsPerLine = (FLAGS_ptsize > 20) ? 50 : 100; 00502 string rand_utf8; 00503 UNICHARSET unicharset; 00504 if (FLAGS_render_ngrams && !FLAGS_unicharset_file.empty() && 00505 !unicharset.load_from_file(FLAGS_unicharset_file.c_str())) { 00506 TLOG_FATAL("Failed to load unicharset from file %s\n", 00507 FLAGS_unicharset_file.c_str()); 00508 } 00509 00510 // If we are rendering ngrams that will be OCRed later, shuffle them so that 00511 // tesseract does not have difficulties finding correct baseline, word 00512 // spaces, etc. 00513 const char *str8 = src_utf8.c_str(); 00514 int len = src_utf8.length(); 00515 int step; 00516 vector<pair<int, int> > offsets; 00517 int offset = SpanUTF8Whitespace(str8); 00518 while (offset < len) { 00519 step = SpanUTF8NotWhitespace(str8 + offset); 00520 offsets.push_back(make_pair(offset, step)); 00521 offset += step; 00522 offset += SpanUTF8Whitespace(str8 + offset); 00523 } 00524 if (FLAGS_render_ngrams) 00525 std::random_shuffle(offsets.begin(), offsets.end()); 00526 00527 for (int i = 0, line = 1; i < offsets.size(); ++i) { 00528 const char *curr_pos = str8 + offsets[i].first; 00529 int ngram_len = offsets[i].second; 00530 // Skip words that contain characters not in found in unicharset. 00531 if (!FLAGS_unicharset_file.empty() && 00532 !unicharset.encodable_string(curr_pos, NULL)) { 00533 continue; 00534 } 00535 rand_utf8.append(curr_pos, ngram_len); 00536 if (rand_utf8.length() > line * kCharsPerLine) { 00537 rand_utf8.append(" \n"); 00538 ++line; 00539 if (line & 0x1) rand_utf8.append(kSeparator); 00540 } else { 00541 rand_utf8.append(kSeparator); 00542 } 00543 } 00544 tlog(1, "Rendered ngram string of size %d\n", rand_utf8.length()); 00545 src_utf8.swap(rand_utf8); 00546 } 00547 if (FLAGS_only_extract_font_properties) { 00548 tprintf("Extracting font properties only\n"); 00549 ExtractFontProperties(src_utf8, &render, FLAGS_outputbase.c_str()); 00550 tprintf("Done!\n"); 00551 return 0; 00552 } 00553 00554 int im = 0; 00555 vector<float> page_rotation; 00556 const char* to_render_utf8 = src_utf8.c_str(); 00557 00558 tesseract::TRand randomizer; 00559 randomizer.set_seed(kRandomSeed); 00560 vector<string> font_names; 00561 // We use a two pass mechanism to rotate images in both direction. 00562 // The first pass(0) will rotate the images in random directions and 00563 // the second pass(1) will mirror those rotations. 00564 int num_pass = FLAGS_bidirectional_rotation ? 2 : 1; 00565 for (int pass = 0; pass < num_pass; ++pass) { 00566 int page_num = 0; 00567 string font_used; 00568 for (int offset = 0; offset < strlen(to_render_utf8); ++im, ++page_num) { 00569 tlog(1, "Starting page %d\n", im); 00570 Pix* pix = NULL; 00571 if (FLAGS_find_fonts) { 00572 offset += render.RenderAllFontsToImage(FLAGS_min_coverage, 00573 to_render_utf8 + offset, 00574 strlen(to_render_utf8 + offset), 00575 &font_used, &pix); 00576 } else { 00577 offset += render.RenderToImage(to_render_utf8 + offset, 00578 strlen(to_render_utf8 + offset), &pix); 00579 } 00580 if (pix != NULL) { 00581 float rotation = 0; 00582 if (pass == 1) { 00583 // Pass 2, do mirror rotation. 00584 rotation = -1 * page_rotation[page_num]; 00585 } 00586 if (FLAGS_degrade_image) { 00587 pix = DegradeImage(pix, FLAGS_exposure, &randomizer, &rotation); 00588 } 00589 render.RotatePageBoxes(rotation); 00590 00591 if (pass == 0) { 00592 // Pass 1, rotate randomly and store the rotation.. 00593 page_rotation.push_back(rotation); 00594 } 00595 00596 Pix* gray_pix = pixConvertTo8(pix, false); 00597 pixDestroy(&pix); 00598 Pix* binary = pixThresholdToBinary(gray_pix, 128); 00599 pixDestroy(&gray_pix); 00600 char tiff_name[1024]; 00601 if (FLAGS_find_fonts) { 00602 if (FLAGS_render_per_font) { 00603 string fontname_for_file = tesseract::StringReplace( 00604 font_used, " ", "_"); 00605 snprintf(tiff_name, 1024, "%s.%s.tif", FLAGS_outputbase.c_str(), 00606 fontname_for_file.c_str()); 00607 pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, "w"); 00608 tprintf("Rendered page %d to file %s\n", im, tiff_name); 00609 } else { 00610 font_names.push_back(font_used); 00611 } 00612 } else { 00613 snprintf(tiff_name, 1024, "%s.tif", FLAGS_outputbase.c_str()); 00614 pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, im == 0 ? "w" : "a"); 00615 tprintf("Rendered page %d to file %s\n", im, tiff_name); 00616 } 00617 // Make individual glyphs 00618 if (FLAGS_output_individual_glyph_images) { 00619 if (!MakeIndividualGlyphs(binary, render.GetBoxes(), im)) { 00620 tprintf("ERROR: Individual glyphs not saved\n"); 00621 } 00622 } 00623 pixDestroy(&binary); 00624 } 00625 if (FLAGS_find_fonts && offset != 0) { 00626 // We just want a list of names, or some sample images so we don't need 00627 // to render more than the first page of the text. 00628 break; 00629 } 00630 } 00631 } 00632 if (!FLAGS_find_fonts) { 00633 string box_name = FLAGS_outputbase.c_str(); 00634 box_name += ".box"; 00635 render.WriteAllBoxes(box_name); 00636 } else if (!FLAGS_render_per_font && !font_names.empty()) { 00637 string filename = FLAGS_outputbase.c_str(); 00638 filename += ".fontlist.txt"; 00639 FILE* fp = fopen(filename.c_str(), "wb"); 00640 if (fp == NULL) { 00641 tprintf("Failed to create output font list %s\n", filename.c_str()); 00642 } else { 00643 for (int i = 0; i < font_names.size(); ++i) { 00644 fprintf(fp, "%s\n", font_names[i].c_str()); 00645 } 00646 fclose(fp); 00647 } 00648 } 00649 00650 return 0; 00651 }