// NOTE(review): the include lines and the first constant are fused on the next line of this excerpt.
// kCharWidth divides the font's default glyph width (/DW is written as 1000 / kCharWidth)
// and multiplies the per-word horizontal stretch (Tz) computed in GetPDFTextObjects.
20 #include "config_auto.h" 26 #include "allheaders.h" 169 static const int kCharWidth = 2;
// Size of the char buffer that CodepointToUtf16be() formats into with snprintf.
174 static const int kMaxBytesPerCodepoint = 20;
// Fragment of a function whose header is not visible in this excerpt: stores
// the textonly argument in the textonly_ member.
184 textonly_ = textonly;
// Takes the byte size of one PDF object. Callers in this file invoke it after
// they have already written the object's bytes themselves with AppendData.
// NOTE(review): the body is not visible in this excerpt — confirm what
// bookkeeping it performs with objectsize.
188 void TessPDFRenderer::AppendPDFObjectDIY(
size_t objectsize) {
// Convenience wrapper for an object held in a NUL-terminated string: passes
// strlen(data) to AppendPDFObjectDIY.
// NOTE(review): the rest of the body is not visible in this excerpt.
193 void TessPDFRenderer::AppendPDFObject(
const char *data) {
194 AppendPDFObjectDIY(strlen(data));
// Rounds x to the nearest multiple of 1/kPrecision, i.e. to three decimal
// places, before it is written into the PDF stream.
// NOTE(review): the remainder of the body, including the return statement,
// is not visible in this excerpt.
201 static double prec(
double x) {
202 double kPrecision = 1000.0;
203 double a = round(x * kPrecision) / kPrecision;
// Returns the squared Euclidean distance between (x1, y1) and (x2, y2).
//
// The coordinate differences are widened to long before they are subtracted
// and multiplied. Doing the arithmetic in int and converting only the final
// sum overflows (undefined behaviour) once a difference exceeds about 46340.
static long dist2(int x1, int y1, int x2, int y2) {
  const long dx = static_cast<long>(x2) - x1;
  const long dy = static_cast<long>(y2) - y1;
  return dx * dx + dy * dy;
}
// Computes where a word sits on its text line and how long it is.
//   word_*  : the word's baseline endpoints, in image pixels.
//   line_*  : the text line's baseline endpoints, in image pixels.
//   ppi     : image resolution, used to convert pixels to points (72 per inch).
//   height  : subtracted from to flip the y axis; callers pass the page height
//             in points.
//   x0, y0, length : outputs. Only the store to *length is visible here.
// NOTE(review): several lines of this function are missing from the excerpt
// (the condition around the swaps, the definitions of px/py/x/y, the stores to
// *x0 and *y0); the comments below describe only what is shown.
221 static void GetWordBaseline(
int writing_direction,
int ppi,
int height,
222 int word_x1,
int word_y1,
int word_x2,
int word_y2,
223 int line_x1,
int line_y1,
int line_x2,
int line_y2,
224 double *x0,
double *y0,
double *length) {
// Exchange the word's two endpoints (guarding condition not visible —
// presumably depends on writing_direction; confirm).
226 Swap(&word_x1, &word_x2);
227 Swap(&word_y1, &word_y2);
// Squared length of the line baseline.
234 double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
// t is the scalar projection of (px, py) onto the line direction, measured
// from (line_x2, line_y2) and normalised by the squared line length.
239 double t = ((px - line_x2) * (line_x2 - line_x1) +
240 (py - line_y2) * (line_y2 - line_y1)) / l2;
// Point on the line baseline corresponding to t.
241 x = line_x2 + t * (line_x2 - line_x1);
242 y = line_y2 + t * (line_y2 - line_y1);
// Word length in pixels from its endpoints (continuation line not visible),
// then converted to points.
244 word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
246 word_length = word_length * 72.0 / ppi;
// Convert y to points and flip it so it is measured from the other edge of
// the page.
248 y = height - (y * 72.0 / ppi);
252 *length = word_length;
// Fills the four matrix entries *a..*d used for the text of a line, derived
// from the line baseline's angle and the writing direction.
// NOTE(review): the switch cases and the stores to a, b, c, d are not visible
// in this excerpt.
263 static void AffineMatrix(
int writing_direction,
264 int line_x1,
int line_y1,
int line_x2,
int line_y2,
265 double *a,
double *b,
double *c,
double *d) {
// Baseline angle. The y difference is taken as y1 - y2 (reversed relative to
// x), which negates the angle compared with using y2 - y1.
266 double theta = atan2(static_cast<double>(line_y1 - line_y2),
267 static_cast<double>(line_x2 - line_x1));
272 switch(writing_direction) {
// Writes a (possibly flattened) copy of the baseline (x1, y1)-(x2, y2) to the
// line_* outputs.
// NOTE(review): the lines that first assign the four outputs are not visible
// in this excerpt; only the flattening step is shown.
292 static void ClipBaseline(
int ppi,
int x1,
int y1,
int x2,
int y2,
293 int *line_x1,
int *line_y1,
294 int *line_x2,
int *line_y2) {
// rise and run are the absolute y and x extents multiplied by 72, so comparing
// them with 2 * ppi is comparing the extent in points with 2.
299 int rise = abs(y2 - y1) * 72;
300 int run = abs(x2 - x1) * 72;
// When the rise is under 2 points and the run is over 2 points, make the line
// exactly horizontal at the midpoint of the two y values.
301 if (rise < 2 * ppi && 2 * ppi < run)
302 *line_y1 = *line_y2 = (y1 + y2) / 2;
// Formats a Unicode code point as upper-case hexadecimal UTF-16 code units
// into utf16 (buffer of kMaxBytesPerCodepoint chars).
// NOTE(review): the return statements are not visible in this excerpt; callers
// treat a true result as "utf16 holds usable text".
305 static bool CodepointToUtf16be(
int code,
char utf16[kMaxBytesPerCodepoint]) {
// Reject the surrogate range 0xD800..0xDFFF and anything above 0x10FFFF.
306 if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
307 tprintf(
"Dropping invalid codepoint %d\n", code);
// Below 0x10000: a single code unit written as four hex digits.
310 if (code < 0x10000) {
311 snprintf(utf16, kMaxBytesPerCodepoint,
"%04X", code);
// Otherwise: split the 20-bit offset from 0x10000 into a high and a low
// surrogate and write both as four hex digits each.
313 int a = code - 0x010000;
314 int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
315 int low_surrogate = (0x03FF & a) + 0xDC00;
316 snprintf(utf16, kMaxBytesPerCodepoint,
317 "%04X%04X", high_surrogate, low_surrogate);
// Builds the text of a page's content stream from the recognition results and
// returns it as a NUL-terminated copy allocated with new char[]. The caller in
// this file takes ownership of the result through a std::unique_ptr<char[]>.
//   width, height : passed through prec() into the image placement matrix and
//                   (height) into GetWordBaseline; callers pass the page size
//                   in points.
// NOTE(review): many lines of this function (loop headers, conditions, several
// declarations) are missing from the excerpt; comments describe only the
// visible statements.
322 char* TessPDFRenderer::GetPDFTextObjects(
TessBaseAPI* api,
323 double width,
double height) {
// State remembered from the previously emitted word.
327 double old_x = 0.0, old_y = 0.0;
328 int old_fontsize = 0;
331 bool new_block =
true;
// Numbers are written with the classic ("C") locale and up to 8 significant
// digits.
338 std::stringstream pdf_str;
340 pdf_str.imbue(std::locale::classic());
342 pdf_str.precision(8);
// Scale matrix width x height followed by painting the image XObject /Im1
// (any guarding condition is not visible in this excerpt).
347 pdf_str <<
"q " << prec(width) <<
" 0 0 " << prec(height) <<
" 0 0 cm";
349 pdf_str <<
" /Im1 Do";
// Begin a text object with text rendering mode 3.
361 pdf_str <<
"BT\n3 Tr";
// Derive the line baseline used for every word of the line.
369 ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
// Ask the iterator for orientation data; writing_direction is the value used
// below.
383 res_it->
Orientation(&orientation, &writing_direction,
384 &textline_order, &deskew_angle);
// Fall back to the previous word's writing direction (condition not visible).
394 writing_direction = old_writing_direction;
// Position and length of the current word along the line baseline.
400 double x, y, word_length;
402 int word_x1, word_y1, word_x2, word_y2;
404 GetWordBaseline(writing_direction, ppi, height,
405 word_x1, word_y1, word_x2, word_y2,
406 line_x1, line_y1, line_x2, line_y2,
407 &x, &y, &word_length);
// A new matrix is computed and written when the writing direction changed or a
// new block started.
410 if (writing_direction != old_writing_direction || new_block) {
411 AffineMatrix(writing_direction,
412 line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
413 pdf_str <<
" " << prec(a)
// Otherwise the offset from the previous word is written after transforming it
// with the matrix entries a, b, c, d.
422 double dx = x - old_x;
423 double dy = y - old_y;
424 pdf_str <<
" " << prec(dx * a + dy * b)
425 <<
" " << prec(dx * c + dy * d)
430 old_writing_direction = writing_direction;
// Font attributes of the word; only fontsize is used in the visible code.
437 bool bold, italic, underlined, monospace, serif, smallcaps;
440 &serif, &smallcaps, &fontsize, &font_id);
// Fallback size of 8 (the condition selecting it is not visible). A Tf
// operator is written only when the size differs from the previous word's.
441 const int kDefaultFontsize = 8;
443 fontsize = kDefaultFontsize;
444 if (fontsize != old_fontsize) {
445 pdf_str <<
"/f-0-0 " << fontsize <<
" Tf ";
446 old_fontsize = fontsize;
// Accumulate the word as hexadecimal UTF-16 text, one grapheme at a time;
// code points rejected by CodepointToUtf16be are skipped.
452 std::string pdf_word;
453 int pdf_word_len = 0;
455 const std::unique_ptr<const char[]> grapheme(
457 if (grapheme && grapheme[0] !=
'\0') {
459 char utf16[kMaxBytesPerCodepoint];
460 for (
char32 code : unicodes) {
461 if (CodepointToUtf16be(code, utf16)) {
// Horizontal scaling (Tz) derived from word length, font size and the number
// of emitted characters, then the word itself as a hex string. Words with no
// length or no characters are not written.
473 if (word_length > 0 && pdf_word_len > 0) {
475 kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
476 pdf_str << h_stretch <<
" Tz" 477 <<
" [ <" << pdf_word
480 if (last_word_in_line) {
483 if (last_word_in_block) {
// Copy the accumulated stream into a new char[] buffer (length + 1 for the
// terminating NUL) that is handed to the caller.
487 const std::string& text = pdf_str.str();
488 char* result =
new char[text.length() + 1];
489 strcpy(result, text.c_str());
// NOTE(review): fragment of a function whose header is not visible in this
// excerpt (presumably the begin-document handler — confirm). It writes the
// fixed objects that precede the pages: file header, font dictionaries, the
// CID-to-GID map, the ToUnicode CMap, the font descriptor and the font file.
// File header: version line plus a comment line containing non-ASCII bytes.
495 AppendPDFObject(
"%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
// Objects 1 and 3. Object 3 is the font dictionary named /GlyphLessFont; it
// references object 4 as descendant font and object 6 as /ToUnicode.
498 AppendPDFObject(
"1 0 obj\n" 510 AppendPDFObject(
"3 0 obj\n" 512 " /BaseFont /GlyphLessFont\n" 513 " /DescendantFonts [ 4 0 R ]\n" 514 " /Encoding /Identity-H\n" 516 " /ToUnicode 6 0 R\n" 522 std::stringstream stream;
526 " /BaseFont /GlyphLessFont\n" 527 " /CIDToGIDMap 5 0 R\n" 530 " /Ordering (Identity)\n" 531 " /Registry (Adobe)\n" 534 " /FontDescriptor 7 0 R\n" 535 " /Subtype /CIDFontType2\n" 537 " /DW " << (1000 / kCharWidth) <<
"\n" 540 AppendPDFObject(stream.str().c_str());
// CID-to-GID map: 2 bytes for each of 65536 entries. Even bytes are 0 and odd
// bytes are 1, so every two-byte entry holds the bytes 00 01.
543 const int kCIDToGIDMapSize = 2 * (1 << 16);
544 const std::unique_ptr<unsigned char[]> cidtogidmap(
545 new unsigned char[kCIDToGIDMapSize]);
546 for (
int i = 0; i < kCIDToGIDMapSize; i++) {
547 cidtogidmap[i] = (i % 2) ? 1 : 0;
// The map is stored compressed; len receives the compressed size.
550 unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
555 " /Length " << len <<
" /Filter /FlateDecode\n" 559 long objsize = stream.str().size();
560 AppendData(reinterpret_cast<char *>(comp), len);
// The object was written in pieces, so its total size is summed by hand and
// reported through AppendPDFObjectDIY.
563 const char *endstream_endobj =
567 objsize += strlen(endstream_endobj);
568 AppendPDFObjectDIY(objsize);
// Text of the CMap stream; its /Length is sizeof(stream2) - 1, i.e. without
// the terminating NUL.
570 const char stream2[] =
571 "/CIDInit /ProcSet findresource begin\n" 576 " /Registry (Adobe)\n" 580 "/CMapName /Adobe-Identify-UCS def\n" 582 "1 begincodespacerange\n" 584 "endcodespacerange\n" 586 "<0000> <FFFF> <0000>\n" 589 "CMapName currentdict /CMap defineresource pop\n" 597 "<< /Length " << (
sizeof(stream2) - 1) <<
" >>\n" 598 "stream\n" << stream2 <<
601 AppendPDFObject(stream.str().c_str());
612 " /FontBBox [ 0 0 " << (1000 / kCharWidth) <<
" 1000 ]\n" 613 " /FontFile2 8 0 R\n" 614 " /FontName /GlyphLessFont\n" 617 " /Type /FontDescriptor\n" 620 AppendPDFObject(stream.str().c_str());
// The font program is read from "<datadir_>/pdf.ttf"; a message is printed if
// the file cannot be opened.
623 stream << datadir_.c_str() <<
"/pdf.ttf";
624 FILE *fp = fopen(stream.str().c_str(),
"rb");
626 tprintf(
"Cannot open file \"%s\"!\n", stream.str().c_str());
// Determine the file size by seeking to the end, then rewind and allocate a
// buffer of that size.
// NOTE(review): no check of the ftell() result is visible in this excerpt —
// confirm a negative size is handled before the allocation.
629 fseek(fp, 0, SEEK_END);
630 long int size = ftell(fp);
635 fseek(fp, 0, SEEK_SET);
636 const std::unique_ptr<char[]> buffer(
new char[size]);
647 " /Length " << size <<
"\n" 648 " /Length1 " << size <<
"\n" 652 objsize = stream.str().size();
656 objsize += strlen(endstream_endobj);
657 AppendPDFObjectDIY(objsize);
// Builds the bytes of a PDF image object for pix / filename. On the visible
// success path *pdf_object receives a buffer allocated with new char[] and
// *pdf_object_size its length in bytes.
// NOTE(review): part of the parameter list, the return statements and several
// conditions are missing from this excerpt; comments describe only the
// visible statements.
661 bool TessPDFRenderer::imageToPDFObj(Pix *pix,
662 const char* filename,
665 long int* pdf_object_size,
666 const int jpg_quality) {
// Both output pointers are required; they are reset before any work is done.
667 if (!pdf_object_size || !pdf_object)
669 *pdf_object =
nullptr;
670 *pdf_object_size = 0;
// At least one of filename and pix must be supplied.
671 if (!filename && !pix)
674 L_Compressed_Data *cid =
nullptr;
// Images whose input format was PNG are flate-encoded from the pix; the other
// visible call builds the compressed data from filename/pix with jpg_quality.
677 if (pixGetInputFormat(pix) == IFF_PNG)
678 sad = pixGenerateCIData(pix, L_FLATE_ENCODE, 0, 0, &cid);
680 sad = l_generateCIDataForPdf(filename, pix, jpg_quality, &cid);
// cid is released on this path (presumably the failure path — confirm).
684 l_CIDataDestroy(&cid);
// Choose the /Filter name for the compressed data (the selecting switch is not
// visible in this excerpt).
688 const char *group4 =
"";
692 filter =
"/FlateDecode";
695 filter =
"/DCTDecode";
698 filter =
"/CCITTFaxDecode";
702 filter =
"/JPXDecode";
705 l_CIDataDestroy(&cid);
// Colour space entry: an indexed RGB palette when the data has a colormap
// (ncolors > 0), otherwise DeviceGray or DeviceRGB (conditions not visible).
712 std::stringstream colorspace;
713 if (cid->ncolors > 0) {
715 <<
" /ColorSpace [ /Indexed /DeviceRGB " << (cid->ncolors - 1)
716 <<
" " << cid->cmapdatahex <<
" ]\n";
720 colorspace.str(
" /ColorSpace /DeviceGray\n");
723 colorspace.str(
" /ColorSpace /DeviceRGB\n");
726 l_CIDataDestroy(&cid);
// /Predictor value written below: 14 when cid->predictor is set, else 1.
731 int predictor = (cid->predictor) ? 14 : 1;
// b1 and b2 hold the dictionary text that goes before and after the colour
// space entry.
734 std::stringstream b1;
738 " /Length " << cid->nbytescomp <<
"\n" 739 " /Subtype /Image\n";
741 std::stringstream b2;
743 " /Width " << cid->w <<
"\n" 744 " /Height " << cid->h <<
"\n" 745 " /BitsPerComponent " << cid->bps <<
"\n" 746 " /Filter " << filter <<
"\n" 749 " /Predictor " << predictor <<
"\n" 750 " /Colors " << cid->spp <<
"\n" << group4 <<
751 " /Columns " << cid->w <<
"\n" 752 " /BitsPerComponent " << cid->bps <<
"\n" 761 size_t b1_len = b1.str().size();
// The object is the concatenation b1 + colorspace + b2 + compressed data + b3;
// its total size is the sum of the five lengths.
762 size_t b2_len = b2.str().size();
763 size_t b3_len = strlen(b3);
764 size_t colorspace_len = colorspace.str().size();
767 b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
// Allocate the output buffer; it is not NUL-terminated, so the size must be
// used by whoever consumes it.
768 *pdf_object =
new char[*pdf_object_size];
// Copy the five pieces in order (the pointer advances between the first
// copies are not visible in this excerpt).
770 char *p = *pdf_object;
771 memcpy(p, b1.str().c_str(), b1_len);
773 memcpy(p, colorspace.str().c_str(), colorspace_len);
775 memcpy(p, b2.str().c_str(), b2_len);
777 memcpy(p, cid->datacomp, cid->nbytescomp);
778 p += cid->nbytescomp;
779 memcpy(p, b3, b3_len);
// The compressed data has been copied, so cid can be released.
780 l_CIDataDestroy(&cid);
// NOTE(review): fragment of a function whose header is not visible in this
// excerpt (presumably the per-image handler — confirm). It writes the page
// object, the compressed text content stream and the image object.
// A missing pix or a non-positive resolution is rejected (the action taken is
// not visible).
788 if (!pix || ppi <= 0)
// Page size in points: pixels * 72 / ppi.
790 double width = pixGetWidth(pix) * 72.0 / ppi;
791 double height = pixGetHeight(pix) * 72.0 / ppi;
// Resource entry naming the image /Im1 and pointing at object obj_ + 2 (any
// guarding condition is not visible).
793 std::stringstream xobject;
795 xobject <<
"/XObject << /Im1 " << (obj_ + 2) <<
" 0 R >>\n";
// Page object, written with the classic locale and fixed notation. Its
// /Contents refers to object obj_ + 1 and its font resource to object 3.
799 std::stringstream stream;
801 stream.imbue(std::locale::classic());
803 stream << std::fixed <<
808 " /MediaBox [0 0 " << width <<
" " << height <<
"]\n" 809 " /Contents " << (obj_ + 1) <<
" 0 R\n" 812 " " << xobject.str() <<
813 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n" 814 " /Font << /f-0-0 3 0 R >>\n" 819 AppendPDFObject(stream.str().c_str());
// Text content: generated, owned by a unique_ptr, then compressed.
822 const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
823 const size_t pdftext_len = strlen(pdftext.get());
825 unsigned char *comp_pdftext = zlibCompress(
826 reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
827 long comp_pdftext_len = len;
832 " /Length " << comp_pdftext_len <<
" /Filter /FlateDecode\n" 836 long objsize = stream.str().size();
// The stream object is written in pieces, so its size is summed by hand; the
// compressed buffer is released with lept_free once it has been appended.
837 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
838 objsize += comp_pdftext_len;
839 lept_free(comp_pdftext);
844 objsize += strlen(b2);
845 AppendPDFObjectDIY(objsize);
// Image object produced by imageToPDFObj; objsize receives its size.
848 char *pdf_object =
nullptr;
851 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize,
856 AppendPDFObjectDIY(objsize);
// NOTE(review): fragment of a function whose header is not visible in this
// excerpt (presumably the end-document handler — confirm). It writes the
// pages object, the document information object, the cross-reference table
// and the trailer.
// The pages object has the fixed number 2; its offset entry is set to the
// current last offset.
871 const long int kPagesObjectNumber = 2;
872 offsets_[kPagesObjectNumber] = offsets_.
back();
873 std::stringstream stream;
874 stream << kPagesObjectNumber <<
" 0 obj\n<<\n /Type /Pages\n /Kids [ ";
// pages_objsize sums the size of the stream contents at three points; the
// lines between them are not visible in this excerpt — presumably the stream
// is flushed and reset each time; confirm.
876 size_t pages_objsize = stream.str().size();
// One indirect reference per entry of pages_.
879 stream << pages_[i] <<
" 0 R ";
881 pages_objsize += stream.str().size();
884 stream <<
"]\n /Count " << pages_.
size() <<
"\n>>\nendobj\n";
886 pages_objsize += stream.str().size();
// Advance the last offset by the size of the pages object.
887 offsets_.
back() += pages_objsize;
// Title as a hexadecimal UTF-16 string, starting with the byte order mark
// FEFF; code points rejected by CodepointToUtf16be are skipped.
890 STRING utf16_title =
"FEFF";
892 char utf16[kMaxBytesPerCodepoint];
893 for (
char32 code : unicodes) {
894 if (CodepointToUtf16be(code, utf16)) {
895 utf16_title += utf16;
// Information object carrying the creation date and the title.
899 char* datestr = l_getFormattedDate();
902 << obj_ <<
" 0 obj\n" 905 " /CreationDate (D:" << datestr <<
")\n" 906 " /Title <" << utf16_title.
c_str() <<
">\n" 910 AppendPDFObject(stream.str().c_str());
// Cross-reference table: the free entry for object 0, then one in-use entry
// per object 1 .. obj_ - 1 taken from offsets_.
912 stream <<
"xref\n0 " << obj_ <<
"\n0000000000 65535 f \n";
914 for (
int i = 1; i < obj_; i++) {
918 stream << offsets_[i] <<
" 00000 n \n";
// Trailer: /Info refers to object obj_ - 1, and startxref is the last recorded
// offset.
923 <<
"trailer\n<<\n /Size " << obj_ <<
"\n" 925 " /Info " << (obj_ - 1) <<
" 0 R\n" 926 ">>\nstartxref\n" << offsets_.
back() <<
"\n%%EOF\n";
bool IsAtBeginningOf(PageIteratorLevel level) const override
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool GetIntVariable(const char *name, int *value) const
bool EndDocumentHandler() override
bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
ResultIterator * GetIterator()
StrongScriptDirection WordDirection() const
void AppendString(const char *s)
static const char * Version()
bool AddImageHandler(TessBaseAPI *api) override
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly=false)
DLLSYM void tprintf(const char *format,...)
size_t unsigned_size() const
bool Empty(PageIteratorLevel level) const
int GetSourceYResolution()
const char * title() const
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
const char * WordFontAttributes(bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
bool Next(PageIteratorLevel level) override
void Orientation(tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
bool BeginDocumentHandler() override
void AppendData(const char *s, int len)
bool DeSerialize(FILE *fp, char *data, size_t n)
const char * GetInputName()
const char * c_str() const