11 #include "allheaders.h"
171 void TessPDFRenderer::AppendPDFObjectDIY(
size_t objectsize) {
176 void TessPDFRenderer::AppendPDFObject(
const char *data) {
177 AppendPDFObjectDIY(strlen(data));
185 double kPrecision = 1000.0;
186 double a = round(x * kPrecision) / kPrecision;
// Returns the squared Euclidean distance between the points (x1, y1) and
// (x2, y2).
//
// The coordinates are widened to long *before* the subtraction and the
// multiplication. Doing the arithmetic in int and widening only the final
// result lets the intermediate products overflow (undefined behaviour for
// signed int) once a coordinate difference exceeds roughly 46340, even
// though the return type could hold the value. On platforms where long is
// no wider than int the representable range is unchanged from before.
long dist2(int x1, int y1, int x2, int y2) {
  const long dx = static_cast<long>(x2) - x1;
  const long dy = static_cast<long>(y2) - y1;
  return dx * dx + dy * dy;
}
205 int word_x1,
int word_y1,
int word_x2,
int word_y2,
206 int line_x1,
int line_y1,
int line_x2,
int line_y2,
207 double *x0,
double *y0,
double *length) {
209 Swap(&word_x1, &word_x2);
210 Swap(&word_y1, &word_y2);
217 double l2 =
dist2(line_x1, line_y1, line_x2, line_y2);
222 double t = ((px - line_x2) * (line_x2 - line_x1) +
223 (py - line_y2) * (line_y2 - line_y1)) / l2;
224 x = line_x2 + t * (line_x2 - line_x1);
225 y = line_y2 + t * (line_y2 - line_y1);
227 word_length = sqrt(static_cast<double>(
dist2(word_x1, word_y1,
229 word_length = word_length * 72.0 / ppi;
231 y = height - (y * 72.0 / ppi);
235 *length = word_length;
247 int line_x1,
int line_y1,
int line_x2,
int line_y2,
248 double *a,
double *b,
double *c,
double *d) {
249 double theta = atan2(static_cast<double>(line_y1 - line_y2),
250 static_cast<double>(line_x2 - line_x1));
255 switch(writing_direction) {
276 int *line_x1,
int *line_y1,
277 int *line_x2,
int *line_y2) {
282 double rise = abs(y2 - y1) * 72 / ppi;
283 double run = abs(x2 - x1) * 72 / ppi;
284 if (rise < 2.0 && 2.0 < run)
285 *line_y1 = *line_y2 = (y1 + y2) / 2;
289 double width,
double height) {
291 double ppi = api->GetSourceYResolution();
294 double old_x = 0.0, old_y = 0.0;
295 int old_fontsize = 0;
298 bool new_block =
true;
309 pdf_str.add_str_double(
"",
prec(width));
311 pdf_str.add_str_double(
"",
prec(height));
312 pdf_str +=
" 0 0 cm /Im1 Do Q\n";
319 ResultIterator *res_it = api->GetIterator();
321 if (res_it->IsAtBeginningOf(
RIL_BLOCK)) {
322 pdf_str +=
"BT\n3 Tr";
330 ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
344 res_it->Orientation(&orientation, &writing_direction,
345 &textline_order, &deskew_angle);
347 switch (res_it->WordDirection()) {
355 writing_direction = old_writing_direction;
361 double x, y, word_length;
363 int word_x1, word_y1, word_x2, word_y2;
364 res_it->Baseline(
RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
366 word_x1, word_y1, word_x2, word_y2,
367 line_x1, line_y1, line_x2, line_y2,
368 &x, &y, &word_length);
371 if (writing_direction != old_writing_direction || new_block) {
373 line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
374 pdf_str.add_str_double(
" ",
prec(a));
375 pdf_str.add_str_double(
" ",
prec(b));
376 pdf_str.add_str_double(
" ",
prec(c));
377 pdf_str.add_str_double(
" ",
prec(d));
378 pdf_str.add_str_double(
" ",
prec(x));
379 pdf_str.add_str_double(
" ",
prec(y));
383 double dx = x - old_x;
384 double dy = y - old_y;
385 pdf_str.add_str_double(
" ",
prec(dx * a + dy * b));
386 pdf_str.add_str_double(
" ",
prec(dx * c + dy * d));
391 old_writing_direction = writing_direction;
398 bool bold, italic, underlined, monospace, serif, smallcaps;
400 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
401 &serif, &smallcaps, &fontsize, &font_id);
402 const int kDefaultFontsize = 8;
404 fontsize = kDefaultFontsize;
405 if (fontsize != old_fontsize) {
407 snprintf(textfont,
sizeof(textfont),
"/f-0-0 %d Tf ", fontsize);
409 old_fontsize = fontsize;
416 int pdf_word_len = 0;
418 const char *grapheme = res_it->GetUTF8Text(
RIL_SYMBOL);
419 if (grapheme && grapheme[0] !=
'\0') {
423 for (
int i = 0; i < unicodes.
length(); i++) {
424 int code = unicodes[i];
426 if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
427 tprintf(
"Dropping invalid codepoint %d\n", code);
430 if (code < 0x10000) {
431 snprintf(utf16,
sizeof(utf16),
"<%04X>", code);
433 int a = code - 0x010000;
434 int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
435 int low_surrogate = (0x03FF & a) + 0xDC00;
436 snprintf(utf16,
sizeof(utf16),
"<%04X%04X>",
437 high_surrogate, low_surrogate);
446 if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
448 kCharWidth *
prec(100.0 * word_length / (fontsize * pdf_word_len));
449 pdf_str.add_str_double(
"", h_stretch);
455 if (last_word_in_line) {
458 if (last_word_in_block) {
462 char *ret =
new char[pdf_str.length() + 1];
463 strcpy(ret, pdf_str.string());
472 n = snprintf(buf,
sizeof(buf),
475 0xDE, 0xAD, 0xBE, 0xEB);
476 if (n >=
sizeof(buf))
return false;
477 AppendPDFObject(buf);
480 n = snprintf(buf,
sizeof(buf),
488 if (n >=
sizeof(buf))
return false;
489 AppendPDFObject(buf);
497 n = snprintf(buf,
sizeof(buf),
500 " /BaseFont /GlyphLessFont\n"
501 " /DescendantFonts [ %ld 0 R ]\n"
502 " /Encoding /Identity-H\n"
504 " /ToUnicode %ld 0 R\n"
511 if (n >=
sizeof(buf))
return false;
512 AppendPDFObject(buf);
515 n = snprintf(buf,
sizeof(buf),
518 " /BaseFont /GlyphLessFont\n"
519 " /CIDToGIDMap %ld 0 R\n"
522 " /Ordering (Identity)\n"
523 " /Registry (Adobe)\n"
526 " /FontDescriptor %ld 0 R\n"
527 " /Subtype /CIDFontType2\n"
535 if (n >=
sizeof(buf))
return false;
536 AppendPDFObject(buf);
539 const int kCIDToGIDMapSize = 2 * (1 << 16);
540 unsigned char *cidtogidmap =
new unsigned char[kCIDToGIDMapSize];
541 for (
int i = 0; i < kCIDToGIDMapSize; i++) {
542 cidtogidmap[i] = (i % 2) ? 1 : 0;
545 unsigned char *comp =
546 zlibCompress(cidtogidmap, kCIDToGIDMapSize, &len);
547 delete[] cidtogidmap;
548 n = snprintf(buf,
sizeof(buf),
551 " /Length %lu /Filter /FlateDecode\n"
553 "stream\n", (
unsigned long)len);
554 if (n >=
sizeof(buf)) {
559 long objsize = strlen(buf);
560 AppendData(reinterpret_cast<char *>(comp), len);
563 const char *endstream_endobj =
567 objsize += strlen(endstream_endobj);
568 AppendPDFObjectDIY(objsize);
571 "/CIDInit /ProcSet findresource begin\n"
576 " /Registry (Adobe)\n"
580 "/CMapName /Adobe-Identify-UCS def\n"
582 "1 begincodespacerange\n"
584 "endcodespacerange\n"
586 "<0000> <FFFF> <0000>\n"
589 "CMapName currentdict /CMap defineresource pop\n"
594 n = snprintf(buf,
sizeof(buf),
596 "<< /Length %lu >>\n"
600 "endobj\n", (
unsigned long) strlen(stream), stream);
601 if (n >=
sizeof(buf))
return false;
602 AppendPDFObject(buf);
605 const int kCharHeight = 2;
606 n = snprintf(buf,
sizeof(buf),
613 " /FontBBox [ 0 0 %d %d ]\n"
614 " /FontFile2 %ld 0 R\n"
615 " /FontName /GlyphLessFont\n"
618 " /Type /FontDescriptor\n"
627 if (n >=
sizeof(buf))
return false;
628 AppendPDFObject(buf);
630 n = snprintf(buf,
sizeof(buf),
"%s/pdf.ttf", datadir_);
631 if (n >=
sizeof(buf))
return false;
632 FILE *fp = fopen(buf,
"rb");
634 tprintf(
"Can not open file \"%s\"!\n", buf);
637 fseek(fp, 0, SEEK_END);
638 long int size = ftell(fp);
639 fseek(fp, 0, SEEK_SET);
640 char *buffer =
new char[size];
641 if (fread(buffer, 1, size, fp) != size) {
648 n = snprintf(buf,
sizeof(buf),
654 "stream\n", size, size);
655 if (n >=
sizeof(buf)) {
660 objsize = strlen(buf);
665 objsize += strlen(endstream_endobj);
666 AppendPDFObjectDIY(objsize);
670 bool TessPDFRenderer::imageToPDFObj(Pix *pix,
674 long int *pdf_object_size) {
679 if (!pdf_object_size || !pdf_object)
682 *pdf_object_size = 0;
686 L_COMP_DATA *cid = NULL;
687 const int kJpegQuality = 85;
695 findFileFormat(filename, &format);
696 if (pixGetSpp(pix) == 4 && format == IFF_PNG) {
698 sad = pixGenerateCIData(pix, L_FLATE_ENCODE, 0, 0, &cid);
700 sad = l_generateCIDataForPdf(filename, pix, kJpegQuality, &cid);
704 l_CIDataDestroy(&cid);
708 const char *group4 =
"";
712 filter =
"/FlateDecode";
715 filter =
"/DCTDecode";
718 filter =
"/CCITTFaxDecode";
722 filter =
"/JPXDecode";
725 l_CIDataDestroy(&cid);
732 const char *colorspace;
733 if (cid->ncolors > 0) {
734 n = snprintf(b0,
sizeof(b0),
735 " /ColorSpace [ /Indexed /DeviceRGB %d %s ]\n",
736 cid->ncolors - 1, cid->cmapdatahex);
737 if (n >=
sizeof(b0)) {
738 l_CIDataDestroy(&cid);
745 colorspace =
" /ColorSpace /DeviceGray\n";
748 colorspace =
" /ColorSpace /DeviceRGB\n";
751 l_CIDataDestroy(&cid);
756 int predictor = (cid->predictor) ? 14 : 1;
759 n = snprintf(b1,
sizeof(b1),
763 " /Subtype /Image\n",
764 objnum, (
unsigned long) cid->nbytescomp);
765 if (n >=
sizeof(b1)) {
766 l_CIDataDestroy(&cid);
770 n = snprintf(b2,
sizeof(b2),
773 " /BitsPerComponent %d\n"
781 " /BitsPerComponent %d\n"
785 cid->w, cid->h, cid->bps, filter, predictor, cid->spp,
786 group4, cid->w, cid->bps);
787 if (n >=
sizeof(b2)) {
788 l_CIDataDestroy(&cid);
796 size_t b1_len = strlen(b1);
797 size_t b2_len = strlen(b2);
798 size_t b3_len = strlen(b3);
799 size_t colorspace_len = strlen(colorspace);
802 b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
803 *pdf_object =
new char[*pdf_object_size];
805 l_CIDataDestroy(&cid);
809 char *p = *pdf_object;
810 memcpy(p, b1, b1_len);
812 memcpy(p, colorspace, colorspace_len);
814 memcpy(p, b2, b2_len);
816 memcpy(p, cid->datacomp, cid->nbytescomp);
817 p += cid->nbytescomp;
818 memcpy(p, b3, b3_len);
819 l_CIDataDestroy(&cid);
829 if (!pix || ppi <= 0)
831 double width = pixGetWidth(pix) * 72.0 / ppi;
832 double height = pixGetHeight(pix) * 72.0 / ppi;
835 n = snprintf(buf,
sizeof(buf),
840 " /MediaBox [0 0 %.2f %.2f]\n"
841 " /Contents %ld 0 R\n"
844 " /XObject << /Im1 %ld 0 R >>\n"
845 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
846 " /Font << /f-0-0 %ld 0 R >>\n"
857 if (n >=
sizeof(buf))
return false;
859 AppendPDFObject(buf);
862 char* pdftext = GetPDFTextObjects(api, width, height);
863 long pdftext_len = strlen(pdftext);
864 unsigned char *pdftext_casted =
reinterpret_cast<unsigned char *
>(pdftext);
866 unsigned char *comp_pdftext =
867 zlibCompress(pdftext_casted, pdftext_len, &len);
868 long comp_pdftext_len = len;
869 n = snprintf(buf,
sizeof(buf),
872 " /Length %ld /Filter /FlateDecode\n"
874 "stream\n", obj_, comp_pdftext_len);
875 if (n >=
sizeof(buf)) {
877 lept_free(comp_pdftext);
881 long objsize = strlen(buf);
882 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
883 objsize += comp_pdftext_len;
884 lept_free(comp_pdftext);
890 objsize += strlen(b2);
891 AppendPDFObjectDIY(objsize);
894 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
898 AppendPDFObjectDIY(objsize);
915 const long int kPagesObjectNumber = 2;
916 offsets_[kPagesObjectNumber] = offsets_.
back();
917 n = snprintf(buf,
sizeof(buf),
921 " /Kids [ ", kPagesObjectNumber);
922 if (n >=
sizeof(buf))
return false;
924 size_t pages_objsize = strlen(buf);
925 for (
size_t i = 0; i < pages_.
size(); i++) {
926 n = snprintf(buf,
sizeof(buf),
927 "%ld 0 R ", pages_[i]);
928 if (n >=
sizeof(buf))
return false;
930 pages_objsize += strlen(buf);
932 n = snprintf(buf,
sizeof(buf),
936 "endobj\n", pages_.
size());
937 if (n >=
sizeof(buf))
return false;
939 pages_objsize += strlen(buf);
940 offsets_.
back() += pages_objsize;
943 char* datestr = l_getFormattedDate();
944 n = snprintf(buf,
sizeof(buf),
947 " /Producer (Tesseract %s)\n"
948 " /CreationDate (D:%s)\n"
953 if (n >=
sizeof(buf))
return false;
954 AppendPDFObject(buf);
955 n = snprintf(buf,
sizeof(buf),
958 "0000000000 65535 f \n", obj_);
959 if (n >=
sizeof(buf))
return false;
961 for (
int i = 1; i < obj_; i++) {
962 n = snprintf(buf,
sizeof(buf),
"%010ld 00000 n \n", offsets_[i]);
963 if (n >=
sizeof(buf))
return false;
966 n = snprintf(buf,
sizeof(buf),
980 if (n >=
sizeof(buf))
return false;
void AppendData(const char *s, int len)
#define TESSERACT_VERSION_STR
void AppendString(const char *s)
void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1, int *line_x2, int *line_y2)
void AffineMatrix(int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2, double *a, double *b, double *c, double *d)
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
const char * GetInputName()
long dist2(int x1, int y1, int x2, int y2)
virtual bool EndDocumentHandler()
virtual bool AddImageHandler(TessBaseAPI *api)
int GetSourceYResolution()
void GetWordBaseline(int writing_direction, int ppi, int height, int word_x1, int word_y1, int word_x2, int word_y2, int line_x1, int line_y1, int line_x2, int line_y2, double *x0, double *y0, double *length)
const char * title() const
TessPDFRenderer(const char *outputbase, const char *datadir)
virtual bool BeginDocumentHandler()
struct TessBaseAPI TessBaseAPI