tesseract  4.1.0
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir, bool textonly=false)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
bool happy ()
 
int imagenum () const
 

Protected Member Functions

bool BeginDocumentHandler () override
 
bool AddImageHandler (TessBaseAPI *api) override
 
bool EndDocumentHandler () override
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders tesseract output into searchable PDF

Definition at line 213 of file renderer.h.

Constructor & Destructor Documentation

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir,
bool  textonly = false 
)

Definition at line 179 of file pdfrenderer.cpp.

181  : TessResultRenderer(outputbase, "pdf"),
182  datadir_(datadir) {
183  obj_ = 0;
184  textonly_ = textonly;
185  offsets_.push_back(0);
186 }
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:33
int push_back(T object)

Member Function Documentation

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api)
override protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 784 of file pdfrenderer.cpp.

784  {
785  Pix *pix = api->GetInputImage();
786  const char* filename = api->GetInputName();
787  int ppi = api->GetSourceYResolution();
788  if (!pix || ppi <= 0)
789  return false;
790  double width = pixGetWidth(pix) * 72.0 / ppi;
791  double height = pixGetHeight(pix) * 72.0 / ppi;
792 
793  std::stringstream xobject;
794  if (!textonly_) {
795  xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n";
796  }
797 
798  // PAGE
799  std::stringstream stream;
800  // Use "C" locale (needed for double values width and height).
801  stream.imbue(std::locale::classic());
802  stream.precision(2);
803  stream << std::fixed <<
804  obj_ << " 0 obj\n"
805  "<<\n"
806  " /Type /Page\n"
807  " /Parent 2 0 R\n" // Pages object
808  " /MediaBox [0 0 " << width << " " << height << "]\n"
809  " /Contents " << (obj_ + 1) << " 0 R\n" // Contents object
810  " /Resources\n"
811  " <<\n"
812  " " << xobject.str() << // Image object
813  " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
814  " /Font << /f-0-0 3 0 R >>\n" // Type0 Font
815  " >>\n"
816  ">>\n"
817  "endobj\n";
818  pages_.push_back(obj_);
819  AppendPDFObject(stream.str().c_str());
820 
821  // CONTENTS
822  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
823  const size_t pdftext_len = strlen(pdftext.get());
824  size_t len;
825  unsigned char *comp_pdftext = zlibCompress(
826  reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
827  long comp_pdftext_len = len;
828  stream.str("");
829  stream <<
830  obj_ << " 0 obj\n"
831  "<<\n"
832  " /Length " << comp_pdftext_len << " /Filter /FlateDecode\n"
833  ">>\n"
834  "stream\n";
835  AppendString(stream.str().c_str());
836  long objsize = stream.str().size();
837  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
838  objsize += comp_pdftext_len;
839  lept_free(comp_pdftext);
840  const char *b2 =
841  "endstream\n"
842  "endobj\n";
843  AppendString(b2);
844  objsize += strlen(b2);
845  AppendPDFObjectDIY(objsize);
846 
847  if (!textonly_) {
848  char *pdf_object = nullptr;
849  int jpg_quality;
850  api->GetIntVariable("jpg_quality", &jpg_quality);
851  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize,
852  jpg_quality)) {
853  return false;
854  }
855  AppendData(pdf_object, objsize);
856  AppendPDFObjectDIY(objsize);
857  delete[] pdf_object;
858  }
859  return true;
860 }
void AppendString(const char *s)
Definition: renderer.cpp:102
int push_back(T object)
void AppendData(const char *s, int len)
Definition: renderer.cpp:106
bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 494 of file pdfrenderer.cpp.

494  {
495  AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
496 
497  // CATALOG
498  AppendPDFObject("1 0 obj\n"
499  "<<\n"
500  " /Type /Catalog\n"
501  " /Pages 2 0 R\n"
502  ">>\nendobj\n");
503 
504  // We are reserving object #2 for the /Pages
505  // object, which I am going to create and write
506  // at the end of the PDF file.
507  AppendPDFObject("");
508 
509  // TYPE0 FONT
510  AppendPDFObject("3 0 obj\n"
511  "<<\n"
512  " /BaseFont /GlyphLessFont\n"
513  " /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
514  " /Encoding /Identity-H\n"
515  " /Subtype /Type0\n"
516  " /ToUnicode 6 0 R\n" // ToUnicode
517  " /Type /Font\n"
518  ">>\n"
519  "endobj\n");
520 
521  // CIDFONTTYPE2
522  std::stringstream stream;
523  stream <<
524  "4 0 obj\n"
525  "<<\n"
526  " /BaseFont /GlyphLessFont\n"
527  " /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
528  " /CIDSystemInfo\n"
529  " <<\n"
530  " /Ordering (Identity)\n"
531  " /Registry (Adobe)\n"
532  " /Supplement 0\n"
533  " >>\n"
534  " /FontDescriptor 7 0 R\n" // Font descriptor
535  " /Subtype /CIDFontType2\n"
536  " /Type /Font\n"
537  " /DW " << (1000 / kCharWidth) << "\n"
538  ">>\n"
539  "endobj\n";
540  AppendPDFObject(stream.str().c_str());
541 
542  // CIDTOGIDMAP
543  const int kCIDToGIDMapSize = 2 * (1 << 16);
544  const std::unique_ptr<unsigned char[]> cidtogidmap(
545  new unsigned char[kCIDToGIDMapSize]);
546  for (int i = 0; i < kCIDToGIDMapSize; i++) {
547  cidtogidmap[i] = (i % 2) ? 1 : 0;
548  }
549  size_t len;
550  unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
551  stream.str("");
552  stream <<
553  "5 0 obj\n"
554  "<<\n"
555  " /Length " << len << " /Filter /FlateDecode\n"
556  ">>\n"
557  "stream\n";
558  AppendString(stream.str().c_str());
559  long objsize = stream.str().size();
560  AppendData(reinterpret_cast<char *>(comp), len);
561  objsize += len;
562  lept_free(comp);
563  const char *endstream_endobj =
564  "endstream\n"
565  "endobj\n";
566  AppendString(endstream_endobj);
567  objsize += strlen(endstream_endobj);
568  AppendPDFObjectDIY(objsize);
569 
570  const char stream2[] =
571  "/CIDInit /ProcSet findresource begin\n"
572  "12 dict begin\n"
573  "begincmap\n"
574  "/CIDSystemInfo\n"
575  "<<\n"
576  " /Registry (Adobe)\n"
577  " /Ordering (UCS)\n"
578  " /Supplement 0\n"
579  ">> def\n"
580  "/CMapName /Adobe-Identify-UCS def\n"
581  "/CMapType 2 def\n"
582  "1 begincodespacerange\n"
583  "<0000> <FFFF>\n"
584  "endcodespacerange\n"
585  "1 beginbfrange\n"
586  "<0000> <FFFF> <0000>\n"
587  "endbfrange\n"
588  "endcmap\n"
589  "CMapName currentdict /CMap defineresource pop\n"
590  "end\n"
591  "end\n";
592 
593  // TOUNICODE
594  stream.str("");
595  stream <<
596  "6 0 obj\n"
597  "<< /Length " << (sizeof(stream2) - 1) << " >>\n"
598  "stream\n" << stream2 <<
599  "endstream\n"
600  "endobj\n";
601  AppendPDFObject(stream.str().c_str());
602 
603  // FONT DESCRIPTOR
604  stream.str("");
605  stream <<
606  "7 0 obj\n"
607  "<<\n"
608  " /Ascent 1000\n"
609  " /CapHeight 1000\n"
610  " /Descent -1\n" // Spec says must be negative
611  " /Flags 5\n" // FixedPitch + Symbolic
612  " /FontBBox [ 0 0 " << (1000 / kCharWidth) << " 1000 ]\n"
613  " /FontFile2 8 0 R\n"
614  " /FontName /GlyphLessFont\n"
615  " /ItalicAngle 0\n"
616  " /StemV 80\n"
617  " /Type /FontDescriptor\n"
618  ">>\n"
619  "endobj\n";
620  AppendPDFObject(stream.str().c_str());
621 
622  stream.str("");
623  stream << datadir_.c_str() << "/pdf.ttf";
624  FILE *fp = fopen(stream.str().c_str(), "rb");
625  if (!fp) {
626  tprintf("Cannot open file \"%s\"!\n", stream.str().c_str());
627  return false;
628  }
629  fseek(fp, 0, SEEK_END);
630  long int size = ftell(fp);
631  if (size < 0) {
632  fclose(fp);
633  return false;
634  }
635  fseek(fp, 0, SEEK_SET);
636  const std::unique_ptr<char[]> buffer(new char[size]);
637  if (!tesseract::DeSerialize(fp, buffer.get(), size)) {
638  fclose(fp);
639  return false;
640  }
641  fclose(fp);
642  // FONTFILE2
643  stream.str("");
644  stream <<
645  "8 0 obj\n"
646  "<<\n"
647  " /Length " << size << "\n"
648  " /Length1 " << size << "\n"
649  ">>\n"
650  "stream\n";
651  AppendString(stream.str().c_str());
652  objsize = stream.str().size();
653  AppendData(buffer.get(), size);
654  objsize += size;
655  AppendString(endstream_endobj);
656  objsize += strlen(endstream_endobj);
657  AppendPDFObjectDIY(objsize);
658  return true;
659 }
void AppendString(const char *s)
Definition: renderer.cpp:102
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void AppendData(const char *s, int len)
Definition: renderer.cpp:106
bool DeSerialize(FILE *fp, char *data, size_t n)
Definition: serialis.cpp:27
bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 863 of file pdfrenderer.cpp.

863  {
864  // We reserved the /Pages object number early, so that the /Page
865  // objects could refer to their parent. We finally have enough
866  // information to go fill it in. Using lower level calls to manipulate
867  // the offset record in two spots, because we are placing objects
868  // out of order in the file.
869 
870  // PAGES
871  const long int kPagesObjectNumber = 2;
872  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
873  std::stringstream stream;
874  stream << kPagesObjectNumber << " 0 obj\n<<\n /Type /Pages\n /Kids [ ";
875  AppendString(stream.str().c_str());
876  size_t pages_objsize = stream.str().size();
877  for (size_t i = 0; i < pages_.unsigned_size(); i++) {
878  stream.str("");
879  stream << pages_[i] << " 0 R ";
880  AppendString(stream.str().c_str());
881  pages_objsize += stream.str().size();
882  }
883  stream.str("");
884  stream << "]\n /Count " << pages_.size() << "\n>>\nendobj\n";
885  AppendString(stream.str().c_str());
886  pages_objsize += stream.str().size();
887  offsets_.back() += pages_objsize; // manipulation #2
888 
889  // INFO
890  STRING utf16_title = "FEFF"; // byte_order_marker
891  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
892  char utf16[kMaxBytesPerCodepoint];
893  for (char32 code : unicodes) {
894  if (CodepointToUtf16be(code, utf16)) {
895  utf16_title += utf16;
896  }
897  }
898 
899  char* datestr = l_getFormattedDate();
900  stream.str("");
901  stream
902  << obj_ << " 0 obj\n"
903  "<<\n"
904  " /Producer (Tesseract " << tesseract::TessBaseAPI::Version() << ")\n"
905  " /CreationDate (D:" << datestr << ")\n"
906  " /Title <" << utf16_title.c_str() << ">\n"
907  ">>\n"
908  "endobj\n";
909  lept_free(datestr);
910  AppendPDFObject(stream.str().c_str());
911  stream.str("");
912  stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n";
913  AppendString(stream.str().c_str());
914  for (int i = 1; i < obj_; i++) {
915  stream.str("");
916  stream.width(10);
917  stream.fill('0');
918  stream << offsets_[i] << " 00000 n \n";
919  AppendString(stream.str().c_str());
920  }
921  stream.str("");
922  stream
923  << "trailer\n<<\n /Size " << obj_ << "\n"
924  " /Root 1 0 R\n" // catalog
925  " /Info " << (obj_ - 1) << " 0 R\n" // info
926  ">>\nstartxref\n" << offsets_.back() << "\n%%EOF\n";
927  AppendString(stream.str().c_str());
928  return true;
929 }
Definition: strngs.h:45
void AppendString(const char *s)
Definition: renderer.cpp:102
static const char * Version()
Definition: baseapi.cpp:227
signed int char32
size_t unsigned_size() const
Definition: genericvector.h:74
const char * title() const
Definition: renderer.h:87
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:215
T & back() const
const char * c_str() const
Definition: strngs.cpp:205

The documentation for this class was generated from the following files: