tesseract  3.04.01
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
int imagenum () const
 

Protected Member Functions

virtual bool BeginDocumentHandler ()
 
virtual bool AddImageHandler (TessBaseAPI *api)
 
virtual bool EndDocumentHandler ()
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders Tesseract output into a searchable PDF.

Definition at line 168 of file renderer.h.

Constructor & Destructor Documentation

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir 
)

Definition at line 164 of file pdfrenderer.cpp.

165  : TessResultRenderer(outputbase, "pdf") {
166  obj_ = 0;
167  datadir_ = datadir;
168  offsets_.push_back(0);
169 }
int push_back(T object)
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:16

Member Function Documentation

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI * api )
protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 823 of file pdfrenderer.cpp.

823  {
824  size_t n;
825  char buf[kBasicBufSize];
826  Pix *pix = api->GetInputImage();
827  char *filename = (char *)api->GetInputName();
828  int ppi = api->GetSourceYResolution();
829  if (!pix || ppi <= 0)
830  return false;
831  double width = pixGetWidth(pix) * 72.0 / ppi;
832  double height = pixGetHeight(pix) * 72.0 / ppi;
833 
834  // PAGE
835  n = snprintf(buf, sizeof(buf),
836  "%ld 0 obj\n"
837  "<<\n"
838  " /Type /Page\n"
839  " /Parent %ld 0 R\n"
840  " /MediaBox [0 0 %.2f %.2f]\n"
841  " /Contents %ld 0 R\n"
842  " /Resources\n"
843  " <<\n"
844  " /XObject << /Im1 %ld 0 R >>\n"
845  " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
846  " /Font << /f-0-0 %ld 0 R >>\n"
847  " >>\n"
848  ">>\n"
849  "endobj\n",
850  obj_,
851  2L, // Pages object
852  width,
853  height,
854  obj_ + 1, // Contents object
855  obj_ + 2, // Image object
856  3L); // Type0 Font
857  if (n >= sizeof(buf)) return false;
858  pages_.push_back(obj_);
859  AppendPDFObject(buf);
860 
861  // CONTENTS
862  char* pdftext = GetPDFTextObjects(api, width, height);
863  long pdftext_len = strlen(pdftext);
864  unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext);
865  size_t len;
866  unsigned char *comp_pdftext =
867  zlibCompress(pdftext_casted, pdftext_len, &len);
868  long comp_pdftext_len = len;
869  n = snprintf(buf, sizeof(buf),
870  "%ld 0 obj\n"
871  "<<\n"
872  " /Length %ld /Filter /FlateDecode\n"
873  ">>\n"
874  "stream\n", obj_, comp_pdftext_len);
875  if (n >= sizeof(buf)) {
876  delete[] pdftext;
877  lept_free(comp_pdftext);
878  return false;
879  }
880  AppendString(buf);
881  long objsize = strlen(buf);
882  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
883  objsize += comp_pdftext_len;
884  lept_free(comp_pdftext);
885  delete[] pdftext;
886  const char *b2 =
887  "endstream\n"
888  "endobj\n";
889  AppendString(b2);
890  objsize += strlen(b2);
891  AppendPDFObjectDIY(objsize);
892 
893  char *pdf_object;
894  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
895  return false;
896  }
897  AppendData(pdf_object, objsize);
898  AppendPDFObjectDIY(objsize);
899  delete[] pdf_object;
900  return true;
901 }
void AppendData(const char *s, int len)
Definition: renderer.cpp:87
int push_back(T object)
void AppendString(const char *s)
Definition: renderer.cpp:83
const int kBasicBufSize
bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 468 of file pdfrenderer.cpp.

468  {
469  char buf[kBasicBufSize];
470  size_t n;
471 
472  n = snprintf(buf, sizeof(buf),
473  "%%PDF-1.5\n"
474  "%%%c%c%c%c\n",
475  0xDE, 0xAD, 0xBE, 0xEB);
476  if (n >= sizeof(buf)) return false;
477  AppendPDFObject(buf);
478 
479  // CATALOG
480  n = snprintf(buf, sizeof(buf),
481  "1 0 obj\n"
482  "<<\n"
483  " /Type /Catalog\n"
484  " /Pages %ld 0 R\n"
485  ">>\n"
486  "endobj\n",
487  2L);
488  if (n >= sizeof(buf)) return false;
489  AppendPDFObject(buf);
490 
491  // We are reserving object #2 for the /Pages
492  // object, which I am going to create and write
493  // at the end of the PDF file.
494  AppendPDFObject("");
495 
496  // TYPE0 FONT
497  n = snprintf(buf, sizeof(buf),
498  "3 0 obj\n"
499  "<<\n"
500  " /BaseFont /GlyphLessFont\n"
501  " /DescendantFonts [ %ld 0 R ]\n"
502  " /Encoding /Identity-H\n"
503  " /Subtype /Type0\n"
504  " /ToUnicode %ld 0 R\n"
505  " /Type /Font\n"
506  ">>\n"
507  "endobj\n",
508  4L, // CIDFontType2 font
509  6L // ToUnicode
510  );
511  if (n >= sizeof(buf)) return false;
512  AppendPDFObject(buf);
513 
514  // CIDFONTTYPE2
515  n = snprintf(buf, sizeof(buf),
516  "4 0 obj\n"
517  "<<\n"
518  " /BaseFont /GlyphLessFont\n"
519  " /CIDToGIDMap %ld 0 R\n"
520  " /CIDSystemInfo\n"
521  " <<\n"
522  " /Ordering (Identity)\n"
523  " /Registry (Adobe)\n"
524  " /Supplement 0\n"
525  " >>\n"
526  " /FontDescriptor %ld 0 R\n"
527  " /Subtype /CIDFontType2\n"
528  " /Type /Font\n"
529  " /DW %d\n"
530  ">>\n"
531  "endobj\n",
532  5L, // CIDToGIDMap
533  7L, // Font descriptor
534  1000 / kCharWidth);
535  if (n >= sizeof(buf)) return false;
536  AppendPDFObject(buf);
537 
538  // CIDTOGIDMAP
539  const int kCIDToGIDMapSize = 2 * (1 << 16);
540  unsigned char *cidtogidmap = new unsigned char[kCIDToGIDMapSize];
541  for (int i = 0; i < kCIDToGIDMapSize; i++) {
542  cidtogidmap[i] = (i % 2) ? 1 : 0;
543  }
544  size_t len;
545  unsigned char *comp =
546  zlibCompress(cidtogidmap, kCIDToGIDMapSize, &len);
547  delete[] cidtogidmap;
548  n = snprintf(buf, sizeof(buf),
549  "5 0 obj\n"
550  "<<\n"
551  " /Length %lu /Filter /FlateDecode\n"
552  ">>\n"
553  "stream\n", (unsigned long)len);
554  if (n >= sizeof(buf)) {
555  lept_free(comp);
556  return false;
557  }
558  AppendString(buf);
559  long objsize = strlen(buf);
560  AppendData(reinterpret_cast<char *>(comp), len);
561  objsize += len;
562  lept_free(comp);
563  const char *endstream_endobj =
564  "endstream\n"
565  "endobj\n";
566  AppendString(endstream_endobj);
567  objsize += strlen(endstream_endobj);
568  AppendPDFObjectDIY(objsize);
569 
570  const char *stream =
571  "/CIDInit /ProcSet findresource begin\n"
572  "12 dict begin\n"
573  "begincmap\n"
574  "/CIDSystemInfo\n"
575  "<<\n"
576  " /Registry (Adobe)\n"
577  " /Ordering (UCS)\n"
578  " /Supplement 0\n"
579  ">> def\n"
580  "/CMapName /Adobe-Identify-UCS def\n"
581  "/CMapType 2 def\n"
582  "1 begincodespacerange\n"
583  "<0000> <FFFF>\n"
584  "endcodespacerange\n"
585  "1 beginbfrange\n"
586  "<0000> <FFFF> <0000>\n"
587  "endbfrange\n"
588  "endcmap\n"
589  "CMapName currentdict /CMap defineresource pop\n"
590  "end\n"
591  "end\n";
592 
593  // TOUNICODE
594  n = snprintf(buf, sizeof(buf),
595  "6 0 obj\n"
596  "<< /Length %lu >>\n"
597  "stream\n"
598  "%s"
599  "endstream\n"
600  "endobj\n", (unsigned long) strlen(stream), stream);
601  if (n >= sizeof(buf)) return false;
602  AppendPDFObject(buf);
603 
604  // FONT DESCRIPTOR
605  const int kCharHeight = 2; // Effect: highlights are half height
606  n = snprintf(buf, sizeof(buf),
607  "7 0 obj\n"
608  "<<\n"
609  " /Ascent %d\n"
610  " /CapHeight %d\n"
611  " /Descent -1\n" // Spec says must be negative
612  " /Flags 5\n" // FixedPitch + Symbolic
613  " /FontBBox [ 0 0 %d %d ]\n"
614  " /FontFile2 %ld 0 R\n"
615  " /FontName /GlyphLessFont\n"
616  " /ItalicAngle 0\n"
617  " /StemV 80\n"
618  " /Type /FontDescriptor\n"
619  ">>\n"
620  "endobj\n",
621  1000 / kCharHeight,
622  1000 / kCharHeight,
623  1000 / kCharWidth,
624  1000 / kCharHeight,
625  8L // Font data
626  );
627  if (n >= sizeof(buf)) return false;
628  AppendPDFObject(buf);
629 
630  n = snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
631  if (n >= sizeof(buf)) return false;
632  FILE *fp = fopen(buf, "rb");
633  if (!fp) {
634  tprintf("Can not open file \"%s\"!\n", buf);
635  return false;
636  }
637  fseek(fp, 0, SEEK_END);
638  long int size = ftell(fp);
639  fseek(fp, 0, SEEK_SET);
640  char *buffer = new char[size];
641  if (fread(buffer, 1, size, fp) != size) {
642  fclose(fp);
643  delete[] buffer;
644  return false;
645  }
646  fclose(fp);
647  // FONTFILE2
648  n = snprintf(buf, sizeof(buf),
649  "8 0 obj\n"
650  "<<\n"
651  " /Length %ld\n"
652  " /Length1 %ld\n"
653  ">>\n"
654  "stream\n", size, size);
655  if (n >= sizeof(buf)) {
656  delete[] buffer;
657  return false;
658  }
659  AppendString(buf);
660  objsize = strlen(buf);
661  AppendData(buffer, size);
662  delete[] buffer;
663  objsize += size;
664  AppendString(endstream_endobj);
665  objsize += strlen(endstream_endobj);
666  AppendPDFObjectDIY(objsize);
667  return true;
668 }
void AppendData(const char *s, int len)
Definition: renderer.cpp:87
void AppendString(const char *s)
Definition: renderer.cpp:83
#define tprintf(...)
Definition: tprintf.h:31
const int kBasicBufSize
const int kCharWidth
bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 904 of file pdfrenderer.cpp.

904  {
905  size_t n;
906  char buf[kBasicBufSize];
907 
908  // We reserved the /Pages object number early, so that the /Page
909  // objects could refer to their parent. We finally have enough
910  // information to go fill it in. Using lower level calls to manipulate
911  // the offset record in two spots, because we are placing objects
912  // out of order in the file.
913 
914  // PAGES
915  const long int kPagesObjectNumber = 2;
916  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
917  n = snprintf(buf, sizeof(buf),
918  "%ld 0 obj\n"
919  "<<\n"
920  " /Type /Pages\n"
921  " /Kids [ ", kPagesObjectNumber);
922  if (n >= sizeof(buf)) return false;
923  AppendString(buf);
924  size_t pages_objsize = strlen(buf);
925  for (size_t i = 0; i < pages_.size(); i++) {
926  n = snprintf(buf, sizeof(buf),
927  "%ld 0 R ", pages_[i]);
928  if (n >= sizeof(buf)) return false;
929  AppendString(buf);
930  pages_objsize += strlen(buf);
931  }
932  n = snprintf(buf, sizeof(buf),
933  "]\n"
934  " /Count %d\n"
935  ">>\n"
936  "endobj\n", pages_.size());
937  if (n >= sizeof(buf)) return false;
938  AppendString(buf);
939  pages_objsize += strlen(buf);
940  offsets_.back() += pages_objsize; // manipulation #2
941 
942  // INFO
943  char* datestr = l_getFormattedDate();
944  n = snprintf(buf, sizeof(buf),
945  "%ld 0 obj\n"
946  "<<\n"
947  " /Producer (Tesseract %s)\n"
948  " /CreationDate (D:%s)\n"
949  " /Title (%s)"
950  ">>\n"
951  "endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title());
952  lept_free(datestr);
953  if (n >= sizeof(buf)) return false;
954  AppendPDFObject(buf);
955  n = snprintf(buf, sizeof(buf),
956  "xref\n"
957  "0 %ld\n"
958  "0000000000 65535 f \n", obj_);
959  if (n >= sizeof(buf)) return false;
960  AppendString(buf);
961  for (int i = 1; i < obj_; i++) {
962  n = snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
963  if (n >= sizeof(buf)) return false;
964  AppendString(buf);
965  }
966  n = snprintf(buf, sizeof(buf),
967  "trailer\n"
968  "<<\n"
969  " /Size %ld\n"
970  " /Root %ld 0 R\n"
971  " /Info %ld 0 R\n"
972  ">>\n"
973  "startxref\n"
974  "%ld\n"
975  "%%%%EOF\n",
976  obj_,
977  1L, // catalog
978  obj_ - 1, // info
979  offsets_.back());
980  if (n >= sizeof(buf)) return false;
981  AppendString(buf);
982  return true;
983 }
int size() const
Definition: genericvector.h:72
#define TESSERACT_VERSION_STR
Definition: baseapi.h:23
void AppendString(const char *s)
Definition: renderer.cpp:83
T & back() const
const int kBasicBufSize
const char * title() const
Definition: renderer.h:80

The documentation for this class was generated from the following files: