tesseract 3.04.01

tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

List of all members.

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir)

Protected Member Functions

virtual bool BeginDocumentHandler ()
virtual bool AddImageHandler (TessBaseAPI *api)
virtual bool EndDocumentHandler ()

Detailed Description

Renders tesseract output into searchable PDF

Definition at line 168 of file renderer.h.


Constructor & Destructor Documentation

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir 
)

Definition at line 164 of file pdfrenderer.cpp.

    : TessResultRenderer(outputbase, "pdf") {
  obj_  = 0;
  datadir_ = datadir;
  offsets_.push_back(0);
}

Member Function Documentation

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI api) [protected, virtual]

Implements tesseract::TessResultRenderer.

Definition at line 823 of file pdfrenderer.cpp.

                                                      {
  size_t n;
  char buf[kBasicBufSize];
  Pix *pix = api->GetInputImage();
  char *filename = (char *)api->GetInputName();
  int ppi = api->GetSourceYResolution();
  if (!pix || ppi <= 0)
    return false;
  double width = pixGetWidth(pix) * 72.0 / ppi;
  double height = pixGetHeight(pix) * 72.0 / ppi;

  // PAGE
  n = snprintf(buf, sizeof(buf),
               "%ld 0 obj\n"
               "<<\n"
               "  /Type /Page\n"
               "  /Parent %ld 0 R\n"
               "  /MediaBox [0 0 %.2f %.2f]\n"
               "  /Contents %ld 0 R\n"
               "  /Resources\n"
               "  <<\n"
               "    /XObject << /Im1 %ld 0 R >>\n"
               "    /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
               "    /Font << /f-0-0 %ld 0 R >>\n"
               "  >>\n"
               ">>\n"
               "endobj\n",
               obj_,
               2L,            // Pages object
               width,
               height,
               obj_ + 1,      // Contents object
               obj_ + 2,      // Image object
               3L);           // Type0 Font
  if (n >= sizeof(buf)) return false;
  pages_.push_back(obj_);
  AppendPDFObject(buf);

  // CONTENTS
  char* pdftext = GetPDFTextObjects(api, width, height);
  long pdftext_len = strlen(pdftext);
  unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext);
  size_t len;
  unsigned char *comp_pdftext =
      zlibCompress(pdftext_casted, pdftext_len, &len);
  long comp_pdftext_len = len;
  n = snprintf(buf, sizeof(buf),
               "%ld 0 obj\n"
               "<<\n"
               "  /Length %ld /Filter /FlateDecode\n"
               ">>\n"
               "stream\n", obj_, comp_pdftext_len);
  if (n >= sizeof(buf)) {
    delete[] pdftext;
    lept_free(comp_pdftext);
    return false;
  }
  AppendString(buf);
  long objsize = strlen(buf);
  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
  objsize += comp_pdftext_len;
  lept_free(comp_pdftext);
  delete[] pdftext;
  const char *b2 =
      "endstream\n"
      "endobj\n";
  AppendString(b2);
  objsize += strlen(b2);
  AppendPDFObjectDIY(objsize);

  char *pdf_object;
  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
    return false;
  }
  AppendData(pdf_object, objsize);
  AppendPDFObjectDIY(objsize);
  delete[] pdf_object;
  return true;
}
bool tesseract::TessPDFRenderer::BeginDocumentHandler ( ) [protected, virtual]

Reimplemented from tesseract::TessResultRenderer.

Definition at line 468 of file pdfrenderer.cpp.

                                           {
  char buf[kBasicBufSize];
  size_t n;

  n = snprintf(buf, sizeof(buf),
               "%%PDF-1.5\n"
               "%%%c%c%c%c\n",
               0xDE, 0xAD, 0xBE, 0xEB);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // CATALOG
  n = snprintf(buf, sizeof(buf),
               "1 0 obj\n"
               "<<\n"
               "  /Type /Catalog\n"
               "  /Pages %ld 0 R\n"
               ">>\n"
               "endobj\n",
               2L);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // We are reserving object #2 for the /Pages
  // object, which I am going to create and write
  // at the end of the PDF file.
  AppendPDFObject("");

  // TYPE0 FONT
  n = snprintf(buf, sizeof(buf),
               "3 0 obj\n"
               "<<\n"
               "  /BaseFont /GlyphLessFont\n"
               "  /DescendantFonts [ %ld 0 R ]\n"
               "  /Encoding /Identity-H\n"
               "  /Subtype /Type0\n"
               "  /ToUnicode %ld 0 R\n"
               "  /Type /Font\n"
               ">>\n"
               "endobj\n",
               4L,         // CIDFontType2 font
               6L          // ToUnicode
               );
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // CIDFONTTYPE2
  n = snprintf(buf, sizeof(buf),
               "4 0 obj\n"
               "<<\n"
               "  /BaseFont /GlyphLessFont\n"
               "  /CIDToGIDMap %ld 0 R\n"
               "  /CIDSystemInfo\n"
               "  <<\n"
               "     /Ordering (Identity)\n"
               "     /Registry (Adobe)\n"
               "     /Supplement 0\n"
               "  >>\n"
               "  /FontDescriptor %ld 0 R\n"
               "  /Subtype /CIDFontType2\n"
               "  /Type /Font\n"
               "  /DW %d\n"
               ">>\n"
               "endobj\n",
               5L,         // CIDToGIDMap
               7L,         // Font descriptor
               1000 / kCharWidth);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // CIDTOGIDMAP
  const int kCIDToGIDMapSize = 2 * (1 << 16);
  unsigned char *cidtogidmap = new unsigned char[kCIDToGIDMapSize];
  for (int i = 0; i < kCIDToGIDMapSize; i++) {
    cidtogidmap[i] = (i % 2) ? 1 : 0;
  }
  size_t len;
  unsigned char *comp =
      zlibCompress(cidtogidmap, kCIDToGIDMapSize, &len);
  delete[] cidtogidmap;
  n = snprintf(buf, sizeof(buf),
               "5 0 obj\n"
               "<<\n"
               "  /Length %lu /Filter /FlateDecode\n"
               ">>\n"
               "stream\n", (unsigned long)len);
  if (n >= sizeof(buf)) {
    lept_free(comp);
    return false;
  }
  AppendString(buf);
  long objsize = strlen(buf);
  AppendData(reinterpret_cast<char *>(comp), len);
  objsize += len;
  lept_free(comp);
  const char *endstream_endobj =
      "endstream\n"
      "endobj\n";
  AppendString(endstream_endobj);
  objsize += strlen(endstream_endobj);
  AppendPDFObjectDIY(objsize);

  const char *stream =
      "/CIDInit /ProcSet findresource begin\n"
      "12 dict begin\n"
      "begincmap\n"
      "/CIDSystemInfo\n"
      "<<\n"
      "  /Registry (Adobe)\n"
      "  /Ordering (UCS)\n"
      "  /Supplement 0\n"
      ">> def\n"
      "/CMapName /Adobe-Identify-UCS def\n"
      "/CMapType 2 def\n"
      "1 begincodespacerange\n"
      "<0000> <FFFF>\n"
      "endcodespacerange\n"
      "1 beginbfrange\n"
      "<0000> <FFFF> <0000>\n"
      "endbfrange\n"
      "endcmap\n"
      "CMapName currentdict /CMap defineresource pop\n"
      "end\n"
      "end\n";

  // TOUNICODE
  n = snprintf(buf, sizeof(buf),
               "6 0 obj\n"
               "<< /Length %lu >>\n"
               "stream\n"
               "%s"
               "endstream\n"
               "endobj\n", (unsigned long) strlen(stream), stream);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // FONT DESCRIPTOR
  const int kCharHeight = 2;  // Effect: highlights are half height
  n = snprintf(buf, sizeof(buf),
               "7 0 obj\n"
               "<<\n"
               "  /Ascent %d\n"
               "  /CapHeight %d\n"
               "  /Descent -1\n"       // Spec says must be negative
               "  /Flags 5\n"          // FixedPitch + Symbolic
               "  /FontBBox  [ 0 0 %d %d ]\n"
               "  /FontFile2 %ld 0 R\n"
               "  /FontName /GlyphLessFont\n"
               "  /ItalicAngle 0\n"
               "  /StemV 80\n"
               "  /Type /FontDescriptor\n"
               ">>\n"
               "endobj\n",
               1000 / kCharHeight,
               1000 / kCharHeight,
               1000 / kCharWidth,
               1000 / kCharHeight,
               8L      // Font data
               );
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  n = snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
  if (n >= sizeof(buf)) return false;
  FILE *fp = fopen(buf, "rb");
  if (!fp) {
    tprintf("Can not open file \"%s\"!\n", buf);
    return false;
  }
  fseek(fp, 0, SEEK_END);
  long int size = ftell(fp);
  fseek(fp, 0, SEEK_SET);
  char *buffer = new char[size];
  if (fread(buffer, 1, size, fp) != size) {
    fclose(fp);
    delete[] buffer;
    return false;
  }
  fclose(fp);
  // FONTFILE2
  n = snprintf(buf, sizeof(buf),
               "8 0 obj\n"
               "<<\n"
               "  /Length %ld\n"
               "  /Length1 %ld\n"
               ">>\n"
               "stream\n", size, size);
  if (n >= sizeof(buf)) {
    delete[] buffer;
    return false;
  }
  AppendString(buf);
  objsize  = strlen(buf);
  AppendData(buffer, size);
  delete[] buffer;
  objsize += size;
  AppendString(endstream_endobj);
  objsize += strlen(endstream_endobj);
  AppendPDFObjectDIY(objsize);
  return true;
}
bool tesseract::TessPDFRenderer::EndDocumentHandler ( ) [protected, virtual]

Reimplemented from tesseract::TessResultRenderer.

Definition at line 904 of file pdfrenderer.cpp.

                                         {
  size_t n;
  char buf[kBasicBufSize];

  // We reserved the /Pages object number early, so that the /Page
  // objects could refer to their parent. We finally have enough
  // information to go fill it in. Using lower level calls to manipulate
  // the offset record in two spots, because we are placing objects
  // out of order in the file.

  // PAGES
  const long int kPagesObjectNumber = 2;
  offsets_[kPagesObjectNumber] = offsets_.back();  // manipulation #1
  n = snprintf(buf, sizeof(buf),
               "%ld 0 obj\n"
               "<<\n"
               "  /Type /Pages\n"
               "  /Kids [ ", kPagesObjectNumber);
  if (n >= sizeof(buf)) return false;
  AppendString(buf);
  size_t pages_objsize  = strlen(buf);
  for (size_t i = 0; i < pages_.size(); i++) {
    n = snprintf(buf, sizeof(buf),
                 "%ld 0 R ", pages_[i]);
    if (n >= sizeof(buf)) return false;
    AppendString(buf);
    pages_objsize += strlen(buf);
  }
  n = snprintf(buf, sizeof(buf),
               "]\n"
               "  /Count %d\n"
               ">>\n"
               "endobj\n", pages_.size());
  if (n >= sizeof(buf)) return false;
  AppendString(buf);
  pages_objsize += strlen(buf);
  offsets_.back() += pages_objsize;    // manipulation #2

  // INFO
  char* datestr = l_getFormattedDate();
  n = snprintf(buf, sizeof(buf),
               "%ld 0 obj\n"
               "<<\n"
               "  /Producer (Tesseract %s)\n"
               "  /CreationDate (D:%s)\n"
               "  /Title (%s)"
               ">>\n"
               "endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title());
  lept_free(datestr);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);
  n = snprintf(buf, sizeof(buf),
               "xref\n"
               "0 %ld\n"
               "0000000000 65535 f \n", obj_);
  if (n >= sizeof(buf)) return false;
  AppendString(buf);
  for (int i = 1; i < obj_; i++) {
    n = snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
    if (n >= sizeof(buf)) return false;
    AppendString(buf);
  }
  n = snprintf(buf, sizeof(buf),
               "trailer\n"
               "<<\n"
               "  /Size %ld\n"
               "  /Root %ld 0 R\n"
               "  /Info %ld 0 R\n"
               ">>\n"
               "startxref\n"
               "%ld\n"
               "%%%%EOF\n",
               obj_,
               1L,               // catalog
               obj_ - 1,         // info
               offsets_.back());
  if (n >= sizeof(buf)) return false;
  AppendString(buf);
  return true;
}

The documentation for this class was generated from the following files:
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines