|
tesseract 3.04.01
|
#include <renderer.h>
Public Member Functions
TessPDFRenderer (const char *outputbase, const char *datadir)
Protected Member Functions
virtual bool BeginDocumentHandler ()
virtual bool AddImageHandler (TessBaseAPI *api)
virtual bool EndDocumentHandler ()
Renders tesseract output into searchable PDF
Definition at line 168 of file renderer.h.
tesseract::TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir)
Definition at line 164 of file pdfrenderer.cpp.
// Passes the output base name and the "pdf" extension to the base renderer,
// then resets the object counter (obj_), stores the data directory (later
// used to locate pdf.ttf), and seeds the offset table with a single 0 entry.
: TessResultRenderer(outputbase, "pdf") { obj_ = 0; datadir_ = datadir; offsets_.push_back(0); }
bool tesseract::TessPDFRenderer::AddImageHandler(TessBaseAPI *api) [protected, virtual]
Implements tesseract::TessResultRenderer.
Definition at line 823 of file pdfrenderer.cpp.
{
  // Emits the three PDF objects that make up one page, in this order:
  //   1. the /Page dictionary,
  //   2. the Flate-compressed text content stream,
  //   3. the page image object produced by imageToPDFObj().
  // Returns false if the input image or resolution is unusable, if a header
  // does not fit into the scratch buffer, if compression fails, or if the
  // image cannot be converted. The order of the Append* calls matters because
  // object numbers are taken from obj_ as the objects are written.
  size_t n;
  char buf[kBasicBufSize];
  Pix *pix = api->GetInputImage();
  char *filename = (char *)api->GetInputName();
  int ppi = api->GetSourceYResolution();
  if (!pix || ppi <= 0)
    return false;
  // Page size in units of 1/72 inch, derived from pixel size and resolution.
  double width = pixGetWidth(pix) * 72.0 / ppi;
  double height = pixGetHeight(pix) * 72.0 / ppi;

  // PAGE
  n = snprintf(buf, sizeof(buf),
               "%ld 0 obj\n"
               "<<\n"
               " /Type /Page\n"
               " /Parent %ld 0 R\n"
               " /MediaBox [0 0 %.2f %.2f]\n"
               " /Contents %ld 0 R\n"
               " /Resources\n"
               " <<\n"
               " /XObject << /Im1 %ld 0 R >>\n"
               " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
               " /Font << /f-0-0 %ld 0 R >>\n"
               " >>\n"
               ">>\n"
               "endobj\n",
               obj_,
               2L,  // Pages object
               width,
               height,
               obj_ + 1,  // Contents object
               obj_ + 2,  // Image object
               3L);       // Type0 Font
  if (n >= sizeof(buf)) return false;
  // Remember this page's object number for the /Kids array written at the
  // end of the document.
  pages_.push_back(obj_);
  AppendPDFObject(buf);

  // CONTENTS
  char* pdftext = GetPDFTextObjects(api, width, height);
  long pdftext_len = strlen(pdftext);
  unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext);
  size_t len = 0;
  unsigned char *comp_pdftext =
      zlibCompress(pdftext_casted, pdftext_len, &len);
  // zlibCompress() reports failure by returning NULL; without this check a
  // NULL pointer and a meaningless length would be written to the output.
  if (!comp_pdftext) {
    delete[] pdftext;
    return false;
  }
  long comp_pdftext_len = len;
  n = snprintf(buf, sizeof(buf),
               "%ld 0 obj\n"
               "<<\n"
               " /Length %ld /Filter /FlateDecode\n"
               ">>\n"
               "stream\n", obj_, comp_pdftext_len);
  if (n >= sizeof(buf)) {
    delete[] pdftext;
    lept_free(comp_pdftext);
    return false;
  }
  // The stream is written in pieces (header, binary payload, trailer), so
  // the total object size is accumulated by hand and passed to
  // AppendPDFObjectDIY() once the whole object has been emitted.
  AppendString(buf);
  long objsize = strlen(buf);
  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
  objsize += comp_pdftext_len;
  lept_free(comp_pdftext);
  delete[] pdftext;
  const char *b2 =
      "endstream\n"
      "endobj\n";
  AppendString(b2);
  objsize += strlen(b2);
  AppendPDFObjectDIY(objsize);

  // IMAGE: imageToPDFObj() fills pdf_object and overwrites objsize with the
  // size of the generated object.
  char *pdf_object;
  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
    return false;
  }
  AppendData(pdf_object, objsize);
  AppendPDFObjectDIY(objsize);
  delete[] pdf_object;
  return true;
}
bool tesseract::TessPDFRenderer::BeginDocumentHandler() [protected, virtual]
Reimplemented from tesseract::TessResultRenderer.
Definition at line 468 of file pdfrenderer.cpp.
{
  // Writes the fixed preamble of the PDF: the file header, the catalog
  // (object 1), a placeholder for the /Pages object (object 2), and the
  // objects describing the invisible "GlyphLessFont" (objects 3-8), the last
  // of which embeds <datadir_>/pdf.ttf.
  // Returns false if any header overflows the scratch buffer, if compression
  // fails, or if the font file cannot be opened or read completely.
  char buf[kBasicBufSize];
  size_t n;

  // File header; the second line is a comment made of four bytes >= 0x80.
  n = snprintf(buf, sizeof(buf),
               "%%PDF-1.5\n"
               "%%%c%c%c%c\n",
               0xDE, 0xAD, 0xBE, 0xEB);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // CATALOG
  n = snprintf(buf, sizeof(buf),
               "1 0 obj\n"
               "<<\n"
               " /Type /Catalog\n"
               " /Pages %ld 0 R\n"
               ">>\n"
               "endobj\n",
               2L);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // We are reserving object #2 for the /Pages
  // object, which I am going to create and write
  // at the end of the PDF file.
  AppendPDFObject("");

  // TYPE0 FONT
  n = snprintf(buf, sizeof(buf),
               "3 0 obj\n"
               "<<\n"
               " /BaseFont /GlyphLessFont\n"
               " /DescendantFonts [ %ld 0 R ]\n"
               " /Encoding /Identity-H\n"
               " /Subtype /Type0\n"
               " /ToUnicode %ld 0 R\n"
               " /Type /Font\n"
               ">>\n"
               "endobj\n",
               4L,  // CIDFontType2 font
               6L   // ToUnicode
               );
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // CIDFONTTYPE2
  n = snprintf(buf, sizeof(buf),
               "4 0 obj\n"
               "<<\n"
               " /BaseFont /GlyphLessFont\n"
               " /CIDToGIDMap %ld 0 R\n"
               " /CIDSystemInfo\n"
               " <<\n"
               " /Ordering (Identity)\n"
               " /Registry (Adobe)\n"
               " /Supplement 0\n"
               " >>\n"
               " /FontDescriptor %ld 0 R\n"
               " /Subtype /CIDFontType2\n"
               " /Type /Font\n"
               " /DW %d\n"
               ">>\n"
               "endobj\n",
               5L,  // CIDToGIDMap
               7L,  // Font descriptor
               1000 / kCharWidth);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // CIDTOGIDMAP
  // Two bytes for each of the 65536 possible CIDs; the bytes alternate
  // 0, 1, so every two-byte entry is 00 01.
  const int kCIDToGIDMapSize = 2 * (1 << 16);
  unsigned char *cidtogidmap = new unsigned char[kCIDToGIDMapSize];
  for (int i = 0; i < kCIDToGIDMapSize; i++) {
    cidtogidmap[i] = (i % 2) ? 1 : 0;
  }
  size_t len = 0;
  unsigned char *comp =
      zlibCompress(cidtogidmap, kCIDToGIDMapSize, &len);
  delete[] cidtogidmap;
  // zlibCompress() reports failure by returning NULL; bail out instead of
  // writing a NULL buffer with a meaningless length.
  if (!comp) return false;
  n = snprintf(buf, sizeof(buf),
               "5 0 obj\n"
               "<<\n"
               " /Length %lu /Filter /FlateDecode\n"
               ">>\n"
               "stream\n", (unsigned long)len);
  if (n >= sizeof(buf)) {
    lept_free(comp);
    return false;
  }
  // Stream objects are written piecewise, so their size is summed by hand
  // and handed to AppendPDFObjectDIY() afterwards.
  AppendString(buf);
  long objsize = strlen(buf);
  AppendData(reinterpret_cast<char *>(comp), len);
  objsize += len;
  lept_free(comp);
  const char *endstream_endobj =
      "endstream\n"
      "endobj\n";
  AppendString(endstream_endobj);
  objsize += strlen(endstream_endobj);
  AppendPDFObjectDIY(objsize);

  const char *stream =
      "/CIDInit /ProcSet findresource begin\n"
      "12 dict begin\n"
      "begincmap\n"
      "/CIDSystemInfo\n"
      "<<\n"
      " /Registry (Adobe)\n"
      " /Ordering (UCS)\n"
      " /Supplement 0\n"
      ">> def\n"
      "/CMapName /Adobe-Identify-UCS def\n"
      "/CMapType 2 def\n"
      "1 begincodespacerange\n"
      "<0000> <FFFF>\n"
      "endcodespacerange\n"
      "1 beginbfrange\n"
      "<0000> <FFFF> <0000>\n"
      "endbfrange\n"
      "endcmap\n"
      "CMapName currentdict /CMap defineresource pop\n"
      "end\n"
      "end\n";

  // TOUNICODE
  n = snprintf(buf, sizeof(buf),
               "6 0 obj\n"
               "<< /Length %lu >>\n"
               "stream\n"
               "%s"
               "endstream\n"
               "endobj\n", (unsigned long) strlen(stream), stream);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // FONT DESCRIPTOR
  const int kCharHeight = 2;  // Effect: highlights are half height
  n = snprintf(buf, sizeof(buf),
               "7 0 obj\n"
               "<<\n"
               " /Ascent %d\n"
               " /CapHeight %d\n"
               " /Descent -1\n"  // Spec says must be negative
               " /Flags 5\n"     // FixedPitch + Symbolic
               " /FontBBox [ 0 0 %d %d ]\n"
               " /FontFile2 %ld 0 R\n"
               " /FontName /GlyphLessFont\n"
               " /ItalicAngle 0\n"
               " /StemV 80\n"
               " /Type /FontDescriptor\n"
               ">>\n"
               "endobj\n",
               1000 / kCharHeight,
               1000 / kCharHeight,
               1000 / kCharWidth,
               1000 / kCharHeight,
               8L  // Font data
               );
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // Load the whole font file into memory so it can be embedded verbatim.
  n = snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
  if (n >= sizeof(buf)) return false;
  FILE *fp = fopen(buf, "rb");
  if (!fp) {
    tprintf("Can not open file \"%s\"!\n", buf);
    return false;
  }
  // fseek() and ftell() can fail (ftell() then returns -1); a negative size
  // must never reach the allocation or the read below.
  if (fseek(fp, 0, SEEK_END) != 0) {
    fclose(fp);
    return false;
  }
  long int size = ftell(fp);
  if (size < 0 || fseek(fp, 0, SEEK_SET) != 0) {
    fclose(fp);
    return false;
  }
  char *buffer = new char[size];
  if (fread(buffer, 1, size, fp) != static_cast<size_t>(size)) {
    fclose(fp);
    delete[] buffer;
    return false;
  }
  fclose(fp);

  // FONTFILE2
  n = snprintf(buf, sizeof(buf),
               "8 0 obj\n"
               "<<\n"
               " /Length %ld\n"
               " /Length1 %ld\n"
               ">>\n"
               "stream\n", size, size);
  if (n >= sizeof(buf)) {
    delete[] buffer;
    return false;
  }
  AppendString(buf);
  objsize = strlen(buf);
  AppendData(buffer, size);
  delete[] buffer;
  objsize += size;
  AppendString(endstream_endobj);
  objsize += strlen(endstream_endobj);
  AppendPDFObjectDIY(objsize);
  return true;
}
bool tesseract::TessPDFRenderer::EndDocumentHandler() [protected, virtual]
Reimplemented from tesseract::TessResultRenderer.
Definition at line 904 of file pdfrenderer.cpp.
{
  // Finishes the PDF: writes the /Pages object that was reserved as object
  // 2, the /Info dictionary, the cross-reference table and the trailer.
  // Returns false if any piece does not fit into the scratch buffer or if
  // the creation date string cannot be obtained.
  size_t n;
  char buf[kBasicBufSize];

  // We reserved the /Pages object number early, so that the /Page
  // objects could refer to their parent. We finally have enough
  // information to go fill it in. Using lower level calls to manipulate
  // the offset record in two spots, because we are placing objects
  // out of order in the file.

  // PAGES
  const long int kPagesObjectNumber = 2;
  offsets_[kPagesObjectNumber] = offsets_.back();  // manipulation #1
  n = snprintf(buf, sizeof(buf),
               "%ld 0 obj\n"
               "<<\n"
               " /Type /Pages\n"
               " /Kids [ ", kPagesObjectNumber);
  if (n >= sizeof(buf)) return false;
  AppendString(buf);
  size_t pages_objsize = strlen(buf);
  // One indirect reference per page recorded in pages_.
  for (size_t i = 0; i < pages_.size(); i++) {
    n = snprintf(buf, sizeof(buf),
                 "%ld 0 R ", pages_[i]);
    if (n >= sizeof(buf)) return false;
    AppendString(buf);
    pages_objsize += strlen(buf);
  }
  // The explicit cast makes the argument match "%d" regardless of the
  // integer type returned by pages_.size().
  n = snprintf(buf, sizeof(buf),
               "]\n"
               " /Count %d\n"
               ">>\n"
               "endobj\n", static_cast<int>(pages_.size()));
  if (n >= sizeof(buf)) return false;
  AppendString(buf);
  pages_objsize += strlen(buf);
  offsets_.back() += pages_objsize;  // manipulation #2

  // INFO
  char* datestr = l_getFormattedDate();
  // l_getFormattedDate() may return NULL on error; passing NULL to "%s"
  // would be undefined behavior.
  if (!datestr) return false;
  n = snprintf(buf, sizeof(buf),
               "%ld 0 obj\n"
               "<<\n"
               " /Producer (Tesseract %s)\n"
               " /CreationDate (D:%s)\n"
               " /Title (%s)"
               ">>\n"
               "endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title());
  lept_free(datestr);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);

  // XREF: the fixed free entry for object 0, then one entry per object
  // with its recorded byte offset.
  n = snprintf(buf, sizeof(buf),
               "xref\n"
               "0 %ld\n"
               "0000000000 65535 f \n", obj_);
  if (n >= sizeof(buf)) return false;
  AppendString(buf);
  for (int i = 1; i < obj_; i++) {
    n = snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
    if (n >= sizeof(buf)) return false;
    AppendString(buf);
  }

  // TRAILER: offsets_.back() is written as the startxref value.
  n = snprintf(buf, sizeof(buf),
               "trailer\n"
               "<<\n"
               " /Size %ld\n"
               " /Root %ld 0 R\n"
               " /Info %ld 0 R\n"
               ">>\n"
               "startxref\n"
               "%ld\n"
               "%%%%EOF\n",
               obj_,
               1L,        // catalog
               obj_ - 1,  // info
               offsets_.back());
  if (n >= sizeof(buf)) return false;
  AppendString(buf);
  return true;
}