30 std::stringstream& alto_str) {
// Tail of a helper that appends the bounding box of the element at `level`
// to alto_str as XML attributes (HPOS, VPOS, WIDTH, HEIGHT, WC).
// NOTE(review): the beginning of the signature and the closing brace are not
// visible in this excerpt.
31 int left, top, right, bottom;
// The four ints are passed as out-parameters to BoundingBox.
// NOTE(review): BoundingBox is declared as returning bool, but the result is
// discarded here; if the call can fail, left/top/right/bottom would be read
// uninitialized below - confirm.
32 it->
BoundingBox(level, &left, &top, &right, &bottom);
// Extents are the differences between opposite edges of the box.
36 int height = bottom - top;
37 int width = right - left;
// Each attribute is written with a leading space so it can follow an
// already-open start tag.
// NOTE(review): hpos and vpos are defined on lines not visible here -
// presumably derived from left/top; confirm.
39 alto_str <<
" HPOS=\"" << hpos <<
"\"";
40 alto_str <<
" VPOS=\"" << vpos <<
"\"";
41 alto_str <<
" WIDTH=\"" << width <<
"\"";
42 alto_str <<
" HEIGHT=\"" << height <<
"\"";
// NOTE(review): wc is streamed directly after the literal "0.". If wc is an
// integer percentage, a one-digit value such as 5 would print as 0.5 and 100
// would print as 0.100. The definition of wc (and any condition guarding this
// statement) is not visible here - confirm its type and range.
46 alto_str <<
" WC=\"0." << wc <<
"\"";
// Adjacent string literals forming the start of the ALTO document: the XML
// declaration, the <alto> root element with its namespace/schema attributes,
// the measurement unit ("pixel"), and the opening of the OCR processing
// description up to the software name prefix "tesseract ".
// NOTE(review): the call these literals are passed to, and the literals on
// the skipped listing lines, are not visible in this excerpt.
57 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" 58 "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" " 59 "xmlns:xlink=\"http://www.w3.org/1999/xlink\" " 60 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" " 61 "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# " 62 "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n" 64 "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n" 65 "\t\t<sourceImageInformation>\n" 72 "\t\t</sourceImageInformation>\n" 73 "\t\t<OCRProcessing ID=\"OCR_0\">\n" 74 "\t\t\t<ocrProcessingStep>\n" 75 "\t\t\t\t<processingSoftware>\n" 76 "\t\t\t\t\t<softwareName>tesseract ");
// The literals below close the processing description. The trailing
// `if (text == nullptr) return false;` belongs to a later function whose
// header is not visible: it reports failure when no text was produced.
80 "\t\t\t\t</processingSoftware>\n" 81 "\t\t\t</ocrProcessingStep>\n" 82 "\t\t</OCRProcessing>\n" 94 if (text ==
nullptr)
return false;
// Convenience form: forwards to the (monitor, page_number) overload with a
// null monitor.
118 return GetAltoText(
nullptr, page_number);
// Guard: trips when there is no engine (tesseract_ is null), or when no page
// result exists yet and Recognize(monitor) returns a negative value.
// Recognize is only called when page_res_ is null (short-circuit evaluation).
// NOTE(review): the statement executed when the guard trips is not visible.
126 if (tesseract_ ==
nullptr || (page_res_ ==
nullptr && Recognize(monitor) < 0))
// Running counters used to number the line_/block_/string_ element IDs.
129 int lcnt = 0, bcnt = 0, wcnt = 0;
// When no input file name has been recorded, SetInputName is called with null.
131 if (input_file_ ==
nullptr) SetInputName(
nullptr);
// Win32 conversion of the input file name: ANSI code page (CP_ACP) to UTF-16,
// then UTF-16 to UTF-8. Each API is first called with a null buffer and zero
// size, then again with an allocated buffer.
// NOTE(review): the variable receiving the first call's result (presumably
// str16_len) is declared on a line not visible here, and this section
// presumably sits inside a Windows-only preprocessor guard - confirm.
136 MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
nullptr, 0);
// NOTE(review): uni16_str and utf8_str are allocated with new[]; no matching
// delete[] is visible in this excerpt - confirm both buffers are released.
// NOTE(review): the lengths returned by the sizing calls are used without a
// check; confirm the conversion cannot report failure (zero) here.
137 wchar_t* uni16_str =
new WCHAR[str16_len];
138 str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
139 uni16_str, str16_len);
140 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len,
nullptr,
141 0,
nullptr,
nullptr);
142 char* utf8_str =
new char[utf8_len];
143 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
// The UTF-8 form replaces the stored input file name.
145 *input_file_ = utf8_str;
// The page markup is accumulated in a string stream and copied into a
// heap-allocated character array at the end.
150 std::stringstream alto_str;
// <Page> start tag (width, image number, ID derived from page_number),
// followed by a <PrintSpace> anchored at 0,0 with size
// rect_width_ x rect_height_.
// NOTE(review): the value written for the Page HEIGHT attribute is on a line
// not visible in this excerpt.
152 <<
"\t\t<Page WIDTH=\"" << rect_width_ <<
"\" HEIGHT=\"" 154 <<
"\" PHYSICAL_IMG_NR=\"" << page_number <<
"\"" 155 <<
" ID=\"page_" << page_number <<
"\">\n" 156 <<
"\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\"" 157 <<
" WIDTH=\"" << rect_width_ <<
"\"" 158 <<
" HEIGHT=\"" << rect_height_ <<
"\">\n";
// TextBlock start tag: counter-based ID, then the block's box attributes
// appended by AddBoxToAlto.
// NOTE(review): the iteration and the conditions that decide when each start
// tag is written are on lines not visible here.
168 alto_str <<
"\t\t\t\t<TextBlock ID=\"block_" << bcnt <<
"\"";
169 AddBoxToAlto(res_it,
RIL_BLOCK, alto_str);
// TextLine start tag with a counter-based ID.
174 alto_str <<
"\t\t\t\t\t<TextLine ID=\"line_" << lcnt <<
"\"";
// String (word) start tag: ID, word box attributes, then the opening of the
// CONTENT attribute that the escaped text below is written into.
179 alto_str <<
"\t\t\t\t\t\t<String ID=\"string_" << wcnt <<
"\"";
180 AddBoxToAlto(res_it,
RIL_WORD, alto_str);
181 alto_str <<
" CONTENT=\"";
186 int left, top, right, bottom;
// A non-null, non-empty grapheme is passed through HOcrEscape before being
// written. NOTE(review): the expression producing `grapheme` is not visible.
190 const std::unique_ptr<const char[]> grapheme(
192 if (grapheme && grapheme[0] != 0) {
193 alto_str <<
HOcrEscape(grapheme.get()).c_str();
// The last word of a line closes the TextLine element.
202 if (last_word_in_line) {
203 alto_str <<
"\n\t\t\t\t\t</TextLine>\n";
// An <SP> element is written with width = left - hpos.
// NOTE(review): whether this is the else-branch of the check above, and where
// left, hpos and vpos receive their values, is not visible here - confirm.
209 int width = left - hpos;
210 alto_str <<
"<SP WIDTH=\"" << width <<
"\" VPOS=\"" << vpos
211 <<
"\" HPOS=\"" << hpos <<
"\"/>\n";
// The last word of a block closes the TextBlock element.
214 if (last_word_in_block) {
215 alto_str <<
"\t\t\t\t</TextBlock>\n";
// Close the PrintSpace element and take the accumulated text.
220 alto_str <<
"\t\t\t</PrintSpace>\n" 222 const std::string& text = alto_str.str();
// Copy into a new[] buffer sized length + 1 so the terminator fits.
// NOTE(review): the return statement is not visible; presumably result is
// returned and the caller releases it with delete[] - confirm.
224 char* result =
new char[text.length() + 1];
225 strcpy(result, text.c_str());
STRING HOcrEscape(const char *text)
bool IsAtBeginningOf(PageIteratorLevel level) const override
virtual char * GetUTF8Text(PageIteratorLevel level) const
TessAltoRenderer(const char *outputbase)
bool EndDocumentHandler() override
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
void AppendString(const char *s)
bool BeginDocumentHandler() override
static const char * Version()
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
bool AddImageHandler(TessBaseAPI *api) override
bool Empty(PageIteratorLevel level) const
const char * title() const
char * GetAltoText(ETEXT_DESC *monitor, int page_number)
bool Next(PageIteratorLevel level) override
float Confidence(PageIteratorLevel level) const