tesseract  4.1.0
altorenderer.cpp
Go to the documentation of this file.
1 // File: altorenderer.cpp
2 // Description: ALTO rendering interface
3 // Author: Jake Sebright
4 
5 // (C) Copyright 2018
6 // Licensed under the Apache License, Version 2.0 (the "License");
7 // you may not use this file except in compliance with the License.
8 // You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include <memory>
17 #include <sstream> // for std::stringstream
18 #include "baseapi.h"
19 #ifdef _WIN32
20 # include "host.h" // windows.h for MultiByteToWideChar, ...
21 #endif
22 #include "renderer.h"
23 
24 namespace tesseract {
25 
29 static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level,
30  std::stringstream& alto_str) {
31  int left, top, right, bottom;
32  it->BoundingBox(level, &left, &top, &right, &bottom);
33 
34  int hpos = left;
35  int vpos = top;
36  int height = bottom - top;
37  int width = right - left;
38 
39  alto_str << " HPOS=\"" << hpos << "\"";
40  alto_str << " VPOS=\"" << vpos << "\"";
41  alto_str << " WIDTH=\"" << width << "\"";
42  alto_str << " HEIGHT=\"" << height << "\"";
43 
44  if (level == RIL_WORD) {
45  int wc = it->Confidence(RIL_WORD);
46  alto_str << " WC=\"0." << wc << "\"";
47  } else {
48  alto_str << ">";
49  }
50 }
51 
57  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
58  "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
59  "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
60  "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
61  "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
62  "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
63  "\t<Description>\n"
64  "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
65  "\t\t<sourceImageInformation>\n"
66  "\t\t\t<fileName>");
67 
69 
71  "\t\t\t</fileName>\n"
72  "\t\t</sourceImageInformation>\n"
73  "\t\t<OCRProcessing ID=\"OCR_0\">\n"
74  "\t\t\t<ocrProcessingStep>\n"
75  "\t\t\t\t<processingSoftware>\n"
76  "\t\t\t\t\t<softwareName>tesseract ");
79  "</softwareName>\n"
80  "\t\t\t\t</processingSoftware>\n"
81  "\t\t\t</ocrProcessingStep>\n"
82  "\t\t</OCRProcessing>\n"
83  "\t</Description>\n"
84  "\t<Layout>\n");
85 
86  return true;
87 }
88 
93  const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
94  if (text == nullptr) return false;
95 
96  AppendString(text.get());
97 
98  return true;
99 }
100 
105  AppendString("\t</Layout>\n</alto>\n");
106 
107  return true;
108 }
109 
110 TessAltoRenderer::TessAltoRenderer(const char* outputbase)
111  : TessResultRenderer(outputbase, "xml") {}
112 
117 char* TessBaseAPI::GetAltoText(int page_number) {
118  return GetAltoText(nullptr, page_number);
119 }
120 
125 char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
126  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
127  return nullptr;
128 
129  int lcnt = 0, bcnt = 0, wcnt = 0;
130 
131  if (input_file_ == nullptr) SetInputName(nullptr);
132 
133 #ifdef _WIN32
134  // convert input name from ANSI encoding to utf-8
135  int str16_len =
136  MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
137  wchar_t* uni16_str = new WCHAR[str16_len];
138  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
139  uni16_str, str16_len);
140  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
141  0, nullptr, nullptr);
142  char* utf8_str = new char[utf8_len];
143  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
144  nullptr, nullptr);
145  *input_file_ = utf8_str;
146  delete[] uni16_str;
147  delete[] utf8_str;
148 #endif
149 
150  std::stringstream alto_str;
151  alto_str
152  << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\""
153  << rect_height_
154  << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
155  << " ID=\"page_" << page_number << "\">\n"
156  << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
157  << " WIDTH=\"" << rect_width_ << "\""
158  << " HEIGHT=\"" << rect_height_ << "\">\n";
159 
160  ResultIterator* res_it = GetIterator();
161  while (!res_it->Empty(RIL_BLOCK)) {
162  if (res_it->Empty(RIL_WORD)) {
163  res_it->Next(RIL_WORD);
164  continue;
165  }
166 
167  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
168  alto_str << "\t\t\t\t<TextBlock ID=\"block_" << bcnt << "\"";
169  AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
170  alto_str << "\n";
171  }
172 
173  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
174  alto_str << "\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
175  AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
176  alto_str << "\n";
177  }
178 
179  alto_str << "\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
180  AddBoxToAlto(res_it, RIL_WORD, alto_str);
181  alto_str << " CONTENT=\"";
182 
183  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
184  bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
185 
186  int left, top, right, bottom;
187  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
188 
189  do {
190  const std::unique_ptr<const char[]> grapheme(
191  res_it->GetUTF8Text(RIL_SYMBOL));
192  if (grapheme && grapheme[0] != 0) {
193  alto_str << HOcrEscape(grapheme.get()).c_str();
194  }
195  res_it->Next(RIL_SYMBOL);
196  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
197 
198  alto_str << "\"/>";
199 
200  wcnt++;
201 
202  if (last_word_in_line) {
203  alto_str << "\n\t\t\t\t\t</TextLine>\n";
204  lcnt++;
205  } else {
206  int hpos = right;
207  int vpos = top;
208  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
209  int width = left - hpos;
210  alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos
211  << "\" HPOS=\"" << hpos << "\"/>\n";
212  }
213 
214  if (last_word_in_block) {
215  alto_str << "\t\t\t\t</TextBlock>\n";
216  bcnt++;
217  }
218  }
219 
220  alto_str << "\t\t\t</PrintSpace>\n"
221  << "\t\t</Page>\n";
222  const std::string& text = alto_str.str();
223 
224  char* result = new char[text.length() + 1];
225  strcpy(result, text.c_str());
226  delete res_it;
227  return result;
228 }
229 
230 } // namespace tesseract
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2268
bool IsAtBeginningOf(PageIteratorLevel level) const override
virtual char * GetUTF8Text(PageIteratorLevel level) const
TessAltoRenderer(const char *outputbase)
bool EndDocumentHandler() override
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
void AppendString(const char *s)
Definition: renderer.cpp:102
bool BeginDocumentHandler() override
static const char * Version()
Definition: baseapi.cpp:227
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
bool AddImageHandler(TessBaseAPI *api) override
bool Empty(PageIteratorLevel level) const
const char * title() const
Definition: renderer.h:87
char * GetAltoText(ETEXT_DESC *monitor, int page_number)
bool Next(PageIteratorLevel level) override
float Confidence(PageIteratorLevel level) const