tesseract  4.1.0
Advanced API

Functions

void tesseract::TessBaseAPI::SetImage (const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
 
void tesseract::TessBaseAPI::SetImage (Pix *pix)
 
void tesseract::TessBaseAPI::SetSourceResolution (int ppi)
 
void tesseract::TessBaseAPI::SetRectangle (int left, int top, int width, int height)
 
void tesseract::TessBaseAPI::SetThresholder (ImageThresholder *thresholder)
 
Pix * tesseract::TessBaseAPI::GetThresholdedImage ()
 
Boxa * tesseract::TessBaseAPI::GetRegions (Pixa **pixa)
 
Boxa * tesseract::TessBaseAPI::GetTextlines (bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
 
Boxa * tesseract::TessBaseAPI::GetTextlines (Pixa **pixa, int **blockids)
 
Boxa * tesseract::TessBaseAPI::GetStrips (Pixa **pixa, int **blockids)
 
Boxa * tesseract::TessBaseAPI::GetWords (Pixa **pixa)
 
Boxa * tesseract::TessBaseAPI::GetConnectedComponents (Pixa **cc)
 
Boxa * tesseract::TessBaseAPI::GetComponentImages (PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
 
Boxa * tesseract::TessBaseAPI::GetComponentImages (const PageIteratorLevel level, const bool text_only, Pixa **pixa, int **blockids)
 
int tesseract::TessBaseAPI::GetThresholdedImageScaleFactor () const
 
PageIterator * tesseract::TessBaseAPI::AnalyseLayout ()
 
PageIterator * tesseract::TessBaseAPI::AnalyseLayout (bool merge_similar_words)
 
int tesseract::TessBaseAPI::Recognize (ETEXT_DESC *monitor)
 
int tesseract::TessBaseAPI::RecognizeForChopTest (ETEXT_DESC *monitor)
 
bool tesseract::TessBaseAPI::ProcessPages (const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
 
bool tesseract::TessBaseAPI::ProcessPagesInternal (const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
 
bool tesseract::TessBaseAPI::ProcessPage (Pix *pix, int page_index, const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
 
ResultIterator * tesseract::TessBaseAPI::GetIterator ()
 
MutableIterator * tesseract::TessBaseAPI::GetMutableIterator ()
 
char * tesseract::TessBaseAPI::GetUTF8Text ()
 
char * tesseract::TessBaseAPI::GetHOCRText (ETEXT_DESC *monitor, int page_number)
 
char * tesseract::TessBaseAPI::GetHOCRText (int page_number)
 
char * tesseract::TessBaseAPI::GetAltoText (ETEXT_DESC *monitor, int page_number)
 
char * tesseract::TessBaseAPI::GetAltoText (int page_number)
 
char * tesseract::TessBaseAPI::GetTSVText (int page_number)
 
char * tesseract::TessBaseAPI::GetLSTMBoxText (int page_number)
 
char * tesseract::TessBaseAPI::GetBoxText (int page_number)
 
char * tesseract::TessBaseAPI::GetWordStrBoxText (int page_number)
 
char * tesseract::TessBaseAPI::GetUNLVText ()
 
bool tesseract::TessBaseAPI::DetectOrientationScript (int *orient_deg, float *orient_conf, const char **script_name, float *script_conf)
 
char * tesseract::TessBaseAPI::GetOsdText (int page_number)
 
int tesseract::TessBaseAPI::MeanTextConf ()
 
int * tesseract::TessBaseAPI::AllWordConfidences ()
 
bool tesseract::TessBaseAPI::AdaptToWordStr (PageSegMode mode, const char *wordstr)
 
void tesseract::TessBaseAPI::Clear ()
 
void tesseract::TessBaseAPI::End ()
 
static void tesseract::TessBaseAPI::ClearPersistentCache ()
 
int tesseract::TessBaseAPI::IsValidWord (const char *word)
 
bool tesseract::TessBaseAPI::IsValidCharacter (const char *utf8_character)
 
bool tesseract::TessBaseAPI::GetTextDirection (int *out_offset, float *out_slope)
 
void tesseract::TessBaseAPI::SetDictFunc (DictFunc f)
 
void tesseract::TessBaseAPI::SetProbabilityInContextFunc (ProbabilityInContextFunc f)
 
bool tesseract::TessBaseAPI::DetectOS (OSResults *)
 
void tesseract::TessBaseAPI::GetBlockTextOrientations (int **block_orientation, bool **vertical_writing)
 
void tesseract::TessBaseAPI::SetFillLatticeFunc (FillLatticeFunc f)
 
BLOCK_LIST * tesseract::TessBaseAPI::FindLinesCreateBlockList ()
 
static void tesseract::TessBaseAPI::DeleteBlockList (BLOCK_LIST *block_list)
 
static ROW * tesseract::TessBaseAPI::MakeTessOCRRow (float baseline, float xheight, float descender, float ascender)
 
static TBLOB * tesseract::TessBaseAPI::MakeTBLOB (Pix *pix)
 
static void tesseract::TessBaseAPI::NormalizeTBLOB (TBLOB *tblob, ROW *row, bool numeric_mode)
 
void tesseract::TessBaseAPI::GetFeaturesForBlob (TBLOB *blob, INT_FEATURE_STRUCT *int_features, int *num_features, int *feature_outline_index)
 
static ROW * tesseract::TessBaseAPI::FindRowForBox (BLOCK_LIST *blocks, int left, int top, int right, int bottom)
 
void tesseract::TessBaseAPI::RunAdaptiveClassifier (TBLOB *blob, int num_max_matches, int *unichar_ids, float *ratings, int *num_matches_returned)
 
const char * tesseract::TessBaseAPI::GetUnichar (int unichar_id)
 
const Dawg * tesseract::TessBaseAPI::GetDawg (int i) const
 
int tesseract::TessBaseAPI::NumDawgs () const
 
Tesseract * tesseract::TessBaseAPI::tesseract () const
 
OcrEngineMode tesseract::TessBaseAPI::oem () const
 
void tesseract::TessBaseAPI::InitTruthCallback (TruthCallback *cb)
 
void tesseract::TessBaseAPI::set_min_orientation_margin (double margin)
 

Detailed Description

The following methods break TesseractRect into pieces, so you can get hold of the thresholded image, get the text in different formats, get bounding boxes, confidences etc.

Function Documentation

bool tesseract::TessBaseAPI::AdaptToWordStr ( PageSegMode  mode,
const char *  wordstr 
)

Applies the given word to the adaptive classifier if possible. The word must be SPACE-DELIMITED UTF-8 (l i k e t h i s) so that the boundaries of the graphemes can be identified. Assumes that SetImage/SetRectangle have been used to set the image to the given word. The mode arg should be PSM_SINGLE_WORD or PSM_CIRCLE_WORD, as that will be used to control layout analysis. The currently set PageSegMode is preserved. Returns false if adaptation was not possible for some reason.

Definition at line 1757 of file baseapi.cpp.

1757  {
1758  int debug = 0;
1759  GetIntVariable("applybox_debug", &debug);
1760  bool success = true;
1761  PageSegMode current_psm = GetPageSegMode();
1762  SetPageSegMode(mode);
1763  SetVariable("classify_enable_learning", "0");
1764  const std::unique_ptr<const char[]> text(GetUTF8Text());
1765  if (debug) {
1766  tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
1767  }
1768  if (text != nullptr) {
1769  PAGE_RES_IT it(page_res_);
1770  WERD_RES* word_res = it.word();
1771  if (word_res != nullptr) {
1772  word_res->word->set_text(wordstr);
1773  // Check to see if text matches wordstr.
1774  int w = 0;
1775  int t;
1776  for (t = 0; text[t] != '\0'; ++t) {
1777  if (text[t] == '\n' || text[t] == ' ')
1778  continue;
1779  while (wordstr[w] == ' ') ++w;
1780  if (text[t] != wordstr[w])
1781  break;
1782  ++w;
1783  }
1784  if (text[t] != '\0' || wordstr[w] != '\0') {
1785  // No match.
1786  delete page_res_;
1787  GenericVector<TBOX> boxes;
1791  PAGE_RES_IT pr_it(page_res_);
1792  if (pr_it.word() == nullptr)
1793  success = false;
1794  else
1795  word_res = pr_it.word();
1796  } else {
1797  word_res->BestChoiceToCorrectText();
1798  }
1799  if (success) {
1800  tesseract_->EnableLearning = true;
1801  tesseract_->LearnWord(nullptr, word_res);
1802  }
1803  } else {
1804  success = false;
1805  }
1806  } else {
1807  success = false;
1808  }
1809  SetPageSegMode(current_psm);
1810  return success;
1811 }
void set_text(const char *new_text)
Definition: werd.h:115
bool GetIntVariable(const char *name, int *value) const
Definition: baseapi.cpp:292
PageSegMode GetPageSegMode() const
Definition: baseapi.cpp:516
void SetPageSegMode(PageSegMode mode)
Definition: baseapi.cpp:509
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:510
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:207
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:712
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void BestChoiceToCorrectText()
Definition: pageres.cpp:927
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:250
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
bool SetVariable(const char *name, const char *value)
Definition: baseapi.cpp:280
WERD * word
Definition: pageres.h:188
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:894
int * tesseract::TessBaseAPI::AllWordConfidences ( )

Returns all word confidences (between 0 and 100) in an array, terminated by -1. The calling function must delete [] the array after use. The number of confidences should correspond to the number of space-delimited words in GetUTF8Text.

Returns an array of all word confidences, terminated by -1.

Definition at line 1722 of file baseapi.cpp.

1722  {
1723  if (tesseract_ == nullptr ||
1724  (!recognition_done_ && Recognize(nullptr) < 0))
1725  return nullptr;
1726  int n_word = 0;
1727  PAGE_RES_IT res_it(page_res_);
1728  for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward())
1729  n_word++;
1730 
1731  int* conf = new int[n_word+1];
1732  n_word = 0;
1733  for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1734  WERD_RES *word = res_it.word();
1735  WERD_CHOICE* choice = word->best_choice;
1736  int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1737  // This is the eq for converting Tesseract confidence to 1..100
1738  if (w_conf < 0) w_conf = 0;
1739  if (w_conf > 100) w_conf = 100;
1740  conf[n_word++] = w_conf;
1741  }
1742  conf[n_word] = -1;
1743  return conf;
1744 }
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:901
float certainty() const
Definition: ratngs.h:330
WERD_CHOICE * best_choice
Definition: pageres.h:234
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:824
WERD * word
Definition: pageres.h:188
PageIterator * tesseract::TessBaseAPI::AnalyseLayout ( )

Runs page layout analysis in the mode set by SetPageSegMode. May optionally be called prior to Recognize to get access to just the page layout results. Returns an iterator to the results. If merge_similar_words is true (see the AnalyseLayout(bool) overload; this overload passes false), words are combined where suitable for use with a line recognizer. Use if you want to use AnalyseLayout to find the textlines, and then want to process textline fragments with an external line recognizer. Returns nullptr on error or an empty page. The returned iterator must be deleted after use. WARNING! This class points to data held within the TessBaseAPI class, and therefore can only be used while the TessBaseAPI class still exists and has not been subjected to a call of Init, SetImage, Recognize, Clear, End, DetectOS, or anything else that changes the internal PAGE_RES.

Definition at line 804 of file baseapi.cpp.

804 { return AnalyseLayout(false); }
PageIterator * AnalyseLayout()
Definition: baseapi.cpp:804
PageIterator * tesseract::TessBaseAPI::AnalyseLayout ( bool  merge_similar_words)

Definition at line 806 of file baseapi.cpp.

806  {
807  if (FindLines() == 0) {
808  if (block_list_->empty())
809  return nullptr; // The page was empty.
810  page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr);
811  DetectParagraphs(false);
812  return new PageIterator(
816  }
817  return nullptr;
818 }
TESS_LOCAL void DetectParagraphs(bool after_text_recognition)
Definition: baseapi.cpp:2236
int GetScaledYResolution() const
Definition: thresholder.h:92
TESS_LOCAL int FindLines()
Definition: baseapi.cpp:2026
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:894
void tesseract::TessBaseAPI::Clear ( )

Free up recognition results and any stored image data, without actually freeing any recognition data that would be time-consuming to reload. Afterwards, you must call SetImage or TesseractRect before doing any Recognize or Get* operation.

Definition at line 1820 of file baseapi.cpp.

1820  {
1821  if (thresholder_ != nullptr)
1822  thresholder_->Clear();
1823  ClearResults();
1824  if (tesseract_ != nullptr) SetInputImage(nullptr);
1825 }
virtual void Clear()
Destroy the Pix if there is one, freeing memory.
Definition: thresholder.cpp:48
void SetInputImage(Pix *pix)
Definition: baseapi.cpp:948
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
void tesseract::TessBaseAPI::ClearPersistentCache ( )
static

Clear any library-level memory caches. There are a variety of expensive-to-load constant data structures (mostly language dictionaries) that are cached globally – surviving the Init() and End() of individual TessBaseAPI's. This function allows the clearing of these caches.

Definition at line 1868 of file baseapi.cpp.

1868  {
1870 }
void DeleteUnusedDawgs()
Definition: dawg_cache.h:43
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:193
void tesseract::TessBaseAPI::DeleteBlockList ( BLOCK_LIST *  block_list)
static

Delete a block list. This keeps the BLOCK_LIST pointer opaque and avoids having to include the other headers.

Definition at line 2304 of file baseapi.cpp.

2304  {
2305  delete block_list;
2306 }
bool tesseract::TessBaseAPI::DetectOrientationScript ( int *  orient_deg,
float *  orient_conf,
const char **  script_name,
float *  script_conf 
)

Detect the orientation of the input image and apparent script (alphabet). orient_deg is the detected clockwise rotation of the input image in degrees (0, 90, 180, 270). orient_conf is the confidence (15.0 is reasonably confident). script_name is an ASCII string, the name of the script, e.g. "Latin". script_conf is the confidence level in the script. Returns true on success and writes a value to each non-null parameter as an output; a parameter passed as a null pointer is skipped.

Definition at line 1644 of file baseapi.cpp.

1646  {
1647  OSResults osr;
1648 
1649  bool osd = DetectOS(&osr);
1650  if (!osd) {
1651  return false;
1652  }
1653 
1654  int orient_id = osr.best_result.orientation_id;
1655  int script_id = osr.get_best_script(orient_id);
1656  if (orient_conf) *orient_conf = osr.best_result.oconfidence;
1657  if (orient_deg) *orient_deg = orient_id * 90; // convert quadrant to degrees
1658 
1659  if (script_name) {
1660  const char* script = osr.unicharset->get_script_from_script_id(script_id);
1661 
1662  *script_name = script;
1663  }
1664 
1665  if (script_conf) *script_conf = osr.best_result.sconfidence;
1666 
1667  return true;
1668 }
float sconfidence
Definition: osdetect.h:45
float oconfidence
Definition: osdetect.h:46
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:854
OSBestResult best_result
Definition: osdetect.h:81
UNICHARSET * unicharset
Definition: osdetect.h:80
TESS_API int get_best_script(int orientation_id) const
Definition: osdetect.cpp:112
int orientation_id
Definition: osdetect.h:43
bool DetectOS(OSResults *)
Definition: baseapi.cpp:2158
bool tesseract::TessBaseAPI::DetectOS ( OSResults *  osr)

Estimates the Orientation And Script of the image.

Returns
true if the image was processed successfully.

Estimates the Orientation And Script of the image. Returns true if the image was processed successfully.

Definition at line 2158 of file baseapi.cpp.

2158  {
2159  if (tesseract_ == nullptr)
2160  return false;
2161  ClearResults();
2162  if (tesseract_->pix_binary() == nullptr &&
2164  return false;
2165  }
2166 
2167  if (input_file_ == nullptr)
2168  input_file_ = new STRING(kInputFile);
2170 }
STRING * input_file_
Name used by training code.
Definition: baseapi.h:896
Definition: strngs.h:45
virtual TESS_LOCAL bool Threshold(Pix **pix)
Definition: baseapi.cpp:1972
int orientation_and_script_detection(STRING &filename, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:190
Pix * pix_binary() const
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
void tesseract::TessBaseAPI::End ( )

Close down tesseract and free up all memory. End() is equivalent to destructing and reconstructing your TessBaseAPI. Once End() has been used, none of the other API functions may be used other than Init and anything declared above it in the class definition.

Definition at line 1833 of file baseapi.cpp.

1833  {
1834  Clear();
1835  delete thresholder_;
1836  thresholder_ = nullptr;
1837  delete page_res_;
1838  page_res_ = nullptr;
1839  delete block_list_;
1840  block_list_ = nullptr;
1841  if (paragraph_models_ != nullptr) {
1843  delete paragraph_models_;
1844  paragraph_models_ = nullptr;
1845  }
1846  if (osd_tesseract_ == tesseract_) osd_tesseract_ = nullptr;
1847  delete tesseract_;
1848  tesseract_ = nullptr;
1849  delete osd_tesseract_;
1850  osd_tesseract_ = nullptr;
1851  delete equ_detect_;
1852  equ_detect_ = nullptr;
1853  delete input_file_;
1854  input_file_ = nullptr;
1855  delete output_file_;
1856  output_file_ = nullptr;
1857  delete datapath_;
1858  datapath_ = nullptr;
1859  delete language_;
1860  language_ = nullptr;
1861 }
Tesseract * osd_tesseract_
For orientation & script detection.
Definition: baseapi.h:889
void delete_data_pointers()
STRING * input_file_
Name used by training code.
Definition: baseapi.h:896
STRING * datapath_
Current location of tessdata.
Definition: baseapi.h:898
STRING * language_
Last initialized language.
Definition: baseapi.h:899
GenericVector< ParagraphModel * > * paragraph_models_
Definition: baseapi.h:893
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
STRING * output_file_
Name used by debug code.
Definition: baseapi.h:897
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
EquationDetect * equ_detect_
The equation detector.
Definition: baseapi.h:890
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:894
BLOCK_LIST * tesseract::TessBaseAPI::FindLinesCreateBlockList ( )

Find lines from the image making the BLOCK_LIST.

Definition at line 2292 of file baseapi.cpp.

2292  {
2293  ASSERT_HOST(FindLines() == 0);
2294  BLOCK_LIST* result = block_list_;
2295  block_list_ = nullptr;
2296  return result;
2297 }
TESS_LOCAL int FindLines()
Definition: baseapi.cpp:2026
#define ASSERT_HOST(x)
Definition: errcode.h:88
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:894
ROW * tesseract::TessBaseAPI::FindRowForBox ( BLOCK_LIST *  blocks,
int  left,
int  top,
int  right,
int  bottom 
)
static

This method returns the row to which a box of specified dimensions would belong. If no good match is found, it returns nullptr.

Definition at line 2593 of file baseapi.cpp.

2594  {
2595  TBOX box(left, bottom, right, top);
2596  BLOCK_IT b_it(blocks);
2597  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
2598  BLOCK* block = b_it.data();
2599  if (!box.major_overlap(block->pdblk.bounding_box()))
2600  continue;
2601  ROW_IT r_it(block->row_list());
2602  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
2603  ROW* row = r_it.data();
2604  if (!box.major_overlap(row->bounding_box()))
2605  continue;
2606  WERD_IT w_it(row->word_list());
2607  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
2608  WERD* word = w_it.data();
2609  if (box.major_overlap(word->bounding_box()))
2610  return row;
2611  }
2612  }
2613  }
2614  return nullptr;
2615 }
Definition: werd.h:56
Definition: rect.h:34
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:191
TBOX bounding_box() const
Definition: werd.cpp:148
WERD_LIST * word_list()
Definition: ocrrow.h:55
TBOX bounding_box() const
Definition: ocrrow.h:88
Definition: ocrrow.h:36
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:117
Definition: ocrblock.h:29
char * tesseract::TessBaseAPI::GetAltoText ( ETEXT_DESC *  monitor,
int  page_number 
)

Make an XML-formatted string with Alto markup from the internal data structures.

Make an XML-formatted string with ALTO markup from the internal data structures.

Definition at line 125 of file altorenderer.cpp.

125  {
126  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
127  return nullptr;
128 
129  int lcnt = 0, bcnt = 0, wcnt = 0;
130 
131  if (input_file_ == nullptr) SetInputName(nullptr);
132 
133 #ifdef _WIN32
134  // convert input name from ANSI encoding to utf-8
135  int str16_len =
136  MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
137  wchar_t* uni16_str = new WCHAR[str16_len];
138  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
139  uni16_str, str16_len);
140  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
141  0, nullptr, nullptr);
142  char* utf8_str = new char[utf8_len];
143  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
144  nullptr, nullptr);
145  *input_file_ = utf8_str;
146  delete[] uni16_str;
147  delete[] utf8_str;
148 #endif
149 
150  std::stringstream alto_str;
151  alto_str
152  << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\""
153  << rect_height_
154  << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
155  << " ID=\"page_" << page_number << "\">\n"
156  << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
157  << " WIDTH=\"" << rect_width_ << "\""
158  << " HEIGHT=\"" << rect_height_ << "\">\n";
159 
160  ResultIterator* res_it = GetIterator();
161  while (!res_it->Empty(RIL_BLOCK)) {
162  if (res_it->Empty(RIL_WORD)) {
163  res_it->Next(RIL_WORD);
164  continue;
165  }
166 
167  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
168  alto_str << "\t\t\t\t<TextBlock ID=\"block_" << bcnt << "\"";
169  AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
170  alto_str << "\n";
171  }
172 
173  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
174  alto_str << "\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
175  AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
176  alto_str << "\n";
177  }
178 
179  alto_str << "\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
180  AddBoxToAlto(res_it, RIL_WORD, alto_str);
181  alto_str << " CONTENT=\"";
182 
183  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
184  bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
185 
186  int left, top, right, bottom;
187  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
188 
189  do {
190  const std::unique_ptr<const char[]> grapheme(
191  res_it->GetUTF8Text(RIL_SYMBOL));
192  if (grapheme && grapheme[0] != 0) {
193  alto_str << HOcrEscape(grapheme.get()).c_str();
194  }
195  res_it->Next(RIL_SYMBOL);
196  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
197 
198  alto_str << "\"/>";
199 
200  wcnt++;
201 
202  if (last_word_in_line) {
203  alto_str << "\n\t\t\t\t\t</TextLine>\n";
204  lcnt++;
205  } else {
206  int hpos = right;
207  int vpos = top;
208  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
209  int width = left - hpos;
210  alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos
211  << "\" HPOS=\"" << hpos << "\"/>\n";
212  }
213 
214  if (last_word_in_block) {
215  alto_str << "\t\t\t\t</TextBlock>\n";
216  bcnt++;
217  }
218  }
219 
220  alto_str << "\t\t\t</PrintSpace>\n"
221  << "\t\t</Page>\n";
222  const std::string& text = alto_str.str();
223 
224  char* result = new char[text.length() + 1];
225  strcpy(result, text.c_str());
226  delete res_it;
227  return result;
228 }
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2268
STRING * input_file_
Name used by training code.
Definition: baseapi.h:896
void SetInputName(const char *name)
Definition: baseapi.cpp:265
ResultIterator * GetIterator()
Definition: baseapi.cpp:1282
const char * string() const
Definition: strngs.cpp:194
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:824
char * tesseract::TessBaseAPI::GetAltoText ( int  page_number)

Make an XML-formatted string with Alto markup from the internal data structures.

Make an XML-formatted string with ALTO markup from the internal data structures.

Definition at line 117 of file altorenderer.cpp.

117  {
118  return GetAltoText(nullptr, page_number);
119 }
char * GetAltoText(ETEXT_DESC *monitor, int page_number)
void tesseract::TessBaseAPI::GetBlockTextOrientations ( int **  block_orientation,
bool **  vertical_writing 
)

Return text orientation of each block as determined by an earlier run of layout analysis.

Return text orientation of each block as determined in an earlier page layout analysis operation. Orientation is returned as the number of ccw 90-degree rotations (in [0..3]) required to make the text in the block upright (readable). Note that this may not necessarily be the block orientation preferred for recognition (such as the case of vertical CJK text).

Also returns whether the text in the block is believed to have vertical writing direction (when in an upright page orientation).

The returned array is of length equal to the number of text blocks, which may be less than the total number of blocks. The ordering is intended to be consistent with GetTextlines().

Definition at line 2191 of file baseapi.cpp.

2192  {
2193  delete[] *block_orientation;
2194  *block_orientation = nullptr;
2195  delete[] *vertical_writing;
2196  *vertical_writing = nullptr;
2197  BLOCK_IT block_it(block_list_);
2198 
2199  block_it.move_to_first();
2200  int num_blocks = 0;
2201  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2202  if (!block_it.data()->pdblk.poly_block()->IsText()) {
2203  continue;
2204  }
2205  ++num_blocks;
2206  }
2207  if (!num_blocks) {
2208  tprintf("WARNING: Found no blocks\n");
2209  return;
2210  }
2211  *block_orientation = new int[num_blocks];
2212  *vertical_writing = new bool[num_blocks];
2213  block_it.move_to_first();
2214  int i = 0;
2215  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
2216  block_it.forward()) {
2217  if (!block_it.data()->pdblk.poly_block()->IsText()) {
2218  continue;
2219  }
2220  FCOORD re_rotation = block_it.data()->re_rotation();
2221  float re_theta = re_rotation.angle();
2222  FCOORD classify_rotation = block_it.data()->classify_rotation();
2223  float classify_theta = classify_rotation.angle();
2224  double rot_theta = - (re_theta - classify_theta) * 2.0 / M_PI;
2225  if (rot_theta < 0) rot_theta += 4;
2226  int num_rotations = static_cast<int>(rot_theta + 0.5);
2227  (*block_orientation)[i] = num_rotations;
2228  // The classify_rotation is non-zero only if the text has vertical
2229  // writing direction.
2230  (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
2231  ++i;
2232  }
2233 }
float angle() const
find angle
Definition: points.h:247
Definition: points.h:188
float y() const
Definition: points.h:210
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:894
char * tesseract::TessBaseAPI::GetBoxText ( int  page_number)

The recognized text is returned as a char* which is coded in the same format as a box file used in training. Constructs coordinates in the original image - not just the rectangle. page_number is a 0-based page index that will appear in the box file. Returned string must be freed with the delete [] operator.

The recognized text is returned as a char* which is coded as a UTF8 box file. page_number is a 0-based page index that will appear in the box file. Returned string must be freed with the delete [] operator.

Definition at line 1478 of file baseapi.cpp.

1478  {
1479  if (tesseract_ == nullptr ||
1480  (!recognition_done_ && Recognize(nullptr) < 0))
1481  return nullptr;
1482  int blob_count;
1483  int utf8_length = TextLength(&blob_count);
1484  int total_length = blob_count * kBytesPerBoxFileLine + utf8_length +
1486  char* result = new char[total_length];
1487  result[0] = '\0';
1488  int output_length = 0;
1489  LTRResultIterator* it = GetLTRIterator();
1490  do {
1491  int left, top, right, bottom;
1492  if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1493  const std::unique_ptr</*non-const*/ char[]> text(
1494  it->GetUTF8Text(RIL_SYMBOL));
1495  // Tesseract uses space for recognition failure. Fix to a reject
1496  // character, kTesseractReject so we don't create illegal box files.
1497  for (int i = 0; text[i] != '\0'; ++i) {
1498  if (text[i] == ' ')
1499  text[i] = kTesseractReject;
1500  }
1501  snprintf(result + output_length, total_length - output_length,
1502  "%s %d %d %d %d %d\n", text.get(), left, image_height_ - bottom,
1503  right, image_height_ - top, page_number);
1504  output_length += strlen(result + output_length);
1505  // Just in case...
1506  if (output_length + kMaxBytesPerLine > total_length)
1507  break;
1508  }
1509  } while (it->Next(RIL_SYMBOL));
1510  delete it;
1511  return result;
1512 }
TESS_LOCAL int TextLength(int *blob_count)
Definition: baseapi.cpp:2127
const int kMaxBytesPerLine
Definition: baseapi.cpp:1469
TESS_LOCAL LTRResultIterator * GetLTRIterator()
Definition: baseapi.cpp:1265
const char kTesseractReject
Definition: baseapi.cpp:100
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:901
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:824
const int kBytesPerBoxFileLine
Definition: baseapi.cpp:1460
Boxa * tesseract::TessBaseAPI::GetComponentImages ( PageIteratorLevel  level,
bool  text_only,
bool  raw_image,
int  raw_padding,
Pixa **  pixa,
int **  blockids,
int **  paraids 
)

Get the components at the given level (block, textline, word etc.) as a leptonica-style Boxa, Pixa pair, in reading order. Can be called before or after Recognize. If blockids is not nullptr, the block-id of each component is also returned as an array of one element per component. delete [] after use. If paraids is not nullptr, the paragraph-id of each component within its block is also returned as an array of one element per component. delete [] after use. If raw_image is true, then portions of the original image are extracted instead of the thresholded image and padded with raw_padding. If text_only is true, then only text components are returned.

Get the components at the given level (block, textline, word etc.) as a leptonica-style Boxa, Pixa pair, in reading order. Can be called before or after Recognize. If blockids is not nullptr, the block-id of each component is also returned as an array of one element per component. delete [] after use. If text_only is true, then only text components are returned.

Definition at line 700 of file baseapi.cpp.

704  {
705  PageIterator* page_it = GetIterator();
706  if (page_it == nullptr)
707  page_it = AnalyseLayout();
708  if (page_it == nullptr)
709  return nullptr; // Failed.
710 
711  // Count the components to get a size for the arrays.
712  int component_count = 0;
713  int left, top, right, bottom;
714 
715  TessResultCallback<bool>* get_bbox = nullptr;
716  if (raw_image) {
717  // Get bounding box in original raw image with padding.
719  level, raw_padding,
720  &left, &top, &right, &bottom);
721  } else {
722  // Get bounding box from binarized imaged. Note that this could be
723  // differently scaled from the original image.
724  get_bbox = NewPermanentTessCallback(page_it,
726  level, &left, &top, &right, &bottom);
727  }
728  do {
729  if (get_bbox->Run() &&
730  (!text_only || PTIsTextType(page_it->BlockType())))
731  ++component_count;
732  } while (page_it->Next(level));
733 
734  Boxa* boxa = boxaCreate(component_count);
735  if (pixa != nullptr)
736  *pixa = pixaCreate(component_count);
737  if (blockids != nullptr)
738  *blockids = new int[component_count];
739  if (paraids != nullptr)
740  *paraids = new int[component_count];
741 
742  int blockid = 0;
743  int paraid = 0;
744  int component_index = 0;
745  page_it->Begin();
746  do {
747  if (get_bbox->Run() &&
748  (!text_only || PTIsTextType(page_it->BlockType()))) {
749  Box* lbox = boxCreate(left, top, right - left, bottom - top);
750  boxaAddBox(boxa, lbox, L_INSERT);
751  if (pixa != nullptr) {
752  Pix* pix = nullptr;
753  if (raw_image) {
754  pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left,
755  &top);
756  } else {
757  pix = page_it->GetBinaryImage(level);
758  }
759  pixaAddPix(*pixa, pix, L_INSERT);
760  pixaAddBox(*pixa, lbox, L_CLONE);
761  }
762  if (paraids != nullptr) {
763  (*paraids)[component_index] = paraid;
764  if (page_it->IsAtFinalElement(RIL_PARA, level))
765  ++paraid;
766  }
767  if (blockids != nullptr) {
768  (*blockids)[component_index] = blockid;
769  if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
770  ++blockid;
771  paraid = 0;
772  }
773  }
774  ++component_index;
775  }
776  } while (page_it->Next(level));
777  delete page_it;
778  delete get_bbox;
779  return boxa;
780 }
bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:82
ResultIterator * GetIterator()
Definition: baseapi.cpp:1282
virtual R Run()=0
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
PageIterator * AnalyseLayout()
Definition: baseapi.cpp:804
Boxa* tesseract::TessBaseAPI::GetComponentImages ( const PageIteratorLevel  level,
const bool  text_only,
Pixa **  pixa,
int **  blockids 
)
inline

Definition at line 450 of file baseapi.h.

452  {
453  return GetComponentImages(level, text_only, false, 0, pixa, blockids, nullptr);
454  }
Boxa * GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:700
Boxa * tesseract::TessBaseAPI::GetConnectedComponents ( Pixa **  pixa)

Gets the individual connected (text) components (created after pages segmentation step, but before recognition) as a leptonica-style Boxa, Pixa pair, in reading order. Can be called before or after Recognize. Note: the caller is responsible for calling boxaDestroy() on the returned Boxa array and pixaDestroy() on the returned Pixa array.

Gets the individual connected (text) components (created after pages segmentation step, but before recognition) as a leptonica-style Boxa, Pixa pair, in reading order. Can be called before or after Recognize.

Definition at line 688 of file baseapi.cpp.

688  {
689  return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr);
690 }
Boxa * GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:700
const Dawg * tesseract::TessBaseAPI::GetDawg ( int  i) const

Return the pointer to the i-th dawg loaded into tesseract_ object.

Definition at line 2257 of file baseapi.cpp.

2257  {
2258  if (tesseract_ == nullptr || i >= NumDawgs()) return nullptr;
2259  return tesseract_->getDict().GetDawg(i);
2260 }
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:423
Dict & getDict() override
int NumDawgs() const
Definition: baseapi.cpp:2263
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
void tesseract::TessBaseAPI::GetFeaturesForBlob ( TBLOB blob,
INT_FEATURE_STRUCT int_features,
int *  num_features,
int *  feature_outline_index 
)

This method returns the features associated with the input blob.

This method returns the features associated with the input blob.

Definition at line 2565 of file baseapi.cpp.

2568  {
2569  GenericVector<int> outline_counts;
2572  INT_FX_RESULT_STRUCT fx_info;
2573  tesseract_->ExtractFeatures(*blob, false, &bl_features,
2574  &cn_features, &fx_info, &outline_counts);
2575  if (cn_features.empty() || cn_features.size() > MAX_NUM_INT_FEATURES) {
2576  *num_features = 0;
2577  return; // Feature extraction failed.
2578  }
2579  *num_features = cn_features.size();
2580  memcpy(int_features, &cn_features[0], *num_features * sizeof(cn_features[0]));
2581  // TODO(rays) Pass outline_counts back and simplify the calling code.
2582  if (feature_outline_index != nullptr) {
2583  int f = 0;
2584  for (int i = 0; i < outline_counts.size(); ++i) {
2585  while (f < outline_counts[i])
2586  feature_outline_index[f++] = i;
2587  }
2588  }
2589 }
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:442
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:129
bool empty() const
Definition: genericvector.h:89
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int size() const
Definition: genericvector.h:70
char * tesseract::TessBaseAPI::GetHOCRText ( ETEXT_DESC monitor,
int  page_number 
)

Make a HTML-formatted string with hOCR markup from the internal data structures. page_number is 0-based but will appear in the output as 1-based. monitor can be used to cancel the recognition and to receive progress callbacks. Returned string must be freed with the delete [] operator.

Make a HTML-formatted string with hOCR markup from the internal data structures. page_number is 0-based but will appear in the output as 1-based. Image name/input_file_ can be set by SetInputName before calling GetHOCRText. STL removed from original patch submission and refactored by rays. Returned string must be freed with the delete [] operator.

Definition at line 132 of file hocrrenderer.cpp.

132  {
133  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
134  return nullptr;
135 
136  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, gcnt = 1;
137  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
138  bool para_is_ltr = true; // Default direction is LTR
139  const char* paragraph_lang = nullptr;
140  bool font_info = false;
141  bool hocr_boxes = false;
142  GetBoolVariable("hocr_font_info", &font_info);
143  GetBoolVariable("hocr_char_boxes", &hocr_boxes);
144 
145  if (input_file_ == nullptr) SetInputName(nullptr);
146 
147 #ifdef _WIN32
148  // convert input name from ANSI encoding to utf-8
149  int str16_len =
150  MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
151  wchar_t* uni16_str = new WCHAR[str16_len];
152  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
153  uni16_str, str16_len);
154  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
155  0, nullptr, nullptr);
156  char* utf8_str = new char[utf8_len];
157  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
158  nullptr, nullptr);
159  *input_file_ = utf8_str;
160  delete[] uni16_str;
161  delete[] utf8_str;
162 #endif
163 
164  std::stringstream hocr_str;
165  // Use "C" locale (needed for double values x_size and x_descenders).
166  hocr_str.imbue(std::locale::classic());
167  // Use 8 digits for double values.
168  hocr_str.precision(8);
169  hocr_str << " <div class='ocr_page'";
170  hocr_str << " id='"
171  << "page_" << page_id << "'";
172  hocr_str << " title='image \"";
173  if (input_file_) {
174  hocr_str << HOcrEscape(input_file_->string()).c_str();
175  } else {
176  hocr_str << "unknown";
177  }
178  hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
179  << rect_width_ << " " << rect_height_ << "; ppageno " << page_number
180  << "'>\n";
181 
182  std::unique_ptr<ResultIterator> res_it(GetIterator());
183  while (!res_it->Empty(RIL_BLOCK)) {
184  if (res_it->Empty(RIL_WORD)) {
185  res_it->Next(RIL_WORD);
186  continue;
187  }
188 
189  // Open any new block/paragraph/textline.
190  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
191  para_is_ltr = true; // reset to default direction
192  hocr_str << " <div class='ocr_carea'"
193  << " id='"
194  << "block_" << page_id << "_" << bcnt << "'";
195  AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
196  }
197  if (res_it->IsAtBeginningOf(RIL_PARA)) {
198  hocr_str << "\n <p class='ocr_par'";
199  para_is_ltr = res_it->ParagraphIsLtr();
200  if (!para_is_ltr) {
201  hocr_str << " dir='rtl'";
202  }
203  hocr_str << " id='"
204  << "par_" << page_id << "_" << pcnt << "'";
205  paragraph_lang = res_it->WordRecognitionLanguage();
206  if (paragraph_lang) {
207  hocr_str << " lang='" << paragraph_lang << "'";
208  }
209  AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
210  }
211  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
212  hocr_str << "\n <span class='";
213  switch (res_it->BlockType()) {
214  case PT_HEADING_TEXT:
215  hocr_str << "ocr_header";
216  break;
217  case PT_PULLOUT_TEXT:
218  hocr_str << "ocr_textfloat";
219  break;
220  case PT_CAPTION_TEXT:
221  hocr_str << "ocr_caption";
222  break;
223  default:
224  hocr_str << "ocr_line";
225  }
226  hocr_str << "' id='"
227  << "line_" << page_id << "_" << lcnt << "'";
228  AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
229  }
230 
231  // Now, process the word...
232  std::vector<std::vector<std::pair<const char*, float>>>* choiceMap =
233  nullptr;
235 
236  choiceMap = res_it->GetBestLSTMSymbolChoices();
237  }
238  hocr_str << "\n <span class='ocrx_word'"
239  << " id='"
240  << "word_" << page_id << "_" << wcnt << "'";
241  int left, top, right, bottom;
242  bool bold, italic, underlined, monospace, serif, smallcaps;
243  int pointsize, font_id;
244  const char* font_name;
245  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
246  font_name =
247  res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
248  &serif, &smallcaps, &pointsize, &font_id);
249  hocr_str << " title='bbox " << left << " " << top << " " << right << " "
250  << bottom << "; x_wconf "
251  << static_cast<int>(res_it->Confidence(RIL_WORD));
252  if (font_info) {
253  if (font_name) {
254  hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
255  }
256  hocr_str << "; x_fsize " << pointsize;
257  }
258  hocr_str << "'";
259  const char* lang = res_it->WordRecognitionLanguage();
260  if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
261  hocr_str << " lang='" << lang << "'";
262  }
263  switch (res_it->WordDirection()) {
264  // Only emit direction if different from current paragraph direction
265  case DIR_LEFT_TO_RIGHT:
266  if (!para_is_ltr) hocr_str << " dir='ltr'";
267  break;
268  case DIR_RIGHT_TO_LEFT:
269  if (para_is_ltr) hocr_str << " dir='rtl'";
270  break;
271  case DIR_MIX:
272  case DIR_NEUTRAL:
273  default: // Do nothing.
274  break;
275  }
276  hocr_str << ">";
277  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
278  bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
279  bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
280  if (bold) hocr_str << "<strong>";
281  if (italic) hocr_str << "<em>";
282  do {
283  const std::unique_ptr<const char[]> grapheme(
284  res_it->GetUTF8Text(RIL_SYMBOL));
285  if (grapheme && grapheme[0] != 0) {
286  if (hocr_boxes) {
287  res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
288  hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
289  << left << " " << top << " " << right << " " << bottom
290  << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
291  }
292  hocr_str << HOcrEscape(grapheme.get()).c_str();
293  if (hocr_boxes) {
294  hocr_str << "</span>";
295  }
296  }
297  res_it->Next(RIL_SYMBOL);
298  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
299  if (italic) hocr_str << "</em>";
300  if (bold) hocr_str << "</strong>";
301  // If the lstm choice mode is required it is added here
302  if (tesseract_->lstm_choice_mode == 1 && choiceMap != nullptr) {
303  for (auto timestep : *choiceMap) {
304  hocr_str << "\n <span class='ocrx_cinfo'"
305  << " id='"
306  << "timestep_" << page_id << "_" << wcnt << "_" << tcnt << "'"
307  << ">";
308  for (std::pair<const char*, float> conf : timestep) {
309  hocr_str << "<span class='ocr_glyph'"
310  << " id='"
311  << "choice_" << page_id << "_" << wcnt << "_" << gcnt << "'"
312  << " title='x_confs " << int(conf.second * 100) << "'>"
313  << conf.first << "</span>";
314  gcnt++;
315  }
316  hocr_str << "</span>";
317  tcnt++;
318  }
319  } else if (tesseract_->lstm_choice_mode == 2 && choiceMap != nullptr) {
320  for (auto timestep : *choiceMap) {
321  if (timestep.size() > 0) {
322  hocr_str << "\n <span class='ocrx_cinfo'"
323  << " id='"
324  << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
325  << "'>";
326  for (auto & j : timestep) {
327  hocr_str << "<span class='ocr_glyph'"
328  << " id='"
329  << "choice_" << page_id << "_" << wcnt << "_" << gcnt
330  << "'"
331  << " title='x_confs " << int(j.second * 100)
332  << "'>" << j.first << "</span>";
333  gcnt++;
334  }
335  hocr_str << "</span>";
336  tcnt++;
337  }
338  }
339  }
340  hocr_str << "</span>";
341  tcnt = 1;
342  gcnt = 1;
343  wcnt++;
344  // Close any ending block/paragraph/textline.
345  if (last_word_in_line) {
346  hocr_str << "\n </span>";
347  lcnt++;
348  }
349  if (last_word_in_para) {
350  hocr_str << "\n </p>\n";
351  pcnt++;
352  para_is_ltr = true; // back to default direction
353  }
354  if (last_word_in_block) {
355  hocr_str << " </div>\n";
356  bcnt++;
357  }
358  }
359  hocr_str << " </div>\n";
360 
361  const std::string& text = hocr_str.str();
362  char* result = new char[text.length() + 1];
363  strcpy(result, text.c_str());
364  return result;
365 }
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2268
STRING * input_file_
Name used by training code.
Definition: baseapi.h:896
void SetInputName(const char *name)
Definition: baseapi.cpp:265
ResultIterator * GetIterator()
Definition: baseapi.cpp:1282
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:300
const char * string() const
Definition: strngs.cpp:194
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:824
const char * c_str() const
Definition: strngs.cpp:205
char * tesseract::TessBaseAPI::GetHOCRText ( int  page_number)

Make a HTML-formatted string with hOCR markup from the internal data structures. page_number is 0-based but will appear in the output as 1-based. Returned string must be freed with the delete [] operator.

Make a HTML-formatted string with hOCR markup from the internal data structures. page_number is 0-based but will appear in the output as 1-based. Image name/input_file_ can be set by SetInputName before calling GetHOCRText. STL removed from original patch submission and refactored by rays. Returned string must be freed with the delete [] operator.

Definition at line 119 of file hocrrenderer.cpp.

119  {
120  return GetHOCRText(nullptr, page_number);
121 }
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
ResultIterator * tesseract::TessBaseAPI::GetIterator ( )

Get a reading-order iterator to the results of LayoutAnalysis and/or Recognize. The returned iterator must be deleted after use. WARNING! This class points to data held within the TessBaseAPI class, and therefore can only be used while the TessBaseAPI class still exists and has not been subjected to a call of Init, SetImage, Recognize, Clear, End, DetectOS, or anything else that changes the internal PAGE_RES.

Definition at line 1282 of file baseapi.cpp.

1282  {
1283  if (tesseract_ == nullptr || page_res_ == nullptr)
1284  return nullptr;
1285  return ResultIterator::StartOfParagraph(LTRResultIterator(
1289 }
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
int GetScaledYResolution() const
Definition: thresholder.h:92
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
char * tesseract::TessBaseAPI::GetLSTMBoxText ( int  page_number = 0)

Make a box file for LSTM training from the internal data structures. Constructs coordinates in the original image - not just the rectangle. page_number is a 0-based page index that will appear in the box file. Returned string must be freed with the delete [] operator.

Definition at line 38 of file lstmboxrenderer.cpp.

38  {
39  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
40  return nullptr;
41 
42  STRING lstm_box_str("");
43  bool first_word = true;
44  int left = 0, top = 0, right = 0, bottom = 0;
45 
46  LTRResultIterator* res_it = GetLTRIterator();
47  while (!res_it->Empty(RIL_BLOCK)) {
48  if (res_it->Empty(RIL_SYMBOL)) {
49  res_it->Next(RIL_SYMBOL);
50  continue;
51  }
52  if (!first_word) {
53  if (!(res_it->IsAtBeginningOf(RIL_TEXTLINE))) {
54  if (res_it->IsAtBeginningOf(RIL_WORD)) {
55  lstm_box_str.add_str_int(" ", left);
56  AddBoxToLSTM(right, bottom, top, image_height_, page_number,
57  &lstm_box_str);
58  lstm_box_str += "\n"; // end of row for word
59  } // word
60  } else {
61  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
62  lstm_box_str.add_str_int("\t ", left);
63  AddBoxToLSTM(right, bottom, top, image_height_, page_number,
64  &lstm_box_str);
65  lstm_box_str += "\n"; // end of row for line
66  } // line
67  }
68  } // not first word
69  first_word = false;
70  // Use bounding box for whole line for everything
71  res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
72  do {
73  lstm_box_str +=
74  std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
75  res_it->Next(RIL_SYMBOL);
76  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
77  lstm_box_str.add_str_int(" ", left);
78  AddBoxToLSTM(right, bottom, top, image_height_, page_number, &lstm_box_str);
79  lstm_box_str += "\n"; // end of row for symbol
80  }
81  if (!first_word) { // if first_word is true => empty page
82  lstm_box_str.add_str_int("\t ", left);
83  AddBoxToLSTM(right, bottom, top, image_height_, page_number, &lstm_box_str);
84  lstm_box_str += "\n"; // end of PAGE
85  }
86  char* ret = new char[lstm_box_str.length() + 1];
87  strcpy(ret, lstm_box_str.string());
88  delete res_it;
89  return ret;
90 }
Definition: strngs.h:45
TESS_LOCAL LTRResultIterator * GetLTRIterator()
Definition: baseapi.cpp:1265
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:824
MutableIterator * tesseract::TessBaseAPI::GetMutableIterator ( )

Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. The returned iterator must be deleted after use. WARNING! This class points to data held within the TessBaseAPI class, and therefore can only be used while the TessBaseAPI class still exists and has not been subjected to a call of Init, SetImage, Recognize, Clear, End, DetectOS, or anything else that changes the internal PAGE_RES.

Definition at line 1299 of file baseapi.cpp.

1299  {
1300  if (tesseract_ == nullptr || page_res_ == nullptr)
1301  return nullptr;
1302  return new MutableIterator(page_res_, tesseract_,
1306 }
int GetScaledYResolution() const
Definition: thresholder.h:92
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
char * tesseract::TessBaseAPI::GetOsdText ( int  page_number)

The recognized text is returned as a char* which is coded as UTF8 and must be freed with the delete [] operator. page_number is a 0-based page index that will appear in the osd file.

Definition at line 1675 of file baseapi.cpp.

1675  {
1676  int orient_deg;
1677  float orient_conf;
1678  const char* script_name;
1679  float script_conf;
1680 
1681  if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name,
1682  &script_conf))
1683  return nullptr;
1684 
1685  // clockwise rotation needed to make the page upright
1686  int rotate = OrientationIdToValue(orient_deg / 90);
1687 
1688  std::stringstream stream;
1689  // Use "C" locale (needed for float values orient_conf and script_conf).
1690  stream.imbue(std::locale::classic());
1691  // Use fixed notation with 2 digits after the decimal point for float values.
1692  stream.precision(2);
1693  stream
1694  << std::fixed
1695  << "Page number: " << page_number << "\n"
1696  << "Orientation in degrees: " << orient_deg << "\n"
1697  << "Rotate: " << rotate << "\n"
1698  << "Orientation confidence: " << orient_conf << "\n"
1699  << "Script: " << script_name << "\n"
1700  << "Script confidence: " << script_conf << "\n";
1701  const std::string& text = stream.str();
1702  char* result = new char[text.length() + 1];
1703  strcpy(result, text.c_str());
1704  return result;
1705 }
int OrientationIdToValue(const int &id)
Definition: osdetect.cpp:566
bool DetectOrientationScript(int *orient_deg, float *orient_conf, const char **script_name, float *script_conf)
Definition: baseapi.cpp:1644
Boxa * tesseract::TessBaseAPI::GetRegions ( Pixa **  pixa)

Get the result of page layout analysis as a leptonica-style Boxa, Pixa pair, in reading order. Can be called before or after Recognize.

Definition at line 643 of file baseapi.cpp.

643  {
644  return GetComponentImages(RIL_BLOCK, false, pixa, nullptr);
645 }
Boxa * GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:700
Boxa * tesseract::TessBaseAPI::GetStrips ( Pixa **  pixa,
int **  blockids 
)

Get textlines and strips of image regions as a leptonica-style Boxa, Pixa pair, in reading order. Enables downstream handling of non-rectangular regions. Can be called before or after Recognize. If blockids is not nullptr, the block-id of each line is also returned as an array of one element per line. delete [] after use.

Definition at line 669 of file baseapi.cpp.

669  {
670  return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
671 }
Boxa * GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:700
bool tesseract::TessBaseAPI::GetTextDirection ( int *  out_offset,
float *  out_slope 
)

Definition at line 1887 of file baseapi.cpp.

1887  {
1888  PageIterator* it = AnalyseLayout();
1889  if (it == nullptr) {
1890  return false;
1891  }
1892  int x1, x2, y1, y2;
1893  it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
1894  // Calculate offset and slope (NOTE: Kind of ugly)
1895  if (x2 <= x1) x2 = x1 + 1;
1896  // Convert the point pair to slope/offset of the baseline (in image coords.)
1897  *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
1898  *out_offset = static_cast<int>(y1 - *out_slope * x1);
1899  // Get the y-coord of the baseline at the left and right edges of the
1900  // textline's bounding box.
1901  int left, top, right, bottom;
1902  if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
1903  delete it;
1904  return false;
1905  }
1906  int left_y = IntCastRounded(*out_slope * left + *out_offset);
1907  int right_y = IntCastRounded(*out_slope * right + *out_offset);
1908  // Shift the baseline down so it passes through the nearest bottom-corner
1909  // of the textline's bounding box. This is the difference between the y
1910  // at the lowest (max) edge of the box and the actual box bottom.
1911  *out_offset += bottom - std::max(left_y, right_y);
1912  // Switch back to bottom-up tesseract coordinates. Requires negation of
1913  // the slope and height - offset for the offset.
1914  *out_slope = -*out_slope;
1915  *out_offset = rect_height_ - *out_offset;
1916  delete it;
1917 
1918  return true;
1919 }
int IntCastRounded(double x)
Definition: helpers.h:175
PageIterator * AnalyseLayout()
Definition: baseapi.cpp:804
Boxa * tesseract::TessBaseAPI::GetTextlines ( bool  raw_image,
int  raw_padding,
Pixa **  pixa,
int **  blockids,
int **  paraids 
)

Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order. Can be called before or after Recognize. If raw_image is true, then extract from the original image instead of the thresholded image and pad by raw_padding pixels. If blockids is not nullptr, the block-id of each line is also returned as an array of one element per line. delete [] after use. If paraids is not nullptr, the paragraph-id of each line within its block is also returned as an array of one element per line. delete [] after use.

Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order. Can be called before or after Recognize. If blockids is not nullptr, the block-id of each line is also returned as an array of one element per line. delete [] after use. If paraids is not nullptr, the paragraph-id of each line within its block is also returned as an array of one element per line. delete [] after use.

Definition at line 655 of file baseapi.cpp.

656  {
657  return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding,
658  pixa, blockids, paraids);
659 }
Boxa * GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:700
Boxa* tesseract::TessBaseAPI::GetTextlines ( Pixa **  pixa,
int **  blockids 
)
inline

Definition at line 401 of file baseapi.h.

401  {
402  return GetTextlines(false, 0, pixa, blockids, nullptr);
403  }
Boxa * GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:655
Pix * tesseract::TessBaseAPI::GetThresholdedImage ( )

Get a copy of the internal thresholded image from Tesseract. Caller takes ownership of the Pix and must pixDestroy it. May be called any time after SetImage, or after TesseractRect.

ONLY available after SetImage if you have Leptonica installed. Get a copy of the internal thresholded image from Tesseract.

Definition at line 629 of file baseapi.cpp.

629  {
630  if (tesseract_ == nullptr || thresholder_ == nullptr) return nullptr;
631  if (tesseract_->pix_binary() == nullptr &&
633  return nullptr;
634  }
635  return pixClone(tesseract_->pix_binary());
636 }
virtual TESS_LOCAL bool Threshold(Pix **pix)
Definition: baseapi.cpp:1972
Pix * pix_binary() const
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int tesseract::TessBaseAPI::GetThresholdedImageScaleFactor ( ) const

Returns the scale factor of the thresholded image that would be returned by GetThresholdedImage() and the various GetX() methods that call GetComponentImages(). Returns 0 if no thresholder has been set.

Definition at line 782 of file baseapi.cpp.

782  {
783  if (thresholder_ == nullptr) {
784  return 0;
785  }
786  return thresholder_->GetScaleFactor();
787 }
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
char * tesseract::TessBaseAPI::GetTSVText ( int  page_number)

Make a TSV-formatted string from the internal data structures. page_number is 0-based but will appear in the output as 1-based. Returned string must be freed with the delete [] operator.

Definition at line 1341 of file baseapi.cpp.

1341  {
1342  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
1343  return nullptr;
1344 
1345  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1346  int page_id = page_number + 1; // we use 1-based page numbers.
1347 
1348  STRING tsv_str("");
1349 
1350  int page_num = page_id;
1351  int block_num = 0;
1352  int par_num = 0;
1353  int line_num = 0;
1354  int word_num = 0;
1355 
1356  tsv_str.add_str_int("1\t", page_num); // level 1 - page
1357  tsv_str.add_str_int("\t", block_num);
1358  tsv_str.add_str_int("\t", par_num);
1359  tsv_str.add_str_int("\t", line_num);
1360  tsv_str.add_str_int("\t", word_num);
1361  tsv_str.add_str_int("\t", rect_left_);
1362  tsv_str.add_str_int("\t", rect_top_);
1363  tsv_str.add_str_int("\t", rect_width_);
1364  tsv_str.add_str_int("\t", rect_height_);
1365  tsv_str += "\t-1\t\n";
1366 
1367  ResultIterator* res_it = GetIterator();
1368  while (!res_it->Empty(RIL_BLOCK)) {
1369  if (res_it->Empty(RIL_WORD)) {
1370  res_it->Next(RIL_WORD);
1371  continue;
1372  }
1373 
1374  // Add rows for any new block/paragraph/textline.
1375  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1376  block_num++;
1377  par_num = 0;
1378  line_num = 0;
1379  word_num = 0;
1380  tsv_str.add_str_int("2\t", page_num); // level 2 - block
1381  tsv_str.add_str_int("\t", block_num);
1382  tsv_str.add_str_int("\t", par_num);
1383  tsv_str.add_str_int("\t", line_num);
1384  tsv_str.add_str_int("\t", word_num);
1385  AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
1386  tsv_str += "\t-1\t\n"; // end of row for block
1387  }
1388  if (res_it->IsAtBeginningOf(RIL_PARA)) {
1389  par_num++;
1390  line_num = 0;
1391  word_num = 0;
1392  tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph
1393  tsv_str.add_str_int("\t", block_num);
1394  tsv_str.add_str_int("\t", par_num);
1395  tsv_str.add_str_int("\t", line_num);
1396  tsv_str.add_str_int("\t", word_num);
1397  AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
1398  tsv_str += "\t-1\t\n"; // end of row for para
1399  }
1400  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1401  line_num++;
1402  word_num = 0;
1403  tsv_str.add_str_int("4\t", page_num); // level 4 - line
1404  tsv_str.add_str_int("\t", block_num);
1405  tsv_str.add_str_int("\t", par_num);
1406  tsv_str.add_str_int("\t", line_num);
1407  tsv_str.add_str_int("\t", word_num);
1408  AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
1409  tsv_str += "\t-1\t\n"; // end of row for line
1410  }
1411 
1412  // Now, process the word...
1413  int left, top, right, bottom;
1414  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1415  word_num++;
1416  tsv_str.add_str_int("5\t", page_num); // level 5 - word
1417  tsv_str.add_str_int("\t", block_num);
1418  tsv_str.add_str_int("\t", par_num);
1419  tsv_str.add_str_int("\t", line_num);
1420  tsv_str.add_str_int("\t", word_num);
1421  tsv_str.add_str_int("\t", left);
1422  tsv_str.add_str_int("\t", top);
1423  tsv_str.add_str_int("\t", right - left);
1424  tsv_str.add_str_int("\t", bottom - top);
1425  tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
1426  tsv_str += "\t";
1427 
1428  // Increment counts if at end of block/paragraph/textline.
1429  if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++;
1430  if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++;
1431  if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;
1432 
1433  do {
1434  tsv_str +=
1435  std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1436  res_it->Next(RIL_SYMBOL);
1437  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1438  tsv_str += "\n"; // end of row
1439  wcnt++;
1440  }
1441 
1442  char* ret = new char[tsv_str.length() + 1];
1443  strcpy(ret, tsv_str.string());
1444  delete res_it;
1445  return ret;
1446 }
Definition: strngs.h:45
ResultIterator * GetIterator()
Definition: baseapi.cpp:1282
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:824
const char * tesseract::TessBaseAPI::GetUnichar ( int  unichar_id)

This method returns the string form of the specified unichar.

Definition at line 2252 of file baseapi.cpp.

2252  {
2253  return tesseract_->unicharset.id_to_unichar(unichar_id);
2254 }
UNICHARSET unicharset
Definition: ccutil.h:71
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
char * tesseract::TessBaseAPI::GetUNLVText ( )

The recognized text is returned as a char* which is coded as UNLV format Latin-1 with specific reject and suspect codes. Returned string must be freed with the delete [] operator.

Definition at line 1532 of file baseapi.cpp.

1532  {
1533  if (tesseract_ == nullptr ||
1534  (!recognition_done_ && Recognize(nullptr) < 0))
1535  return nullptr;
1536  bool tilde_crunch_written = false;
1537  bool last_char_was_newline = true;
1538  bool last_char_was_tilde = false;
1539 
1540  int total_length = TextLength(nullptr);
1541  PAGE_RES_IT page_res_it(page_res_);
1542  char* result = new char[total_length];
1543  char* ptr = result;
1544  for (page_res_it.restart_page(); page_res_it.word () != nullptr;
1545  page_res_it.forward()) {
1546  WERD_RES *word = page_res_it.word();
1547  // Process the current word.
1548  if (word->unlv_crunch_mode != CR_NONE) {
1549  if (word->unlv_crunch_mode != CR_DELETE &&
1550  (!tilde_crunch_written ||
1551  (word->unlv_crunch_mode == CR_KEEP_SPACE &&
1552  word->word->space() > 0 &&
1553  !word->word->flag(W_FUZZY_NON) &&
1554  !word->word->flag(W_FUZZY_SP)))) {
1555  if (!word->word->flag(W_BOL) &&
1556  word->word->space() > 0 &&
1557  !word->word->flag(W_FUZZY_NON) &&
1558  !word->word->flag(W_FUZZY_SP)) {
1559  /* Write a space to separate from preceding good text */
1560  *ptr++ = ' ';
1561  last_char_was_tilde = false;
1562  }
1563  if (!last_char_was_tilde) {
1564  // Write a reject char.
1565  last_char_was_tilde = true;
1566  *ptr++ = kUNLVReject;
1567  tilde_crunch_written = true;
1568  last_char_was_newline = false;
1569  }
1570  }
1571  } else {
1572  // NORMAL PROCESSING of non tilde crunched words.
1573  tilde_crunch_written = false;
1575  const char* wordstr = word->best_choice->unichar_string().string();
1576  const STRING& lengths = word->best_choice->unichar_lengths();
1577  int length = lengths.length();
1578  int i = 0;
1579  int offset = 0;
1580 
1581  if (last_char_was_tilde &&
1582  word->word->space() == 0 && wordstr[offset] == ' ') {
1583  // Prevent adjacent tilde across words - we know that adjacent tildes
1584  // within words have been removed.
1585  // Skip the first character.
1586  offset = lengths[i++];
1587  }
1588  if (i < length && wordstr[offset] != 0) {
1589  if (!last_char_was_newline)
1590  *ptr++ = ' ';
1591  else
1592  last_char_was_newline = false;
1593  for (; i < length; offset += lengths[i++]) {
1594  if (wordstr[offset] == ' ' ||
1595  wordstr[offset] == kTesseractReject) {
1596  *ptr++ = kUNLVReject;
1597  last_char_was_tilde = true;
1598  } else {
1599  if (word->reject_map[i].rejected())
1600  *ptr++ = kUNLVSuspect;
1601  UNICHAR ch(wordstr + offset, lengths[i]);
1602  int uni_ch = ch.first_uni();
1603  for (int j = 0; kUniChs[j] != 0; ++j) {
1604  if (kUniChs[j] == uni_ch) {
1605  uni_ch = kLatinChs[j];
1606  break;
1607  }
1608  }
1609  if (uni_ch <= 0xff) {
1610  *ptr++ = static_cast<char>(uni_ch);
1611  last_char_was_tilde = false;
1612  } else {
1613  *ptr++ = kUNLVReject;
1614  last_char_was_tilde = true;
1615  }
1616  }
1617  }
1618  }
1619  }
1620  if (word->word->flag(W_EOL) && !last_char_was_newline) {
1621  /* Add a new line output */
1622  *ptr++ = '\n';
1623  tilde_crunch_written = false;
1624  last_char_was_newline = true;
1625  last_char_was_tilde = false;
1626  }
1627  }
1628  *ptr++ = '\n';
1629  *ptr = '\0';
1630  return result;
1631 }
fuzzy nonspace
Definition: werd.h:40
Definition: strngs.h:45
TESS_LOCAL int TextLength(int *blob_count)
Definition: baseapi.cpp:2127
const STRING & unichar_string() const
Definition: ratngs.h:541
start of line
Definition: werd.h:32
int32_t length() const
Definition: strngs.cpp:189
fuzzy space
Definition: werd.h:39
end of line
Definition: werd.h:33
REJMAP reject_map
Definition: pageres.h:286
const char kTesseractReject
Definition: baseapi.cpp:100
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:309
const char * string() const
Definition: strngs.cpp:194
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:901
uint8_t space()
Definition: werd.h:99
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:273
WERD_CHOICE * best_choice
Definition: pageres.h:234
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
const int kLatinChs[]
Definition: baseapi.cpp:1523
const char kUNLVSuspect
Definition: baseapi.cpp:104
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
const STRING & unichar_lengths() const
Definition: ratngs.h:548
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:824
const char kUNLVReject
Definition: baseapi.cpp:102
WERD * word
Definition: pageres.h:188
const int kUniChs[]
Definition: baseapi.cpp:1519
char * tesseract::TessBaseAPI::GetUTF8Text ( )

The recognized text is returned as a char* which is coded as UTF8 and must be freed with the delete [] operator.

Make a text string from the internal data structures.

Definition at line 1309 of file baseapi.cpp.

1309  {
1310  if (tesseract_ == nullptr ||
1311  (!recognition_done_ && Recognize(nullptr) < 0))
1312  return nullptr;
1313  STRING text("");
1314  ResultIterator *it = GetIterator();
1315  do {
1316  if (it->Empty(RIL_PARA)) continue;
1317  const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
1318  text += para_text.get();
1319  } while (it->Next(RIL_PARA));
1320  char* result = new char[text.length() + 1];
1321  strncpy(result, text.string(), text.length() + 1);
1322  delete it;
1323  return result;
1324 }
Definition: strngs.h:45
ResultIterator * GetIterator()
Definition: baseapi.cpp:1282
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:901
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:824
Boxa * tesseract::TessBaseAPI::GetWords ( Pixa **  pixa)

Get the words as a leptonica-style Boxa, Pixa pair, in reading order. Can be called before or after Recognize.

Definition at line 678 of file baseapi.cpp.

678  {
679  return GetComponentImages(RIL_WORD, true, pixa, nullptr);
680 }
Boxa * GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:700
char * tesseract::TessBaseAPI::GetWordStrBoxText ( int  page_number = 0)

The recognized text is returned as a char* which is coded in the same format as a WordStr box file used in training. page_number is a 0-based page index that will appear in the box file. Returned string must be freed with the delete [] operator.

Create a UTF8 box file with WordStr strings from the internal data structures. page_number is a 0-based page index that will appear in the box file. Returned string must be freed with the delete [] operator.

Definition at line 31 of file wordstrboxrenderer.cpp.

31  {
32  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
33  return nullptr;
34 
35  STRING wordstr_box_str("");
36  int left = 0, top = 0, right = 0, bottom = 0;
37 
38  bool first_line = true;
39 
40  LTRResultIterator* res_it = GetLTRIterator();
41  while (!res_it->Empty(RIL_BLOCK)) {
42  if (res_it->Empty(RIL_WORD)) {
43  res_it->Next(RIL_WORD);
44  continue;
45  }
46 
47  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
48  if (!first_line) {
49  wordstr_box_str.add_str_int("\n\t ", right + 1);
50  wordstr_box_str.add_str_int(" ", image_height_ - bottom);
51  wordstr_box_str.add_str_int(" ", right + 5);
52  wordstr_box_str.add_str_int(" ", image_height_ - top);
53  wordstr_box_str.add_str_int(" ", page_number); // row for tab for EOL
54  wordstr_box_str += "\n";
55  } else {
56  first_line = false;
57  }
58  // Use bounding box for whole line for WordStr
59  res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
60  wordstr_box_str.add_str_int("WordStr ", left);
61  wordstr_box_str.add_str_int(" ", image_height_ - bottom);
62  wordstr_box_str.add_str_int(" ", right);
63  wordstr_box_str.add_str_int(" ", image_height_ - top);
64  wordstr_box_str.add_str_int(" ", page_number); // word
65  wordstr_box_str += " #";
66  }
67  do {
68  wordstr_box_str +=
69  std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
70  wordstr_box_str += " ";
71  res_it->Next(RIL_WORD);
72  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
73  }
74 
75  if (left != 0 && top != 0 && right != 0 && bottom != 0) {
76  wordstr_box_str.add_str_int("\n\t ", right + 1);
77  wordstr_box_str.add_str_int(" ", image_height_ - bottom);
78  wordstr_box_str.add_str_int(" ", right + 5);
79  wordstr_box_str.add_str_int(" ", image_height_ - top);
80  wordstr_box_str.add_str_int(" ", page_number); // row for tab for EOL
81  wordstr_box_str += "\n";
82  }
83  char* ret = new char[wordstr_box_str.length() + 1];
84  strcpy(ret, wordstr_box_str.string());
85  delete res_it;
86  return ret;
87 }
Definition: strngs.h:45
TESS_LOCAL LTRResultIterator * GetLTRIterator()
Definition: baseapi.cpp:1265
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:824
void tesseract::TessBaseAPI::InitTruthCallback ( TruthCallback cb)
inline

Definition at line 805 of file baseapi.h.

805 { truth_cb_ = cb; }
TruthCallback * truth_cb_
Definition: baseapi.h:902
bool tesseract::TessBaseAPI::IsValidCharacter ( const char *  utf8_character)

Definition at line 1880 of file baseapi.cpp.

1880  {
1881  return tesseract_->unicharset.contains_unichar(utf8_character);
1882 }
UNICHARSET unicharset
Definition: ccutil.h:71
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int tesseract::TessBaseAPI::IsValidWord ( const char *  word)

Check whether a word is valid according to Tesseract's language model

Returns
0 if the word is invalid, non-zero if valid.
Warning
temporary! This function will be removed from here and placed in a separate API at some future time.

Check whether a word is valid according to Tesseract's language model. Returns 0 if the word is invalid, non-zero if valid.

Definition at line 1876 of file baseapi.cpp.

1876  {
1877  return tesseract_->getDict().valid_word(word);
1878 }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:787
Dict & getDict() override
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
TBLOB * tesseract::TessBaseAPI::MakeTBLOB ( Pix *  pix)
static

Returns a TBLOB corresponding to the entire input image.

Creates a TBLOB* from the whole pix.

Definition at line 2326 of file baseapi.cpp.

2326  {
2327  int width = pixGetWidth(pix);
2328  int height = pixGetHeight(pix);
2329  BLOCK block("a character", true, 0, 0, 0, 0, width, height);
2330 
2331  // Create C_BLOBs from the page
2332  extract_edges(pix, &block);
2333 
2334  // Merge all C_BLOBs
2335  C_BLOB_LIST *list = block.blob_list();
2336  C_BLOB_IT c_blob_it(list);
2337  if (c_blob_it.empty())
2338  return nullptr;
2339  // Move all the outlines to the first blob.
2340  C_OUTLINE_IT ol_it(c_blob_it.data()->out_list());
2341  for (c_blob_it.forward();
2342  !c_blob_it.at_first();
2343  c_blob_it.forward()) {
2344  C_BLOB *c_blob = c_blob_it.data();
2345  ol_it.add_list_after(c_blob->out_list());
2346  }
2347  // Convert the first blob to the output TBLOB.
2348  return TBLOB::PolygonalCopy(false, c_blob_it.data());
2349 }
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:330
static TBLOB * PolygonalCopy(bool allow_detailed_fx, C_BLOB *src)
Definition: blobs.cpp:331
Definition: ocrblock.h:29
ROW * tesseract::TessBaseAPI::MakeTessOCRRow ( float  baseline,
float  xheight,
float  descender,
float  ascender 
)
static

Returns a ROW object created from the input row specification.

Definition at line 2309 of file baseapi.cpp.

2312  {
2313  int32_t xstarts[] = {-32000};
2314  double quad_coeffs[] = {0, 0, baseline};
2315  return new ROW(1,
2316  xstarts,
2317  quad_coeffs,
2318  xheight,
2319  ascender - (baseline + xheight),
2320  descender - baseline,
2321  0,
2322  0);
2323 }
Definition: ocrrow.h:36
int tesseract::TessBaseAPI::MeanTextConf ( )

Returns the (average) confidence value between 0 and 100.

Returns the average word confidence for Tesseract page result.

Definition at line 1710 of file baseapi.cpp.

1710  {
1711  int* conf = AllWordConfidences();
1712  if (!conf) return 0;
1713  int sum = 0;
1714  int *pt = conf;
1715  while (*pt >= 0) sum += *pt++;
1716  if (pt != conf) sum /= pt - conf;
1717  delete [] conf;
1718  return sum;
1719 }
void tesseract::TessBaseAPI::NormalizeTBLOB ( TBLOB tblob,
ROW row,
bool  numeric_mode 
)
static

This method baseline normalizes a TBLOB in-place. The input row is used for normalization. The denorm is an optional parameter in which the normalization-antidote is returned.

Definition at line 2356 of file baseapi.cpp.

2356  {
2357  TBOX box = tblob->bounding_box();
2358  float x_center = (box.left() + box.right()) / 2.0f;
2359  float baseline = row->base_line(x_center);
2360  float scale = kBlnXHeight / row->x_height();
2361  tblob->Normalize(nullptr, nullptr, nullptr, x_center, baseline, scale, scale,
2362  0.0f, static_cast<float>(kBlnBaselineOffset), false, nullptr);
2363 }
const int kBlnXHeight
Definition: normalis.h:24
float x_height() const
Definition: ocrrow.h:64
Definition: rect.h:34
TBOX bounding_box() const
Definition: blobs.cpp:472
const int kBlnBaselineOffset
Definition: normalis.h:25
int16_t right() const
Definition: rect.h:79
float base_line(float xpos) const
Definition: ocrrow.h:59
int16_t left() const
Definition: rect.h:72
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift, bool inverse, Pix *pix)
Definition: blobs.cpp:401
int tesseract::TessBaseAPI::NumDawgs ( ) const

Return the number of dawgs loaded into tesseract_ object.

Definition at line 2263 of file baseapi.cpp.

2263  {
2264  return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs();
2265 }
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:421
Dict & getDict() override
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
OcrEngineMode tesseract::TessBaseAPI::oem ( ) const
inline

Definition at line 803 of file baseapi.h.

803 { return last_oem_requested_; }
OcrEngineMode last_oem_requested_
Last ocr language mode requested.
Definition: baseapi.h:900
bool tesseract::TessBaseAPI::ProcessPage ( Pix *  pix,
int  page_index,
const char *  filename,
const char *  retry_config,
int  timeout_millisec,
TessResultRenderer renderer 
)

Turn a single image into symbolic text.

The pix is the image processed. filename and page_index are metadata used by side-effect processes, such as reading a box file or formatting as hOCR.

See ProcessPages for descriptions of other parameters.

Definition at line 1198 of file baseapi.cpp.

1200  {
1201  SetInputName(filename);
1202  SetImage(pix);
1203  bool failed = false;
1204 
1206  // Disabled character recognition
1207  PageIterator* it = AnalyseLayout();
1208 
1209  if (it == nullptr) {
1210  failed = true;
1211  } else {
1212  delete it;
1213  }
1215  failed = FindLines() != 0;
1216  } else if (timeout_millisec > 0) {
1217  // Running with a timeout.
1218  ETEXT_DESC monitor;
1219  monitor.cancel = nullptr;
1220  monitor.cancel_this = nullptr;
1221  monitor.set_deadline_msecs(timeout_millisec);
1222 
1223  // Now run the main recognition.
1224  failed = Recognize(&monitor) < 0;
1225  } else {
1226  // Normal layout and character recognition with no timeout.
1227  failed = Recognize(nullptr) < 0;
1228  }
1229 
1231 #ifndef ANDROID_BUILD
1232  Pix* page_pix = GetThresholdedImage();
1233  pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
1234 #endif // ANDROID_BUILD
1235  }
1236 
1237  if (failed && retry_config != nullptr && retry_config[0] != '\0') {
1238  // Save current config variables before switching modes.
1239  FILE* fp = fopen(kOldVarsFile, "wb");
1240  if (fp == nullptr) {
1241  tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile);
1242  } else {
1243  PrintVariables(fp);
1244  fclose(fp);
1245  }
1246  // Switch to alternate mode for retry.
1247  ReadConfigFile(retry_config);
1248  SetImage(pix);
1249  Recognize(nullptr);
1250  // Restore saved config variables.
1251  ReadConfigFile(kOldVarsFile);
1252  }
1253 
1254  if (renderer && !failed) {
1255  failed = !renderer->AddImage(this);
1256  }
1257 
1258  return !failed;
1259 }
void SetInputName(const char *name)
Definition: baseapi.cpp:265
Orientation and script detection only.
Definition: publictypes.h:164
Pix * GetThresholdedImage()
Definition: baseapi.cpp:629
void ReadConfigFile(const char *filename)
Definition: baseapi.cpp:495
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: baseapi.cpp:574
Automatic page segmentation, but no OSD, or OCR.
Definition: publictypes.h:167
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:116
void PrintVariables(FILE *fp) const
Definition: baseapi.cpp:328
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void set_deadline_msecs(int32_t deadline_msecs)
Definition: ocrclass.h:129
TESS_LOCAL int FindLines()
Definition: baseapi.cpp:2026
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:824
PageIterator * AnalyseLayout()
Definition: baseapi.cpp:804
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:112
bool tesseract::TessBaseAPI::ProcessPages ( const char *  filename,
const char *  retry_config,
int  timeout_millisec,
TessResultRenderer renderer 
)

Turns images into symbolic text.

filename can point to a single image, a multi-page TIFF, or a plain text list of image filenames.

retry_config is useful for debugging. If not nullptr, you can fall back to an alternate configuration if a page fails for some reason.

timeout_millisec terminates processing if any single page takes too long. Set to 0 for unlimited time.

renderer is responsible for creating the output. For example, use the TessTextRenderer if you want plaintext output, or the TessPDFRenderer to produce searchable PDF.

If tessedit_page_number is non-negative, will only process that single page. Works for multi-page tiff file, or filelist.

Returns true if successful, false on error.

Definition at line 1068 of file baseapi.cpp.

1070  {
1071  bool result =
1072  ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
1073  #ifndef DISABLED_LEGACY_ENGINE
1074  if (result) {
1077  tprintf("Write of TR file failed: %s\n", output_file_->string());
1078  return false;
1079  }
1080  }
1081  #endif // ndef DISABLED_LEGACY_ENGINE
1082  return result;
1083 }
bool WriteTRFile(const STRING &filename)
Definition: blobclass.cpp:101
bool ProcessPagesInternal(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1096
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
STRING * output_file_
Name used by debug code.
Definition: baseapi.h:897
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
bool tesseract::TessBaseAPI::ProcessPagesInternal ( const char *  filename,
const char *  retry_config,
int  timeout_millisec,
TessResultRenderer renderer 
)

Definition at line 1096 of file baseapi.cpp.

1099  {
1100  bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
1101  if (stdInput) {
1102 #ifdef WIN32
1103  if (_setmode(_fileno(stdin), _O_BINARY) == -1)
1104  tprintf("ERROR: cin to binary: %s", strerror(errno));
1105 #endif // WIN32
1106  }
1107 
1108  if (stream_filelist) {
1109  return ProcessPagesFileList(stdin, nullptr, retry_config,
1110  timeout_millisec, renderer,
1112  }
1113 
1114  // At this point we are officially in autodection territory.
1115  // That means any data in stdin must be buffered, to make it
1116  // seekable.
1117  std::string buf;
1118  const l_uint8 *data = nullptr;
1119  if (stdInput) {
1120  buf.assign((std::istreambuf_iterator<char>(std::cin)),
1121  (std::istreambuf_iterator<char>()));
1122  data = reinterpret_cast<const l_uint8 *>(buf.data());
1123  } else {
1124  // Check whether the input file can be read.
1125  if (FILE* file = fopen(filename, "rb")) {
1126  fclose(file);
1127  } else {
1128  fprintf(stderr, "Error, cannot read input file %s: %s\n",
1129  filename, strerror(errno));
1130  return false;
1131  }
1132  }
1133 
1134  // Here is our autodetection
1135  int format;
1136  int r = (stdInput) ?
1137  findFileFormatBuffer(data, &format) :
1138  findFileFormat(filename, &format);
1139 
1140  // Maybe we have a filelist
1141  if (r != 0 || format == IFF_UNKNOWN) {
1142  STRING s;
1143  if (stdInput) {
1144  s = buf.c_str();
1145  } else {
1146  std::ifstream t(filename);
1147  std::string u((std::istreambuf_iterator<char>(t)),
1148  std::istreambuf_iterator<char>());
1149  s = u.c_str();
1150  }
1151  return ProcessPagesFileList(nullptr, &s, retry_config,
1152  timeout_millisec, renderer,
1154  }
1155 
1156  // Maybe we have a TIFF which is potentially multipage
1157  bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS ||
1158  format == IFF_TIFF_RLE || format == IFF_TIFF_G3 ||
1159  format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
1160 #if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76
1161  format == IFF_TIFF_JPEG ||
1162 #endif
1163  format == IFF_TIFF_ZIP);
1164 
1165  // Fail early if we can, before producing any output
1166  Pix *pix = nullptr;
1167  if (!tiff) {
1168  pix = (stdInput) ? pixReadMem(data, buf.size()) : pixRead(filename);
1169  if (pix == nullptr) {
1170  return false;
1171  }
1172  }
1173 
1174  // Begin the output
1175  if (renderer && !renderer->BeginDocument(unknown_title_)) {
1176  pixDestroy(&pix);
1177  return false;
1178  }
1179 
1180  // Produce output
1181  r = (tiff) ?
1182  ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config,
1183  timeout_millisec, renderer,
1185  ProcessPage(pix, 0, filename, retry_config,
1186  timeout_millisec, renderer);
1187 
1188  // Clean up memory as needed
1189  pixDestroy(&pix);
1190 
1191  // End the output
1192  if (!r || (renderer && !renderer->EndDocument())) {
1193  return false;
1194  }
1195  return true;
1196 }
bool ProcessPage(Pix *pix, int page_index, const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1198
Definition: strngs.h:45
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
const char * c_str() const
Definition: strngs.cpp:205
int tesseract::TessBaseAPI::Recognize ( ETEXT_DESC monitor)

Recognize the image from SetAndThresholdImage, generating Tesseract internal structures. Returns 0 on success. Optional. The Get*Text functions below will call Recognize if needed. After Recognize, the output is kept internally until the next SetImage.

Recognize the tesseract global image and return the result as Tesseract internal structures.

Definition at line 824 of file baseapi.cpp.

824  {
825  if (tesseract_ == nullptr)
826  return -1;
827  if (FindLines() != 0)
828  return -1;
829  delete page_res_;
830  if (block_list_->empty()) {
831  page_res_ = new PAGE_RES(false, block_list_,
833  return 0; // Empty page.
834  }
835 
837  recognition_done_ = true;
838 #ifndef DISABLED_LEGACY_ENGINE
843  } else
844 #endif // ndef DISABLED_LEGACY_ENGINE
845  {
848  }
849 
850  if (page_res_ == nullptr) {
851  return -1;
852  }
853 
857  return 0;
858  }
859 #ifndef DISABLED_LEGACY_ENGINE
862  return 0;
863  }
864 #endif // ndef DISABLED_LEGACY_ENGINE
865 
866  if (truth_cb_ != nullptr) {
867  tesseract_->wordrec_run_blamer.set_value(true);
868  auto *page_it = new PageIterator(
873  image_height_, page_it, this->tesseract()->pix_grey());
874  delete page_it;
875  }
876 
877  int result = 0;
879  #ifndef GRAPHICS_DISABLED
881  #endif // GRAPHICS_DISABLED
882  // The page_res is invalid after an interactive session, so cleanup
883  // in a way that lets us continue to the next page without crashing.
884  delete page_res_;
885  page_res_ = nullptr;
886  return -1;
887  #ifndef DISABLED_LEGACY_ENGINE
889  STRING fontname;
890  ExtractFontName(*output_file_, &fontname);
892  } else if (tesseract_->tessedit_ambigs_training) {
893  FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
894  // OCR the page segmented into words by tesseract.
896  *input_file_, page_res_, monitor, training_output_file);
897  fclose(training_output_file);
898  #endif // ndef DISABLED_LEGACY_ENGINE
899  } else {
900  // Now run the main recognition.
901  bool wait_for_text = true;
902  GetBoolVariable("paragraph_text_based", &wait_for_text);
903  if (!wait_for_text) DetectParagraphs(false);
904  if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) {
905  if (wait_for_text) DetectParagraphs(true);
906  } else {
907  result = -1;
908  }
909  }
910  return result;
911 }
STRING * input_file_
Name used by training code.
Definition: baseapi.h:896
Definition: strngs.h:45
bool wordrec_run_blamer
Definition: wordrec.h:237
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:303
void ExtractFontName(const STRING &filename, STRING *fontname)
Definition: blobclass.cpp:46
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
Definition: applybox.cpp:807
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:300
virtual void Run(A1, A2, A3, A4)=0
TESS_LOCAL void DetectParagraphs(bool after_text_recognition)
Definition: baseapi.cpp:2236
TruthCallback * truth_cb_
Definition: baseapi.h:902
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
Dict & getDict() override
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:901
int GetScaledYResolution() const
Definition: thresholder.h:92
Tesseract * tesseract() const
Definition: baseapi.h:801
void TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:43
TESS_LOCAL int FindLines()
Definition: baseapi.cpp:2026
bool tessedit_resegment_from_line_boxes
bool AnyLSTMLang() const
Pix * pix_grey() const
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
STRING * output_file_
Name used by debug code.
Definition: baseapi.h:897
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
void pgeditor_main(int width, int height, PAGE_RES *page_res)
Definition: pgedit.cpp:380
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:109
void CorrectClassifyWords(PAGE_RES *page_res)
Definition: applybox.cpp:780
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:894
FILE * init_recog_training(const STRING &fname)
int tesseract::TessBaseAPI::RecognizeForChopTest ( ETEXT_DESC * monitor)

Methods to retrieve information after SetAndThresholdImage(), Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) Variant on Recognize used for testing the chopper.

Tests the chopper by exhaustively running chop_one_blob.

Definition at line 915 of file baseapi.cpp.

915  {
916  if (tesseract_ == nullptr)
917  return -1;
918  if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
919  tprintf("Please call SetImage before attempting recognition.\n");
920  return -1;
921  }
922  if (page_res_ != nullptr)
923  ClearResults();
924  if (FindLines() != 0)
925  return -1;
926  // Additional conditions under which chopper test cannot be run
927  if (tesseract_->interactive_display_mode) return -1;
928 
929  recognition_done_ = true;
930 
931  page_res_ = new PAGE_RES(false, block_list_,
933 
934  PAGE_RES_IT page_res_it(page_res_);
935 
936  while (page_res_it.word() != nullptr) {
937  WERD_RES *word_res = page_res_it.word();
938  GenericVector<TBOX> boxes;
939  tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block,
940  page_res_it.row()->row, word_res);
941  page_res_it.forward();
942  }
943  return 0;
944 }
bool IsEmpty() const
Return true if no image has been set.
Definition: thresholder.cpp:53
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:243
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:901
TESS_LOCAL int FindLines()
Definition: baseapi.cpp:2026
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
WERD * word
Definition: pageres.h:188
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:894
void tesseract::TessBaseAPI::RunAdaptiveClassifier ( TBLOB * blob,
int  num_max_matches,
int *  unichar_ids,
float *  ratings,
int *  num_matches_returned 
)

Method to run adaptive classifier on a blob. It returns at most num_max_matches results.

Method to run adaptive classifier on a blob.

Definition at line 2618 of file baseapi.cpp.

2622  {
2623  auto* choices = new BLOB_CHOICE_LIST;
2624  tesseract_->AdaptiveClassifier(blob, choices);
2625  BLOB_CHOICE_IT choices_it(choices);
2626  int& index = *num_matches_returned;
2627  index = 0;
2628  for (choices_it.mark_cycle_pt();
2629  !choices_it.cycled_list() && index < num_max_matches;
2630  choices_it.forward()) {
2631  BLOB_CHOICE* choice = choices_it.data();
2632  unichar_ids[index] = choice->unichar_id();
2633  ratings[index] = choice->rating();
2634  ++index;
2635  }
2636  *num_matches_returned = index;
2637  delete choices;
2638 }
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
float rating() const
Definition: ratngs.h:80
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:191
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
void tesseract::TessBaseAPI::set_min_orientation_margin ( double  margin)

Definition at line 2173 of file baseapi.cpp.

2173  {
2174  tesseract_->min_orientation_margin.set_value(margin);
2175 }
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
void tesseract::TessBaseAPI::SetDictFunc ( DictFunc  f)

Sets Dict::letter_is_okay_ function to point to the given function.

Definition at line 1922 of file baseapi.cpp.

1922  {
1923  if (tesseract_ != nullptr) {
1925  }
1926 }
Dict & getDict() override
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:363
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
void tesseract::TessBaseAPI::SetFillLatticeFunc ( FillLatticeFunc  f)

Sets Wordrec::fill_lattice_ function to point to the given function.

Definition at line 1949 of file baseapi.cpp.

1949  {
1950  if (tesseract_ != nullptr) tesseract_->fill_lattice_ = f;
1951 }
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:485
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
void tesseract::TessBaseAPI::SetImage ( const unsigned char *  imagedata,
int  width,
int  height,
int  bytes_per_pixel,
int  bytes_per_line 
)

Provide an image for Tesseract to recognize. Format is as TesseractRect above. Copies the image buffer and converts to Pix. SetImage clears all recognition results, and sets the rectangle to the full image, so it may be followed immediately by a GetUTF8Text, and it will automatically perform recognition.

Definition at line 574 of file baseapi.cpp.

576  {
577  if (InternalSetImage()) {
578  thresholder_->SetImage(imagedata, width, height,
579  bytes_per_pixel, bytes_per_line);
581  }
582 }
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:65
void SetInputImage(Pix *pix)
Definition: baseapi.cpp:948
TESS_LOCAL bool InternalSetImage()
Definition: baseapi.cpp:1955
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
void tesseract::TessBaseAPI::SetImage ( Pix *  pix)

Provide an image for Tesseract to recognize. As with SetImage above, Tesseract takes its own copy of the image, so it need not persist until after Recognize. Pix vs raw, which to use? Use Pix where possible. Tesseract uses Pix as its internal representation and it is therefore more efficient to provide a Pix directly.

Definition at line 599 of file baseapi.cpp.

599  {
600  if (InternalSetImage()) {
601  if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) {
602  // remove alpha channel from png
603  Pix* p1 = pixRemoveAlpha(pix);
604  pixSetSpp(p1, 3);
605  (void)pixCopy(pix, p1);
606  pixDestroy(&p1);
607  }
608  thresholder_->SetImage(pix);
610  }
611 }
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:65
void SetInputImage(Pix *pix)
Definition: baseapi.cpp:948
TESS_LOCAL bool InternalSetImage()
Definition: baseapi.cpp:1955
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
void tesseract::TessBaseAPI::SetProbabilityInContextFunc ( ProbabilityInContextFunc  f)

Sets Dict::probability_in_context_ function to point to the given function.

Sets Dict::probability_in_context_ function to point to the given function.

Parameters
f: A single function that returns the probability of the current "character" (in general a utf-8 string), given the context of a previous utf-8 string.

Definition at line 1936 of file baseapi.cpp.

1936  {
1937  if (tesseract_ != nullptr) {
1939  // Set it for the sublangs too.
1940  int num_subs = tesseract_->num_sub_langs();
1941  for (int i = 0; i < num_subs; ++i) {
1943  }
1944  }
1945 }
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:375
Tesseract * get_sub_lang(int index) const
int num_sub_langs() const
Dict & getDict() override
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
void tesseract::TessBaseAPI::SetRectangle ( int  left,
int  top,
int  width,
int  height 
)

Restrict recognition to a sub-rectangle of the image. Call after SetImage. Each SetRectangle clears the recognition results so multiple rectangles can be recognized with the same image.

Definition at line 618 of file baseapi.cpp.

618  {
619  if (thresholder_ == nullptr)
620  return;
621  thresholder_->SetRectangle(left, top, width, height);
622  ClearResults();
623 }
void SetRectangle(int left, int top, int width, int height)
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
void tesseract::TessBaseAPI::SetSourceResolution ( int  ppi)

Set the resolution of the source image in pixels per inch so font size information can be calculated in results. Call this after SetImage().

Definition at line 584 of file baseapi.cpp.

584  {
585  if (thresholder_)
587  else
588  tprintf("Please call SetImage before SetSourceResolution.\n");
589 }
void SetSourceYResolution(int ppi)
Definition: thresholder.h:85
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
void tesseract::TessBaseAPI::SetThresholder ( ImageThresholder * thresholder)
inline

In extreme cases only, usually with a subclass of Thresholder, it is possible to provide a different Thresholder. The Thresholder may be preloaded with an image, settings etc, or they may be set after. Note that Tesseract takes ownership of the Thresholder and will delete it when it is replaced or the API is destructed.

Definition at line 365 of file baseapi.h.

365  {
366  delete thresholder_;
367  thresholder_ = thresholder;
368  ClearResults();
369  }
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:892
Tesseract* tesseract::TessBaseAPI::tesseract ( ) const
inline

Definition at line 801 of file baseapi.h.

801 { return tesseract_; }
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888