tesseract  4.1.0
tesseractclass.h
Go to the documentation of this file.
1 // File: tesseractclass.h
3 // Description: The Tesseract class. It holds/owns everything needed
4 // to run Tesseract on a single language, and also a set of
5 // sub-Tesseracts to run sub-languages. For thread safety, *every*
6 // global variable goes in here, directly, or indirectly.
7 // This makes it safe to run multiple Tesseracts in different
8 // threads in parallel, and keeps the different language
9 // instances separate.
10 // Author: Ray Smith
11 //
12 // (C) Copyright 2008, Google Inc.
13 // Licensed under the Apache License, Version 2.0 (the "License");
14 // you may not use this file except in compliance with the License.
15 // You may obtain a copy of the License at
16 // http://www.apache.org/licenses/LICENSE-2.0
17 // Unless required by applicable law or agreed to in writing, software
18 // distributed under the License is distributed on an "AS IS" BASIS,
19 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 // See the License for the specific language governing permissions and
21 // limitations under the License.
22 //
24 
25 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_
26 #define TESSERACT_CCMAIN_TESSERACTCLASS_H_
27 
28 #include <cstdint> // for int16_t, int32_t, uint16_t
29 #include <cstdio> // for FILE
30 #include "allheaders.h" // for pixDestroy, pixGetWidth, pixGetHe...
31 #include "control.h" // for ACCEPTABLE_WERD_TYPE
32 #include "debugpixa.h" // for DebugPixa
33 #include "devanagari_processing.h" // for ShiroRekhaSplitter
34 #ifndef DISABLED_LEGACY_ENGINE
35 #include "docqual.h" // for GARBAGE_LEVEL
36 #endif
37 #include "genericvector.h" // for GenericVector, PointerVector
38 #include "pageres.h" // for WERD_RES (ptr only), PAGE_RES (pt...
39 #include "params.h" // for BOOL_VAR_H, BoolParam, DoubleParam
40 #include "points.h" // for FCOORD
41 #include "publictypes.h" // for OcrEngineMode, PageSegMode, OEM_L...
42 #include "ratngs.h" // for ScriptPos, WERD_CHOICE (ptr only)
43 #include "strngs.h" // for STRING
44 #include "tessdatamanager.h" // for TessdataManager
45 #include "textord.h" // for Textord
46 #include "unichar.h" // for UNICHAR_ID
47 #include "wordrec.h" // for Wordrec
48 
49 class BLOCK_LIST;
50 class ETEXT_DESC;
51 struct OSResults;
52 class PAGE_RES;
53 class PAGE_RES_IT;
54 struct Pix;
55 class ROW;
56 class SVMenuNode;
57 class TBOX;
58 class TO_BLOCK_LIST;
59 class WERD;
60 class WERD_CHOICE;
61 class WERD_RES;
62 
63 // Top-level class for all tesseract global instance data.
64 // This class either holds or points to all data used by an instance
65 // of Tesseract, including the memory allocator. When this is
66 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
67 //
68 // NOTE to developers: Do not create cyclic dependencies through this class!
69 // The directory dependency tree must remain a tree! To keep this clean,
70 // lower-level code (eg in ccutil, the bottom level) must never need to
71 // know about the content of a higher-level directory.
72 // The following scheme will grant the easiest access to lower-level
73 // global members without creating a cyclic dependency:
74 //
75 // Class Hierarchy (^ = inheritance):
76 //
77 // CCUtil (ccutil/ccutil.h)
78 // ^ Members include: UNICHARSET
79 // CUtil (cutil/cutil_class.h)
80 // ^ Members include: TBLOB*, TEXTBLOCK*
81 // CCStruct (ccstruct/ccstruct.h)
82 // ^ Members include: Image
83 // Classify (classify/classify.h)
84 // ^ Members include: Dict
85 // WordRec (wordrec/wordrec.h)
86 // ^ Members include: WERD*, DENORM*
87 // Tesseract (ccmain/tesseractclass.h)
88 // Members include: Pix*
89 //
90 // Other important classes:
91 //
92 // TessBaseAPI (api/baseapi.h)
93 // Members include: BLOCK_LIST*, PAGE_RES*,
94 // Tesseract*, ImageThresholder*
95 // Dict (dict/dict.h)
96 // Members include: Image* (private)
97 //
98 // NOTE: that each level contains members that correspond to global
99 // data that is defined (and used) at that level, not necessarily where
100 // the type is defined so for instance:
101 // BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
102 // goes inside the Textord class, not the cc_util class.
103 
104 namespace tesseract {
105 
106 class ColumnFinder;
107 class DocumentData;
108 class EquationDetect;
109 class ImageData;
110 class LSTMRecognizer;
111 class Tesseract;
112 
113 // A collection of various variables for statistics and debugging.
117  doc_blob_quality(0),
118  doc_outline_errs(0),
119  doc_char_quality(0),
120  good_char_count(0),
122  word_count(0),
123  dict_words(0),
124  tilde_crunch_written(false),
125  last_char_was_newline(true),
126  last_char_was_tilde(false),
128 
135  int32_t word_count; // count of word in the document
136  int32_t dict_words; // number of dicitionary words in the document
137  STRING dump_words_str; // accumulator used by dump_words()
138  // Flags used by write_results()
143 };
144 
145 // Struct to hold all the pointers to relevant data for processing a word.
146 struct WordData {
148  : word(nullptr), row(nullptr), block(nullptr), prev_word(nullptr) {}
149  explicit WordData(const PAGE_RES_IT& page_res_it)
150  : word(page_res_it.word()),
151  row(page_res_it.row()->row),
152  block(page_res_it.block()->block),
153  prev_word(nullptr) {}
154  WordData(BLOCK* block_in, ROW* row_in, WERD_RES* word_res)
155  : word(word_res), row(row_in), block(block_in), prev_word(nullptr) {}
156 
162 };
163 
164 // Definition of a Tesseract WordRecognizer. The WordData provides the context
165 // of row/block, in_word holds an initialized, possibly pre-classified word,
166 // that the recognizer may or may not consume (but if so it sets
167 // *in_word=nullptr) and produces one or more output words in out_words, which
168 // may be the consumed in_word, or may be generated independently. This api
169 // allows both a conventional tesseract classifier to work, or a line-level
170 // classifier that generates multiple words from a merged input.
171 using WordRecognizer = void (Tesseract::*)(const WordData&, WERD_RES**,
173 
174 class Tesseract : public Wordrec {
175  public:
176  Tesseract();
177  ~Tesseract() override;
178 
179  // Return appropriate dictionary
180  Dict& getDict() override;
181 
182  // Clear as much used memory as possible without resetting the adaptive
183  // classifier or losing any other classifier data.
184  void Clear();
185  // Clear all memory of adaption for this and all subclassifiers.
186  void ResetAdaptiveClassifier();
187  // Clear the document dictionary for this and all subclassifiers.
188  void ResetDocumentDictionary();
189 
190  // Set the equation detector.
191  void SetEquationDetect(EquationDetect* detector);
192 
193  // Simple accessors.
194  const FCOORD& reskew() const {
195  return reskew_;
196  }
197  // Destroy any existing pix and return a pointer to the pointer.
199  pixDestroy(&pix_binary_);
200  return &pix_binary_;
201  }
202  Pix* pix_binary() const {
203  return pix_binary_;
204  }
205  Pix* pix_grey() const {
206  return pix_grey_;
207  }
208  void set_pix_grey(Pix* grey_pix) {
209  pixDestroy(&pix_grey_);
210  pix_grey_ = grey_pix;
211  }
212  Pix* pix_original() const {
213  return pix_original_;
214  }
215  // Takes ownership of the given original_pix.
216  void set_pix_original(Pix* original_pix) {
217  pixDestroy(&pix_original_);
218  pix_original_ = original_pix;
219  // Clone to sublangs as well.
220  for (int i = 0; i < sub_langs_.size(); ++i) {
221  sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
222  : nullptr);
223  }
224  }
225  // Returns a pointer to a Pix representing the best available resolution image
226  // of the page, with best available bit depth as second priority. Result can
227  // be of any bit depth, but never color-mapped, as that has always been
228  // removed. Note that in grey and color, 0 is black and 255 is
229  // white. If the input was binary, then black is 1 and white is 0.
230  // To tell the difference pixGetDepth() will return 32, 8 or 1.
231  // In any case, the return value is a borrowed Pix, and should not be
232  // deleted or pixDestroyed.
233  Pix* BestPix() const {
234  if (pixGetWidth(pix_original_) == ImageWidth()) {
235  return pix_original_;
236  } else if (pix_grey_ != nullptr) {
237  return pix_grey_;
238  } else {
239  return pix_binary_;
240  }
241  }
242  void set_pix_thresholds(Pix* thresholds) {
243  pixDestroy(&pix_thresholds_);
244  pix_thresholds_ = thresholds;
245  }
246  int source_resolution() const {
247  return source_resolution_;
248  }
249  void set_source_resolution(int ppi) {
250  source_resolution_ = ppi;
251  }
252  int ImageWidth() const {
253  return pixGetWidth(pix_binary_);
254  }
255  int ImageHeight() const {
256  return pixGetHeight(pix_binary_);
257  }
258  Pix* scaled_color() const {
259  return scaled_color_;
260  }
261  int scaled_factor() const {
262  return scaled_factor_;
263  }
264  void SetScaledColor(int factor, Pix* color) {
265  scaled_factor_ = factor;
266  scaled_color_ = color;
267  }
268  const Textord& textord() const {
269  return textord_;
270  }
272  return &textord_;
273  }
274 
275  bool right_to_left() const {
276  return right_to_left_;
277  }
278  int num_sub_langs() const {
279  return sub_langs_.size();
280  }
281  Tesseract* get_sub_lang(int index) const {
282  return sub_langs_[index];
283  }
284  // Returns true if any language uses Tesseract (as opposed to LSTM).
285  bool AnyTessLang() const {
286  if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY)
287  return true;
288  for (int i = 0; i < sub_langs_.size(); ++i) {
289  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY)
290  return true;
291  }
292  return false;
293  }
294  // Returns true if any language uses the LSTM.
295  bool AnyLSTMLang() const {
296  if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY)
297  return true;
298  for (int i = 0; i < sub_langs_.size(); ++i) {
299  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
300  return true;
301  }
302  }
303  return false;
304  }
305 
306  void SetBlackAndWhitelist();
307 
308  // Perform steps to prepare underlying binary image/other data structures for
309  // page segmentation. Uses the strategy specified in the global variable
310  // pageseg_devanagari_split_strategy for performing splitting while preparing for
311  // page segmentation.
312  void PrepareForPageseg();
313 
314  // Perform steps to prepare underlying binary image/other data structures for
315  // Tesseract OCR. The current segmentation is required by this method.
316  // Uses the strategy specified in the global variable
317  // ocr_devanagari_split_strategy for performing splitting while preparing for
318  // Tesseract ocr.
319  void PrepareForTessOCR(BLOCK_LIST* block_list, Tesseract* osd_tess,
320  OSResults* osr);
321 
322  int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
323  Tesseract* osd_tess, OSResults* osr);
324  void SetupWordScripts(BLOCK_LIST* blocks);
325  int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
326  TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
327  Tesseract* osd_tess, OSResults* osr);
328  ColumnFinder* SetupPageSegAndDetectOrientation(
329  PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
330  OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
331  Pix** music_mask_pix);
332  // par_control.cpp
333  void PrerecAllWordsPar(const GenericVector<WordData>& words);
334 
336  // Generates training data for training a line recognizer, eg LSTM.
337  // Breaks the page into lines, according to the boxes, and writes them to a
338  // serialized DocumentData based on output_basename.
339  void TrainLineRecognizer(const STRING& input_imagename,
340  const STRING& output_basename,
341  BLOCK_LIST* block_list);
342  // Generates training data for training a line recognizer, eg LSTM.
343  // Breaks the boxes into lines, normalizes them, converts to ImageData and
344  // appends them to the given training_data.
345  void TrainFromBoxes(const GenericVector<TBOX>& boxes,
346  const GenericVector<STRING>& texts,
347  BLOCK_LIST* block_list, DocumentData* training_data);
348 
349  // Returns an ImageData containing the image of the given textline,
350  // and ground truth boxes/truth text if available in the input.
351  // The image is not normalized in any way.
352  ImageData* GetLineData(const TBOX& line_box, const GenericVector<TBOX>& boxes,
353  const GenericVector<STRING>& texts, int start_box,
354  int end_box, const BLOCK& block);
355  // Helper gets the image of a rectangle, using the block.re_rotation() if
356  // needed to get to the image, and rotating the result back to horizontal
357  // layout. (CJK characters will be on their left sides) The vertical text flag
358  // is set in the returned ImageData if the text was originally vertical, which
359  // can be used to invoke a different CJK recognition engine. The revised_box
360  // is also returned to enable calculation of output bounding boxes.
361  ImageData* GetRectImage(const TBOX& box, const BLOCK& block, int padding,
362  TBOX* revised_box) const;
363  // Recognizes a word or group of words, converting to WERD_RES in *words.
364  // Analogous to classify_word_pass1, but can handle a group of words as well.
365  void LSTMRecognizeWord(const BLOCK& block, ROW* row, WERD_RES* word,
366  PointerVector<WERD_RES>* words);
367  // Apply segmentation search to the given set of words, within the constraints
368  // of the existing ratings matrix. If there is already a best_choice on a word
369  // leaves it untouched and just sets the done/accepted etc flags.
370  void SearchWords(PointerVector<WERD_RES>* words);
371 
373  bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box,
374  const char* word_config, int pass);
375  // Sets up the words ready for whichever engine is to be run
376  void SetupAllWordsPassN(int pass_n, const TBOX* target_word_box,
377  const char* word_config, PAGE_RES* page_res,
378  GenericVector<WordData>* words);
379  // Sets up the single word ready for whichever engine is to be run.
380  void SetupWordPassN(int pass_n, WordData* word);
381  // Runs word recognition on all the words.
382  bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, PAGE_RES_IT* pr_it,
383  GenericVector<WordData>* words);
384  bool recog_all_words(PAGE_RES* page_res, ETEXT_DESC* monitor,
385  const TBOX* target_word_box, const char* word_config,
386  int dopasses);
387  void rejection_passes(PAGE_RES* page_res, ETEXT_DESC* monitor,
388  const TBOX* target_word_box, const char* word_config);
389  void bigram_correction_pass(PAGE_RES* page_res);
390  void blamer_pass(PAGE_RES* page_res);
391  // Sets script positions and detects smallcaps on all output words.
392  void script_pos_pass(PAGE_RES* page_res);
393  // Helper to recognize the word using the given (language-specific) tesseract.
394  // Returns positive if this recognizer found more new best words than the
395  // number kept from best_words.
396  int RetryWithLanguage(const WordData& word_data, WordRecognizer recognizer,
397  bool debug, WERD_RES** in_word,
398  PointerVector<WERD_RES>* best_words);
399  // Moves good-looking "noise"/diacritics from the reject list to the main
400  // blob list on the current word. Returns true if anything was done, and
401  // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
402  bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
403  bool* make_next_word_fuzzy);
404  // Attempts to put noise/diacritic outlines into the blobs that they overlap.
405  // Input: a set of noisy outlines that probably belong to the real_word.
406  // Output: outlines that overlapped blobs are set to nullptr and put back into
407  // the word, either in the blobs or in the reject list.
408  void AssignDiacriticsToOverlappingBlobs(
409  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
410  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
411  GenericVector<bool>* overlapped_any_blob,
412  GenericVector<C_BLOB*>* target_blobs);
413  // Attempts to assign non-overlapping outlines to their nearest blobs or
414  // make new blobs out of them.
415  void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE*>& outlines,
416  int pass, WERD* real_word, PAGE_RES_IT* pr_it,
417  GenericVector<bool>* word_wanted,
418  GenericVector<C_BLOB*>* target_blobs);
419  // Starting with ok_outlines set to indicate which outlines overlap the blob,
420  // chooses the optimal set (approximately) and returns true if any outlines
421  // are desired, in which case ok_outlines indicates which ones.
422  bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
423  PAGE_RES_IT* pr_it, C_BLOB* blob,
424  const GenericVector<C_OUTLINE*>& outlines,
425  int num_outlines,
426  GenericVector<bool>* ok_outlines);
427  // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
428  // the inclusion of the outlines, and returns the certainty of the raw choice.
429  float ClassifyBlobPlusOutlines(const GenericVector<bool>& ok_outlines,
430  const GenericVector<C_OUTLINE*>& outlines,
431  int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
432  STRING* best_str);
433  // Classifies the given blob (part of word_data->word->word) as an individual
434  // word, using languages, chopper etc, returning only the certainty of the
435  // best raw choice, and undoing all the work done to fake out the word.
436  float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
437  STRING* best_str, float* c2);
438  void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
439  WordData* word_data);
440  void classify_word_pass1(const WordData& word_data, WERD_RES** in_word,
441  PointerVector<WERD_RES>* out_words);
442  void recog_pseudo_word(PAGE_RES* page_res, // blocks to check
443  TBOX& selection_box);
444 
445  void fix_rep_char(PAGE_RES_IT* page_res_it);
446 
447  ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set,
448  const char* s,
449  const char* lengths);
450  void match_word_pass_n(int pass_n, WERD_RES* word, ROW* row, BLOCK* block);
451  void classify_word_pass2(const WordData& word_data, WERD_RES** in_word,
452  PointerVector<WERD_RES>* out_words);
453  void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES* word,
454  WERD_RES* new_word);
455  bool RunOldFixXht(WERD_RES* word, BLOCK* block, ROW* row);
456  bool TrainedXheightFix(WERD_RES* word, BLOCK* block, ROW* row);
457  // Runs recognition with the test baseline shift and x-height and returns true
458  // if there was an improvement in recognition result.
459  bool TestNewNormalization(int original_misfits, float baseline_shift,
460  float new_x_ht, WERD_RES* word, BLOCK* block,
461  ROW* row);
462  bool recog_interactive(PAGE_RES_IT* pr_it);
463 
464  // Set fonts of this word.
465  void set_word_fonts(WERD_RES* word);
466  void font_recognition_pass(PAGE_RES* page_res);
467  void dictionary_correction_pass(PAGE_RES* page_res);
468  bool check_debug_pt(WERD_RES* word, int location);
469 
471  bool SubAndSuperscriptFix(WERD_RES* word_res);
472  void GetSubAndSuperscriptCandidates(
473  const WERD_RES* word, int* num_rebuilt_leading, ScriptPos* leading_pos,
474  float* leading_certainty, int* num_rebuilt_trailing,
475  ScriptPos* trailing_pos, float* trailing_certainty, float* avg_certainty,
476  float* unlikely_threshold);
477  WERD_RES* TrySuperscriptSplits(int num_chopped_leading,
478  float leading_certainty, ScriptPos leading_pos,
479  int num_chopped_trailing,
480  float trailing_certainty,
481  ScriptPos trailing_pos, WERD_RES* word,
482  bool* is_good, int* retry_leading,
483  int* retry_trailing);
484  bool BelievableSuperscript(bool debug, const WERD_RES& word,
485  float certainty_threshold, int* left_ok,
486  int* right_ok) const;
487 
489 
490  void output_pass(PAGE_RES_IT& page_res_it, const TBOX* target_word_box);
491  void write_results(PAGE_RES_IT& page_res_it, // full info
492  char newline_type, // type of newline
493  bool force_eol // override tilde crunch?
494  );
495  void set_unlv_suspects(WERD_RES* word);
496  UNICHAR_ID get_rep_char(WERD_RES* word); // what char is repeated?
497  bool acceptable_number_string(const char* s, const char* lengths);
498  int16_t count_alphanums(const WERD_CHOICE& word);
499  int16_t count_alphas(const WERD_CHOICE& word);
500 
501  void read_config_file(const char* filename, SetParamConstraint constraint);
502  // Initialize for potentially a set of languages defined by the language
503  // string and recursively any additional languages required by any language
504  // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
505  // See init_tesseract_internal for args.
506  int init_tesseract(const char* arg0, const char* textbase,
507  const char* language, OcrEngineMode oem, char** configs,
508  int configs_size, const GenericVector<STRING>* vars_vec,
509  const GenericVector<STRING>* vars_values,
510  bool set_only_init_params, TessdataManager* mgr);
511  int init_tesseract(const char* datapath, const char* language,
512  OcrEngineMode oem) {
513  TessdataManager mgr;
514  return init_tesseract(datapath, nullptr, language, oem, nullptr, 0, nullptr,
515  nullptr, false, &mgr);
516  }
517  // Common initialization for a single language.
518  // arg0 is the datapath for the tessdata directory, which could be the
519  // path of the tessdata directory with no trailing /, or (if tessdata
520  // lives in the same directory as the executable) the path of the executable,
521  // hence the name arg0.
522  // textbase is an optional output file basename (used only for training)
523  // language is the language code to load.
524  // oem controls which engine(s) will operate on the image
525  // configs (argv) is an array of config filenames to load variables from.
526  // May be nullptr.
527  // configs_size (argc) is the number of elements in configs.
528  // vars_vec is an optional vector of variables to set.
529  // vars_values is an optional corresponding vector of values for the variables
530  // in vars_vec.
531  // If set_only_init_params is true, then only the initialization variables
532  // will be set.
533  int init_tesseract_internal(const char* arg0, const char* textbase,
534  const char* language, OcrEngineMode oem,
535  char** configs, int configs_size,
536  const GenericVector<STRING>* vars_vec,
537  const GenericVector<STRING>* vars_values,
538  bool set_only_init_params, TessdataManager* mgr);
539 
540  // Set the universal_id member of each font to be unique among all
541  // instances of the same font loaded.
542  void SetupUniversalFontIds();
543 
544  int init_tesseract_lm(const char* arg0, const char* textbase,
545  const char* language, TessdataManager* mgr);
546 
547  void recognize_page(STRING& image_name);
548  void end_tesseract();
549 
550  bool init_tesseract_lang_data(const char* arg0, const char* textbase,
551  const char* language, OcrEngineMode oem,
552  char** configs, int configs_size,
553  const GenericVector<STRING>* vars_vec,
554  const GenericVector<STRING>* vars_values,
555  bool set_only_init_params,
556  TessdataManager* mgr);
557 
558  void ParseLanguageString(const char* lang_str, GenericVector<STRING>* to_load,
559  GenericVector<STRING>* not_to_load);
560 
562  SVMenuNode* build_menu_new();
563 #ifndef GRAPHICS_DISABLED
564  void pgeditor_main(int width, int height, PAGE_RES* page_res);
565 #endif // GRAPHICS_DISABLED
566  void process_image_event( // action in image win
567  const SVEvent& event);
568  bool process_cmd_win_event( // UI command semantics
569  int32_t cmd_event, // which menu item?
570  char* new_value // any prompt data
571  );
572  void debug_word(PAGE_RES* page_res, const TBOX& selection_box);
573  void do_re_display(
574  bool (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it));
575  bool word_display(PAGE_RES_IT* pr_it);
576  bool word_bln_display(PAGE_RES_IT* pr_it);
577  bool word_blank_and_set_display(PAGE_RES_IT* pr_its);
578  bool word_set_display(PAGE_RES_IT* pr_it);
579  // #ifndef GRAPHICS_DISABLED
580  bool word_dumper(PAGE_RES_IT* pr_it);
581  // #endif // GRAPHICS_DISABLED
582  void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
584  // make rej map for word
585  void make_reject_map(WERD_RES* word, ROW* row, int16_t pass);
586  bool one_ell_conflict(WERD_RES* word_res, bool update_map);
587  int16_t first_alphanum_index(const char* word, const char* word_lengths);
588  int16_t first_alphanum_offset(const char* word, const char* word_lengths);
589  int16_t alpha_count(const char* word, const char* word_lengths);
590  bool word_contains_non_1_digit(const char* word, const char* word_lengths);
591  void dont_allow_1Il(WERD_RES* word);
592  int16_t count_alphanums( // how many alphanums
593  WERD_RES* word);
594  void flip_0O(WERD_RES* word);
595  bool non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
596  bool non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
597  bool repeated_nonalphanum_wd(WERD_RES* word, ROW* row);
598  void nn_match_word( // Match a word
599  WERD_RES* word, ROW* row);
600  void nn_recover_rejects(WERD_RES* word, ROW* row);
601  void set_done( // set done flag
602  WERD_RES* word, int16_t pass);
603  int16_t safe_dict_word(const WERD_RES* werd_res); // is best_choice in dict?
604  void flip_hyphens(WERD_RES* word);
605  void reject_I_1_L(WERD_RES* word);
606  void reject_edge_blobs(WERD_RES* word);
607  void reject_mostly_rejects(WERD_RES* word);
609  bool word_adaptable( // should we adapt?
610  WERD_RES* word, uint16_t mode);
611 
613  void recog_word_recursive(WERD_RES* word);
614  void recog_word(WERD_RES* word);
615  void split_and_recog_word(WERD_RES* word);
616  void split_word(WERD_RES* word, int split_pt, WERD_RES** right_piece,
617  BlamerBundle** orig_blamer_bundle) const;
618  void join_words(WERD_RES* word, WERD_RES* word2, BlamerBundle* orig_bb) const;
620  bool digit_or_numeric_punct(WERD_RES* word, int char_position);
621  int16_t eval_word_spacing(WERD_RES_LIST& word_res_list);
622  void match_current_words(WERD_RES_LIST& words, ROW* row, BLOCK* block);
623  int16_t fp_eval_word_spacing(WERD_RES_LIST& word_res_list);
624  void fix_noisy_space_list(WERD_RES_LIST& best_perm, ROW* row, BLOCK* block);
625  void fix_fuzzy_space_list(WERD_RES_LIST& best_perm, ROW* row, BLOCK* block);
626  void fix_sp_fp_word(WERD_RES_IT& word_res_it, ROW* row, BLOCK* block);
627  void fix_fuzzy_spaces( // find fuzzy words
628  ETEXT_DESC* monitor, // progress monitor
629  int32_t word_count, // count of words in doc
630  PAGE_RES* page_res);
631  void dump_words(WERD_RES_LIST& perm, int16_t score, int16_t mode,
632  bool improved);
633  bool fixspace_thinks_word_done(WERD_RES* word);
634  int16_t worst_noise_blob(WERD_RES* word_res, float* worst_noise_score);
635  float blob_noise_score(TBLOB* blob);
636  void break_noisiest_blob_word(WERD_RES_LIST& words);
638 #ifndef DISABLED_LEGACY_ENGINE
639  GARBAGE_LEVEL garbage_word(WERD_RES* word, bool ok_dict_word);
640  bool potential_word_crunch(WERD_RES* word, GARBAGE_LEVEL garbage_level,
641  bool ok_dict_word);
642 #endif
643  void tilde_crunch(PAGE_RES_IT& page_res_it);
644  void unrej_good_quality_words( // unreject potential
645  PAGE_RES_IT& page_res_it);
646  void doc_and_block_rejection( // reject big chunks
647  PAGE_RES_IT& page_res_it, bool good_quality_doc);
648  void quality_based_rejection(PAGE_RES_IT& page_res_it, bool good_quality_doc);
649  void convert_bad_unlv_chs(WERD_RES* word_res);
650  void tilde_delete(PAGE_RES_IT& page_res_it);
651  int16_t word_blob_quality(WERD_RES* word, ROW* row);
652  void word_char_quality(WERD_RES* word, ROW* row, int16_t* match_count,
653  int16_t* accepted_match_count);
654  void unrej_good_chs(WERD_RES* word, ROW* row);
655  int16_t count_outline_errs(char c, int16_t outline_count);
656  int16_t word_outline_errs(WERD_RES* word);
657 #ifndef DISABLED_LEGACY_ENGINE
658  bool terrible_word_crunch(WERD_RES* word, GARBAGE_LEVEL garbage_level);
659 #endif
660  CRUNCH_MODE word_deletable(WERD_RES* word, int16_t& delete_mode);
661  int16_t failure_count(WERD_RES* word);
662  bool noise_outlines(TWERD* word);
664  void process_selected_words(
665  PAGE_RES* page_res, // blocks to check
666  // function to call
667  TBOX& selection_box,
668  bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it));
670  void tess_add_doc_word( // test acceptability
671  WERD_CHOICE* word_choice // after context
672  );
673  void tess_segment_pass_n(int pass_n, WERD_RES* word);
674  bool tess_acceptable_word(WERD_RES* word);
675 
677  // Applies the box file based on the image name fname, and resegments
678  // the words in the block_list (page), with:
679  // blob-mode: one blob per line in the box file, words as input.
680  // word/line-mode: one blob per space-delimited unit after the #, and one word
681  // per line in the box file. (See comment above for box file format.)
682  // If find_segmentation is true, (word/line mode) then the classifier is used
683  // to re-segment words/lines to match the space-delimited truth string for
684  // each box. In this case, the input box may be for a word or even a whole
685  // text line, and the output words will contain multiple blobs corresponding
686  // to the space-delimited input string.
687  // With find_segmentation false, no classifier is needed, but the chopper
688  // can still be used to correctly segment touching characters with the help
689  // of the input boxes.
690  // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
691  // from normal classification, ie. with a word, chopped_word, rebuild_word,
692  // seam_array, denorm, box_word, and best_state, but NO best_choice or
693  // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
694  // Instead, the correct_text member of WERD_RES is set, and this may be later
695  // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
696  // is not required before calling ApplyBoxTraining.
697  PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation,
698  BLOCK_LIST* block_list);
699 
700  // Any row xheight that is significantly different from the median is set
701  // to the median.
702  void PreenXHeights(BLOCK_LIST* block_list);
703 
704  // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
705  // All fuzzy spaces are removed, and all the words are maximally chopped.
706  PAGE_RES* SetupApplyBoxes(const GenericVector<TBOX>& boxes,
707  BLOCK_LIST* block_list);
708  // Tests the chopper by exhaustively running chop_one_blob.
709  // The word_res will contain filled chopped_word, seam_array, denorm,
710  // box_word and best_state for the maximally chopped word.
711  void MaximallyChopWord(const GenericVector<TBOX>& boxes, BLOCK* block,
712  ROW* row, WERD_RES* word_res);
713  // Gather consecutive blobs that match the given box into the best_state
714  // and corresponding correct_text.
715  // Fights over which box owns which blobs are settled by pre-chopping and
716  // applying the blobs to box or next_box with the least non-overlap.
717  // Returns false if the box was in error, which can only be caused by
718  // failing to find an appropriate blob for a box.
719  // This means that occasionally, blobs may be incorrectly segmented if the
720  // chopper fails to find a suitable chop point.
721  bool ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
722  const TBOX& box, const TBOX* next_box,
723  const char* correct_text);
724  // Consumes every source blob that strongly overlaps the given box and
725  // puts them into a new word labelled with correct_text.
726  // When two boxes compete for the same blobs, the blobs go to whichever of
727  // box or next_box gives the least non-overlap.
728  // Returns false if the box was in error, which can only happen when no
729  // overlapping blob can be found for the box.
730  bool ResegmentWordBox(BLOCK_LIST* block_list, const TBOX& box,
731  const TBOX* next_box, const char* correct_text);
732  // Re-segments the words of page_res by running the classifier, trying to
733  // find the correct segmentation that yields the required string.
734  void ReSegmentByClassification(PAGE_RES* page_res);
735  // Converts a space-delimited string of utf8 text into a vector of UNICHAR_ID
736  // (class_ids). Returns false if an invalid UNICHAR_ID is encountered.
737  bool ConvertStringToUnichars(const char* utf8,
738  GenericVector<UNICHAR_ID>* class_ids);
739  // Re-segments the word so that the classifier output matches target_text.
740  // Returns false if the re-segmentation fails.
741  // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
742  // applies a full search on the classifier results to find the best classified
743  // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
744  // substitutions ARE used.
745  bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
746  WERD_RES* word_res);
747  // Recursive helper that looks for a match to target_text (starting at
748  // position text_index) in the choices (starting at position choices_pos).
749  // choices is an array of GenericVectors, of length choices_length; each
750  // element corresponds to a starting position in the word, and its
751  // GenericVector holds the classification results for runs of consecutive
752  // blobs: index 0 is a single blob, index 1 is 2 blobs, etc.
753  void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
754  int choices_pos, int choices_length,
755  const GenericVector<UNICHAR_ID>& target_text,
756  int text_index, float rating,
757  GenericVector<int>* segmentation, float* best_rating,
758  GenericVector<int>* best_segmentation);  // NOTE(review): best_rating/best_segmentation look like outputs - confirm.
759  // Counts the labelled words and the blobs inside them.
760  // Deletes every unused or emptied word, keeping a count of the unused ones.
761  // Resets the W_BOL and W_EOL flags correctly.
762  // Builds the rebuild_word and rebuilds the box_word.
763  void TidyUp(PAGE_RES* page_res);
764  // Logs a bad box, identified by its line in the box file and its box coords.
765  void ReportFailedBox(int boxfile_lineno, TBOX box, const char* box_ch,
766  const char* err_msg);
767  // Creates a fake best_choice entry, holding the correct text, in each WERD_RES.
768  void CorrectClassifyWords(PAGE_RES* page_res);
769  // Calls LearnWord to extract features for the labelled blobs in each word.
770  // The features are stored in an internal buffer.
771  void ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res);
772 
774  // Returns how many blob tops in this word are misfits.
775  int CountMisfitTops(WERD_RES* word_res);
776  // Returns a new x-height, in pixels (original image coords), that is
777  // maximally compatible with the result in word_res.
778  // Returns 0.0f if no x-height better than the current estimate is found.
779  // NOTE(review): baseline_shift looks like an output parameter - confirm.
780  float ComputeCompatibleXheight(WERD_RES* word_res, float* baseline_shift);
782  // TODO(ocr-team): Find and remove obsolete parameters. Each *_VAR_H below takes (name, val, comment).
783  BOOL_VAR_H(tessedit_resegment_from_boxes, false,
784  "Take segmentation and labeling from box file");
785  BOOL_VAR_H(tessedit_resegment_from_line_boxes, false,
786  "Conversion of word/line box file to char box file");
787  BOOL_VAR_H(tessedit_train_from_boxes, false,
788  "Generate training data from boxed chars");
789  BOOL_VAR_H(tessedit_make_boxes_from_boxes, false,
790  "Generate more boxes from boxed chars");
791  BOOL_VAR_H(tessedit_train_line_recognizer, false,
792  "Break input into lines and remap boxes if present");
793  BOOL_VAR_H(tessedit_dump_pageseg_images, false,
794  "Dump intermediate images made during page segmentation");
795  INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
796  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
797  " 5=line, 6=word, 7=char"
798  " (Values from PageSegMode enum in publictypes.h)");
799  INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
800  "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults"
801  " to loading and running the most accurate available.");
802  STRING_VAR_H(tessedit_char_blacklist, "",
803  "Blacklist of chars not to recognize");
804  STRING_VAR_H(tessedit_char_whitelist, "", "Whitelist of chars to recognize");
805  STRING_VAR_H(tessedit_char_unblacklist, "",
806  "List of chars to override tessedit_char_blacklist");
807  BOOL_VAR_H(tessedit_ambigs_training, false,
808  "Perform training for ambiguities");
809  INT_VAR_H(pageseg_devanagari_split_strategy,
810  tesseract::ShiroRekhaSplitter::NO_SPLIT,  // val argument, lost from the listing (numbering skipped 810).
811  "Whether to use the top-line splitting process for Devanagari "
812  "documents while performing page-segmentation.");
813  INT_VAR_H(ocr_devanagari_split_strategy,
814  tesseract::ShiroRekhaSplitter::NO_SPLIT,  // val argument, lost from the listing (numbering skipped 814).
815  "Whether to use the top-line splitting process for Devanagari "
816  "documents while performing ocr.");
817  STRING_VAR_H(tessedit_write_params_to_file, "",
818  "Write all parameters to the given file.");
819  BOOL_VAR_H(tessedit_adaption_debug, false,
820  "Generate and print debug information for adaption");
821  INT_VAR_H(bidi_debug, 0, "Debug level for BiDi");
822  INT_VAR_H(applybox_debug, 1, "Debug level");
823  INT_VAR_H(applybox_page, 0, "Page number to apply boxes from");
824  STRING_VAR_H(applybox_exposure_pattern, ".exp",
825  "Exposure value follows this pattern in the image"
826  " filename. The name of the image files are expected"
827  " to be in the form [lang].[fontname].exp[num].tif");
828  BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode, false,
829  "Learn both character fragments (as is done in the"
830  " special low exposure mode) as well as unfragmented"
831  " characters.");
832  BOOL_VAR_H(applybox_learn_ngrams_mode, false,
833  "Each bounding box is assumed to contain ngrams. Only"
834  " learn the ngrams whose outlines overlap horizontally.");
835  BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
836  BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
837  BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
838  BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces");
839  BOOL_VAR_H(tessedit_unrej_any_wd, false,
840  "Don't bother with word plausibility");
841  BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
842  BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
843  BOOL_VAR_H(tessedit_enable_doc_dict, true,
844  "Add words to the document dictionary");
845  BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
846  BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
847  BOOL_VAR_H(tessedit_enable_bigram_correction, true,
848  "Enable correction based on the word bigram dictionary.");
849  BOOL_VAR_H(tessedit_enable_dict_correction, false,
850  "Enable single word correction based on the dictionary.");
851  INT_VAR_H(tessedit_bigram_debug, 0,
852  "Amount of debug output for bigram "
853  "correction.");
854  BOOL_VAR_H(enable_noise_removal, true,
855  "Remove and conditionally reassign small outlines when they"
856  " confuse layout analysis, determining diacritics vs noise");
857  INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines");
858  // Worst (i.e. minimum) certainty at which a diacritic may still be included
859  // even though it makes the base character worse.
860  double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty");
861  // Worst (i.e. minimum) certainty at which a non-overlapping diacritic may
862  // still be included even though it makes the base character worse.
863  double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty");
864  // Worst (i.e. minimum) certainty at which a diacritic is allowed to make a
865  // new stand-alone blob.
866  double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty");
867  // Scale factor on the certainty margin within which adding diacritics is not counted as worse.
868  double_VAR_H(noise_cert_factor, 0.375,
869  "Scaling on certainty diff from Hingepoint");
870  INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob");
871  INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word");
872  INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
873  BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
874  STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
875  STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
876  STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation");
877  double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit");
878  double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit");
879  double_VAR_H(quality_outline_pc, 1.0,
880  "good_quality_doc lte outline error limit");
881  double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
882  INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
883  INT_VAR_H(tessedit_tess_adaption_mode, 0x27,
884  "Adaptation decision algorithm for tess");
885  BOOL_VAR_H(tessedit_minimal_rej_pass1, false,
886  "Do minimal rejection on pass 1 output");
887  BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria");
888  BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity");
889  INT_VAR_H(tessedit_test_adaption_mode, 3,
890  "Adaptation decision algorithm for tess");
891  BOOL_VAR_H(test_pt, false, "Test for point");
892  double_VAR_H(test_pt_x, 99999.99, "xcoord");
893  double_VAR_H(test_pt_y, 99999.99, "ycoord");
894  INT_VAR_H(multilang_debug_level, 0, "Print multilang debug info.");
895  INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
896  BOOL_VAR_H(paragraph_text_based, true,
897  "Run paragraph detection on the post-text-recognition "
898  "(more accurate)");
899  BOOL_VAR_H(lstm_use_matrix, true, "Use ratings matrix/beam search with lstm");  // Default is on (was spelled 1).
900  STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");  // NOTE(review): same help text as outlines_2;
901  STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");  // names suggest odd vs. 2 outlines - confirm.
902  BOOL_VAR_H(docqual_excuse_outline_errs, false,
903  "Allow outline errs in unrejection?");
904  BOOL_VAR_H(tessedit_good_quality_unrej, true,
905  "Reduce rejection on good docs");
906  BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?");
907  double_VAR_H(tessedit_reject_doc_percent, 65.00,
908  "%rej allowed before rej whole doc");
909  double_VAR_H(tessedit_reject_block_percent, 45.00,
910  "%rej allowed before rej whole block");
911  double_VAR_H(tessedit_reject_row_percent, 40.00,
912  "%rej allowed before rej whole row");
913  double_VAR_H(tessedit_whole_wd_rej_row_percent, 70.00,
914  "Number of row rejects in whole word rejects "  // Trailing space keeps the adjacent literals from fusing.
915  "which prevents whole row rejection");
916  BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds, true,
917  "Only rej partially rejected words in block rejection");
918  BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds, true,
919  "Only rej partially rejected words in row rejection");
920  BOOL_VAR_H(tessedit_dont_blkrej_good_wds, false,
921  "Use word segmentation quality metric");
922  BOOL_VAR_H(tessedit_dont_rowrej_good_wds, false,
923  "Use word segmentation quality metric");
924  INT_VAR_H(tessedit_preserve_min_wd_len, 2,
925  "Only preserve wds longer than this");
926  BOOL_VAR_H(tessedit_row_rej_good_docs, true,
927  "Apply row rejection to good docs");
928  double_VAR_H(tessedit_good_doc_still_rowrej_wd, 1.1,
929  "rej good doc wd if more than this fraction rejected");
930  BOOL_VAR_H(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds");
931  BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats");
932  BOOL_VAR_H(tessedit_debug_quality_metrics, false,
933  "Output data to debug file");
934  BOOL_VAR_H(bland_unrej, false, "unrej potential with no checks");
935  double_VAR_H(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit");  // NOTE(review): help text identical to quality_char_pc - possibly copy-pasted, confirm.
936  BOOL_VAR_H(unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch");
937  BOOL_VAR_H(hocr_font_info, false, "Add font info to hocr output");
938  BOOL_VAR_H(hocr_char_boxes, false,
939  "Add coordinates for each character to hocr output");
940  BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
941  BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
942  double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
943  BOOL_VAR_H(crunch_terrible_garbage, true, "As it says");
944  double_VAR_H(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this");
945  double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this");
946  double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this");
947  double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this");
948  BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage");
949  double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this");  // NOTE(review): help text repeats crunch_pot_poor_rate - confirm.
950  double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");  // NOTE(review): help text repeats crunch_pot_poor_cert - confirm.
951  double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
952  double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
953  double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this");
954  double_VAR_H(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl");
955  double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl");
956  double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
957  INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
958  INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
959  BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings");
960  BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
961  BOOL_VAR_H(crunch_leave_accept_strings, false,
962  "Don't pot crunch sensible strings");
963  BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
964  INT_VAR_H(crunch_leave_lc_strings, 4,
965  "Don't crunch words with long lower case strings");
966  INT_VAR_H(crunch_leave_uc_strings, 4,  // Upper-case counterpart of crunch_leave_lc_strings.
967  "Don't crunch words with long upper case strings");
968  INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
969  INT_VAR_H(crunch_debug, 0, "As it says");  // NOTE(review): uninformative help text; presumably the crunch debug level - confirm.
970  INT_VAR_H(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?");
971  double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
972  BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctuation joins");
973  INT_VAR_H(fixsp_done_mode, 1, "What constitutes done for spacing");  // Selects the "done" criterion for spacing fixes.
974  INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug");
975  STRING_VAR_H(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers");
976  INT_VAR_H(x_ht_acceptance_tolerance, 8,
977  "Max allowed deviation of blob top outside of font data");
978  INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
979  INT_VAR_H(superscript_debug, 0, "Debug level for sub & superscript fixer");
980  double_VAR_H(superscript_worse_certainty, 2.0,
981  "How many times worse "
982  "certainty does a superscript position glyph need to be for us "
983  "to try classifying it as a char with a different baseline?");
984  double_VAR_H(superscript_bettered_certainty, 0.97,
985  "What reduction in "
986  "badness do we think sufficient to choose a superscript over "
987  "what we'd thought. For example, a value of 0.6 means we want "
988  "to reduce badness of certainty by 40%");
989  double_VAR_H(superscript_scaledown_ratio, 0.4,
990  "A superscript scaled down more than this is unbelievably "
991  "small. For example, 0.3 means we expect the font size to "
992  "be no smaller than 30% of the text line font size.");
993  double_VAR_H(subscript_max_y_top, 0.5,
994  "Maximum top of a character measured as a multiple of x-height "
995  "above the baseline for us to reconsider whether it's a "
996  "subscript.");
997  double_VAR_H(superscript_min_y_bottom, 0.3,
998  "Minimum bottom of a character measured as a multiple of "
999  "x-height above the baseline for us to reconsider whether it's "
1000  "a superscript.");
1001  BOOL_VAR_H(tessedit_write_block_separators, false,
1002  "Write block separators in output");
1003  BOOL_VAR_H(tessedit_write_rep_codes, false, "Write repetition char code");
1004  BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
1005  BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
1006  BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
1007  BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
1008  BOOL_VAR_H(tessedit_create_lstmbox, false,
1009  "Write .box file for LSTM training");
1010  BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
1011  BOOL_VAR_H(tessedit_create_wordstrbox, false,
1012  "Write WordStr format .box output file");
1013  BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
1014  BOOL_VAR_H(textonly_pdf, false,
1015  "Create PDF with only one invisible text layer");
1016  INT_VAR_H(jpg_quality, 85, "Set JPEG quality level");
1017  INT_VAR_H(user_defined_dpi, 0, "Specify DPI for input image");
1018  INT_VAR_H(min_characters_to_try, 50,
1019  "Specify minimum characters to try during OSD");
1020  STRING_VAR_H(unrecognised_char, "|", "Output char for unidentified blobs");
1021  INT_VAR_H(suspect_level, 99, "Suspect marker level");
1022  INT_VAR_H(suspect_space_level, 100, "Min suspect level for rejecting spaces");
1023  INT_VAR_H(suspect_short_words, 2, "Don't Suspect dict wds longer than this");
1024  BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
1025  double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit");
1026  double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
1027  BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
1028  BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING");
1029  BOOL_VAR_H(tessedit_word_for_word, false,
1030  "Make output have exactly one word per WERD");
1031  BOOL_VAR_H(tessedit_zero_kelvin_rejection, false,
1032  "Don't reject ANYTHING AT ALL");
1033  BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
1034  INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
1035  BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");  // NOTE(review): help text says adaption, name says rejection - confirm.
1036  BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
1037  double_VAR_H(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test");  // NOTE(review): lower/upper share one help text;
1038  double_VAR_H(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test");  // presumably the two bounds of the same test - confirm.
1039  BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
1040  BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
1041  BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check");
1042  BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
1043  BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
1044  BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
1045  BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check");
1046  BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check");
1047  double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract");
1048  INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit");
1049  STRING_VAR_H(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej");
1050  STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
1051  INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
1052  BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes");
1053  INT_VAR_H(tessedit_page_number, -1,
1054  "-1 -> All pages, else specific page to process");
1055  BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE");
1056  BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
1057  STRING_VAR_H(file_type, ".tif", "Filename extension");
1058  BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
1059  STRING_VAR_H(tessedit_load_sublangs, "",
1060  "List of languages to load with this one");
1061  BOOL_VAR_H(tessedit_use_primary_params_model, false,
1062  "In multilingual mode use params model of the primary language");
1063  // Minimum acceptable orientation margin (the difference in scores between
1064  // the top and 2nd choice in OSResults::orientations) to believe the page orientation.
1065  double_VAR_H(min_orientation_margin, 7.0,
1066  "Min acceptable orientation margin");
1067  BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding");
1068  BOOL_VAR_H(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model");
1069  BOOL_VAR_H(poly_allow_detailed_fx, false,
1070  "Allow feature extractors to see the original outline");
1071  BOOL_VAR_H(tessedit_init_config_only, false,
1072  "Only initialize with the config file. Useful if the instance is "
1073  "not going to be used for OCR but say only for layout analysis.");
1074  BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
1075  BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection");
1076  BOOL_VAR_H(textord_tabfind_force_vertical_text, false,
1077  "Force using vertical text page mode");
1078  double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5,
1079  "Fraction of textlines deemed vertical to use vertical page "
1080  "mode");
1081  double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
1082  "Fraction of height used as a minimum gap for aligned blobs.");
1083  INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
1084  BOOL_VAR_H(preserve_interword_spaces, false,
1085  "Preserve multiple interword spaces");
1086  STRING_VAR_H(page_separator, "\f",
1087  "Page separator (default is form feed control character)");
1088  INT_VAR_H(lstm_choice_mode, 0,
1089  "Allows to include alternative symbols choices in the hOCR "
1090  "output. "
1091  "Valid input values are 0, 1, 2 and 3. 0 is the default value. "
1092  "With 1 the alternative symbol choices per timestep are included. "
1093  "With 2 the alternative symbol choices are accumulated per "
1094  "character. ");  // NOTE(review): text lists 3 as a valid value but describes only 0-2 - confirm.
1095 
1097  FILE* init_recog_training(const STRING& fname);  // NOTE(review): undocumented here; returns a FILE*, presumably the output_file passed to the functions below - confirm.
1098  void recog_training_segmented(const STRING& fname, PAGE_RES* page_res,  // NOTE(review): undocumented here;
1099  volatile ETEXT_DESC* monitor,  // behaviour is not visible in this chunk - confirm against the implementation.
1100  FILE* output_file);
1101  void ambigs_classify_and_output(const char* label, PAGE_RES_IT* pr_it,  // NOTE(review): undocumented here; takes a label,
1102  FILE* output_file);  // a PAGE_RES_IT and an output FILE*; behaviour is not visible in this chunk - confirm.
1103 
1104  private:
1105  // Filename of a backup config file. When not null, a temporary debug
1106  // config file is currently loaded; backup_config_file_ will be loaded,
1107  // and set to null, when debug is complete.
1108  const char* backup_config_file_;
1109  // Filename of the config file to read when processing a debug word.
1110  STRING word_config_;
1111  // Image used as input to layout analysis and tesseract recognition.
1112  // The ShiroRekhaSplitter may modify it to eliminate the top-line.
1113  Pix* pix_binary_;
1114  // Grey-level input image when the input was not binary; nullptr otherwise.
1115  Pix* pix_grey_;
1116  // Original input image; in color if the input was color.
1117  Pix* pix_original_;
1118  // Thresholds used to generate the thresholded image from the grey image.
1119  Pix* pix_thresholds_;
1120  // Debug images. Written out on destruction if non-empty.
1121  DebugPixa pixa_debug_;
1122  // Input image resolution after any scaling. Operations on Pix do not carry
1123  // the resolution reliably, so an independent record is kept here.
1124  int source_resolution_;
1125  // Shiro-rekha splitter object, used to split top-lines in Devanagari
1126  // words to give a better word and grapheme segmentation.
1127  ShiroRekhaSplitter splitter_;
1128  // Page segmentation/layout.
1129  Textord textord_;
1130  // True if the primary language uses right_to_left reading order.
1131  bool right_to_left_;
1132  Pix* scaled_color_;  // NOTE(review): undocumented here; purpose not visible in this chunk - confirm.
1133  int scaled_factor_;  // NOTE(review): undocumented here; presumably the scale factor paired with scaled_color_ - confirm.
1134  FCOORD deskew_;  // NOTE(review): undocumented here - confirm meaning against the code that sets it.
1135  FCOORD reskew_;  // NOTE(review): undocumented here - confirm meaning against the code that sets it.
1136  TesseractStats stats_;  // NOTE(review): undocumented here - confirm what statistics are collected.
1137  // Sub-languages to be tried in addition to this one.
1138  GenericVector<Tesseract*> sub_langs_;
1139  // Most recently used Tesseract out of this and sub_langs_; it is the
1140  // default language for the next word.
1141  Tesseract* most_recently_used_;
1142  // Size of the font table, i.e. max possible font id + 1.
1143  int font_table_size_;
1144  // Equation detector. Note: this pointer is NOT owned by the class.
1145  EquationDetect* equ_detect_;
1146  // LSTM recognizer, if available.
1147  LSTMRecognizer* lstm_recognizer_;
1148  // Output "page" number (actually a line number) used with TrainLineRecognizer.
1149  int train_line_page_num_;
1150 };
1151 
1152 } // namespace tesseract
1153 
1154 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:297
Definition: werd.h:56
bool AnyTessLang() const
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
void dont_allow_1Il(WERD_RES *word)
SetParamConstraint
Definition: params.h:35
Definition: rect.h:34
#define STRING_VAR_H(name, val, comment)
Definition: params.h:299
const FCOORD & reskew() const
Definition: blobs.h:397
void set_pix_grey(Pix *grey_pix)
Definition: strngs.h:45
Definition: points.h:188
Tesseract * get_sub_lang(int index) const
void flip_hyphens(WERD_RES *word)
Pix * scaled_color() const
const Textord & textord() const
#define INT_VAR_H(name, val, comment)
Definition: params.h:295
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
int ImageWidth() const
void set_pix_thresholds(Pix *thresholds)
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
Definition: blobs.h:263
#define double_VAR_H(name, val, comment)
Definition: params.h:301
bool right_to_left() const
PointerVector< WERD_RES > lang_words
void set_pix_original(Pix *original_pix)
Pix * pix_original() const
Pix * BestPix() const
Textord * mutable_textord()
int num_sub_langs() const
GARBAGE_LEVEL
Definition: docqual.h:29
int scaled_factor() const
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
bool non_0_digit(const char *str, int length)
bool AnyLSTMLang() const
Pix * pix_grey() const
int source_resolution() const
CRUNCH_MODE
Definition: pageres.h:158
Pix * pix_binary() const
void set_source_resolution(int ppi)
int UNICHAR_ID
Definition: unichar.h:34
WordData(const PAGE_RES_IT &page_res_it)
void SetScaledColor(int factor, Pix *color)
Definition: ocrrow.h:36
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:172
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
void flip_0O(WERD_RES *word)
int ImageHeight() const
Definition: ocrblock.h:29
int16_t word_blob_quality(WERD_RES *word, ROW *row)