25 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_ 26 #define TESSERACT_CCMAIN_TESSERACTCLASS_H_ 30 #include "allheaders.h" 34 #ifndef DISABLED_LEGACY_ENGINE 108 class EquationDetect;
110 class LSTMRecognizer;
148 : word(nullptr), row(nullptr), block(nullptr), prev_word(nullptr) {}
150 : word(page_res_it.word()),
151 row(page_res_it.row()->row),
152 block(page_res_it.block()->block),
153 prev_word(nullptr) {}
155 : word(word_res), row(row_in), block(block_in), prev_word(nullptr) {}
180 Dict& getDict()
override;
186 void ResetAdaptiveClassifier();
188 void ResetDocumentDictionary();
199 pixDestroy(&pix_binary_);
209 pixDestroy(&pix_grey_);
210 pix_grey_ = grey_pix;
213 return pix_original_;
217 pixDestroy(&pix_original_);
218 pix_original_ = original_pix;
220 for (
int i = 0; i < sub_langs_.size(); ++i) {
221 sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
234 if (pixGetWidth(pix_original_) == ImageWidth()) {
235 return pix_original_;
236 }
else if (pix_grey_ !=
nullptr) {
243 pixDestroy(&pix_thresholds_);
244 pix_thresholds_ = thresholds;
247 return source_resolution_;
250 source_resolution_ = ppi;
253 return pixGetWidth(pix_binary_);
256 return pixGetHeight(pix_binary_);
259 return scaled_color_;
262 return scaled_factor_;
265 scaled_factor_ = factor;
266 scaled_color_ = color;
276 return right_to_left_;
279 return sub_langs_.size();
282 return sub_langs_[index];
288 for (
int i = 0; i < sub_langs_.size(); ++i) {
289 if (sub_langs_[i]->tessedit_ocr_engine_mode !=
OEM_LSTM_ONLY)
298 for (
int i = 0; i < sub_langs_.size(); ++i) {
306 void SetBlackAndWhitelist();
312 void PrepareForPageseg();
319 void PrepareForTessOCR(BLOCK_LIST* block_list,
Tesseract* osd_tess,
322 int SegmentPage(
const STRING* input_file, BLOCK_LIST* blocks,
324 void SetupWordScripts(BLOCK_LIST* blocks);
325 int AutoPageSeg(
PageSegMode pageseg_mode, BLOCK_LIST* blocks,
326 TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
330 OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
331 Pix** music_mask_pix);
339 void TrainLineRecognizer(
const STRING& input_imagename,
340 const STRING& output_basename,
341 BLOCK_LIST* block_list);
354 int end_box,
const BLOCK& block);
362 TBOX* revised_box)
const;
365 void LSTMRecognizeWord(
const BLOCK& block,
ROW* row, WERD_RES* word,
373 bool ProcessTargetWord(
const TBOX& word_box,
const TBOX& target_word_box,
374 const char* word_config,
int pass);
376 void SetupAllWordsPassN(
int pass_n,
const TBOX* target_word_box,
377 const char* word_config,
PAGE_RES* page_res,
380 void SetupWordPassN(
int pass_n,
WordData* word);
385 const TBOX* target_word_box,
const char* word_config,
388 const TBOX* target_word_box,
const char* word_config);
389 void bigram_correction_pass(
PAGE_RES* page_res);
390 void blamer_pass(
PAGE_RES* page_res);
392 void script_pos_pass(
PAGE_RES* page_res);
397 bool debug, WERD_RES** in_word,
402 bool ReassignDiacritics(
int pass,
PAGE_RES_IT* pr_it,
403 bool* make_next_word_fuzzy);
408 void AssignDiacriticsToOverlappingBlobs(
422 bool SelectGoodDiacriticOutlines(
int pass,
float certainty_threshold,
437 STRING* best_str,
float* c2);
438 void classify_word_and_language(
int pass_n,
PAGE_RES_IT* pr_it,
440 void classify_word_pass1(
const WordData& word_data, WERD_RES** in_word,
442 void recog_pseudo_word(
PAGE_RES* page_res,
443 TBOX& selection_box);
449 const char* lengths);
450 void match_word_pass_n(
int pass_n, WERD_RES* word,
ROW* row,
BLOCK* block);
451 void classify_word_pass2(
const WordData& word_data, WERD_RES** in_word,
453 void ReportXhtFixResult(
bool accept_new_word,
float new_x_ht, WERD_RES* word,
455 bool RunOldFixXht(WERD_RES* word,
BLOCK* block,
ROW* row);
456 bool TrainedXheightFix(WERD_RES* word,
BLOCK* block,
ROW* row);
459 bool TestNewNormalization(
int original_misfits,
float baseline_shift,
460 float new_x_ht, WERD_RES* word,
BLOCK* block,
465 void set_word_fonts(WERD_RES* word);
466 void font_recognition_pass(
PAGE_RES* page_res);
467 void dictionary_correction_pass(
PAGE_RES* page_res);
468 bool check_debug_pt(WERD_RES* word,
int location);
471 bool SubAndSuperscriptFix(WERD_RES* word_res);
472 void GetSubAndSuperscriptCandidates(
473 const WERD_RES* word,
int* num_rebuilt_leading,
ScriptPos* leading_pos,
474 float* leading_certainty,
int* num_rebuilt_trailing,
475 ScriptPos* trailing_pos,
float* trailing_certainty,
float* avg_certainty,
476 float* unlikely_threshold);
477 WERD_RES* TrySuperscriptSplits(
int num_chopped_leading,
478 float leading_certainty,
ScriptPos leading_pos,
479 int num_chopped_trailing,
480 float trailing_certainty,
482 bool* is_good,
int* retry_leading,
483 int* retry_trailing);
484 bool BelievableSuperscript(
bool debug,
const WERD_RES& word,
485 float certainty_threshold,
int* left_ok,
486 int* right_ok)
const;
490 void output_pass(
PAGE_RES_IT& page_res_it,
const TBOX* target_word_box);
495 void set_unlv_suspects(WERD_RES* word);
497 bool acceptable_number_string(
const char* s,
const char* lengths);
506 int init_tesseract(
const char* arg0,
const char* textbase,
514 return init_tesseract(datapath,
nullptr, language, oem,
nullptr, 0,
nullptr,
515 nullptr,
false, &mgr);
533 int init_tesseract_internal(
const char* arg0,
const char* textbase,
535 char** configs,
int configs_size,
542 void SetupUniversalFontIds();
544 int init_tesseract_lm(
const char* arg0,
const char* textbase,
547 void recognize_page(
STRING& image_name);
548 void end_tesseract();
550 bool init_tesseract_lang_data(
const char* arg0,
const char* textbase,
552 char** configs,
int configs_size,
555 bool set_only_init_params,
563 #ifndef GRAPHICS_DISABLED 564 void pgeditor_main(
int width,
int height,
PAGE_RES* page_res);
565 #endif // GRAPHICS_DISABLED 566 void process_image_event(
568 bool process_cmd_win_event(
572 void debug_word(
PAGE_RES* page_res,
const TBOX& selection_box);
577 bool word_blank_and_set_display(
PAGE_RES_IT* pr_its);
582 void blob_feature_display(
PAGE_RES* page_res,
const TBOX& selection_box);
585 void make_reject_map(WERD_RES* word,
ROW* row, int16_t pass);
586 bool one_ell_conflict(WERD_RES* word_res,
bool update_map);
587 int16_t first_alphanum_index(
const char* word,
const char* word_lengths);
588 int16_t first_alphanum_offset(
const char* word,
const char* word_lengths);
589 int16_t alpha_count(
const char* word,
const char* word_lengths);
592 int16_t count_alphanums(
597 bool repeated_nonalphanum_wd(WERD_RES* word,
ROW* row);
599 WERD_RES* word,
ROW* row);
600 void nn_recover_rejects(WERD_RES* word,
ROW* row);
602 WERD_RES* word, int16_t pass);
603 int16_t safe_dict_word(
const WERD_RES* werd_res);
605 void reject_I_1_L(WERD_RES* word);
606 void reject_edge_blobs(WERD_RES* word);
607 void reject_mostly_rejects(WERD_RES* word);
610 WERD_RES* word, uint16_t mode);
613 void recog_word_recursive(WERD_RES* word);
614 void recog_word(WERD_RES* word);
615 void split_and_recog_word(WERD_RES* word);
616 void split_word(WERD_RES* word,
int split_pt, WERD_RES** right_piece,
618 void join_words(WERD_RES* word, WERD_RES* word2,
BlamerBundle* orig_bb)
const;
620 bool digit_or_numeric_punct(WERD_RES* word,
int char_position);
621 int16_t eval_word_spacing(WERD_RES_LIST& word_res_list);
622 void match_current_words(WERD_RES_LIST& words,
ROW* row,
BLOCK* block);
623 int16_t fp_eval_word_spacing(WERD_RES_LIST& word_res_list);
624 void fix_noisy_space_list(WERD_RES_LIST& best_perm,
ROW* row,
BLOCK* block);
625 void fix_fuzzy_space_list(WERD_RES_LIST& best_perm,
ROW* row,
BLOCK* block);
626 void fix_sp_fp_word(WERD_RES_IT& word_res_it,
ROW* row,
BLOCK* block);
627 void fix_fuzzy_spaces(
631 void dump_words(WERD_RES_LIST& perm, int16_t score, int16_t mode,
633 bool fixspace_thinks_word_done(WERD_RES* word);
634 int16_t worst_noise_blob(WERD_RES* word_res,
float* worst_noise_score);
635 float blob_noise_score(
TBLOB* blob);
636 void break_noisiest_blob_word(WERD_RES_LIST& words);
638 #ifndef DISABLED_LEGACY_ENGINE 639 GARBAGE_LEVEL garbage_word(WERD_RES* word,
bool ok_dict_word);
640 bool potential_word_crunch(WERD_RES* word,
GARBAGE_LEVEL garbage_level,
644 void unrej_good_quality_words(
646 void doc_and_block_rejection(
648 void quality_based_rejection(
PAGE_RES_IT& page_res_it,
bool good_quality_doc);
649 void convert_bad_unlv_chs(WERD_RES* word_res);
652 void word_char_quality(WERD_RES* word,
ROW* row, int16_t* match_count,
653 int16_t* accepted_match_count);
654 void unrej_good_chs(WERD_RES* word,
ROW* row);
655 int16_t count_outline_errs(
char c, int16_t outline_count);
656 int16_t word_outline_errs(WERD_RES* word);
657 #ifndef DISABLED_LEGACY_ENGINE 658 bool terrible_word_crunch(WERD_RES* word,
GARBAGE_LEVEL garbage_level);
660 CRUNCH_MODE word_deletable(WERD_RES* word, int16_t& delete_mode);
661 int16_t failure_count(WERD_RES* word);
662 bool noise_outlines(
TWERD* word);
664 void process_selected_words(
670 void tess_add_doc_word(
673 void tess_segment_pass_n(
int pass_n, WERD_RES* word);
674 bool tess_acceptable_word(WERD_RES* word);
698 BLOCK_LIST* block_list);
702 void PreenXHeights(BLOCK_LIST* block_list);
707 BLOCK_LIST* block_list);
712 ROW* row, WERD_RES* word_res);
721 bool ResegmentCharBox(
PAGE_RES* page_res,
const TBOX* prev_box,
722 const TBOX& box,
const TBOX* next_box,
723 const char* correct_text);
730 bool ResegmentWordBox(BLOCK_LIST* block_list,
const TBOX& box,
731 const TBOX* next_box,
const char* correct_text);
734 void ReSegmentByClassification(
PAGE_RES* page_res);
737 bool ConvertStringToUnichars(
const char* utf8,
754 int choices_pos,
int choices_length,
756 int text_index,
float rating,
765 void ReportFailedBox(
int boxfile_lineno,
TBOX box,
const char* box_ch,
766 const char* err_msg);
768 void CorrectClassifyWords(
PAGE_RES* page_res);
771 void ApplyBoxTraining(
const STRING& fontname,
PAGE_RES* page_res);
775 int CountMisfitTops(WERD_RES* word_res);
780 float ComputeCompatibleXheight(WERD_RES* word_res,
float* baseline_shift);
783 BOOL_VAR_H(tessedit_resegment_from_boxes,
false,
784 "Take segmentation and labeling from box file");
785 BOOL_VAR_H(tessedit_resegment_from_line_boxes,
false,
786 "Conversion of word/line box file to char box file");
788 "Generate training data from boxed chars");
789 BOOL_VAR_H(tessedit_make_boxes_from_boxes,
false,
790 "Generate more boxes from boxed chars");
791 BOOL_VAR_H(tessedit_train_line_recognizer,
false,
792 "Break input into lines and remap boxes if present");
793 BOOL_VAR_H(tessedit_dump_pageseg_images,
false,
794 "Dump intermediate images made during page segmentation");
796 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," 797 " 5=line, 6=word, 7=char" 798 " (Values from PageSegMode enum in publictypes.h)");
800 "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" 801 " to loading and running the most accurate available.");
803 "Blacklist of chars not to recognize");
804 STRING_VAR_H(tessedit_char_whitelist,
"",
"Whitelist of chars to recognize");
806 "List of chars to override tessedit_char_blacklist");
808 "Perform training for ambiguities");
809 INT_VAR_H(pageseg_devanagari_split_strategy,
811 "Whether to use the top-line splitting process for Devanagari " 812 "documents while performing page-segmentation.");
815 "Whether to use the top-line splitting process for Devanagari " 816 "documents while performing ocr.");
818 "Write all parameters to the given file.");
820 "Generate and print debug information for adaption");
823 INT_VAR_H(applybox_page, 0,
"Page number to apply boxes from");
825 "Exposure value follows this pattern in the image" 826 " filename. The name of the image files are expected" 827 " to be in the form [lang].[fontname].exp[num].tif");
828 BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode,
false,
829 "Learn both character fragments (as is done in the" 830 " special low exposure mode) as well as unfragmented" 833 "Each bounding box is assumed to contain ngrams. Only" 834 " learn the ngrams whose outlines overlap horizontally.");
835 BOOL_VAR_H(tessedit_display_outwords,
false,
"Draw output words");
836 BOOL_VAR_H(tessedit_dump_choices,
false,
"Dump char choices");
837 BOOL_VAR_H(tessedit_timing_debug,
false,
"Print timing stats");
838 BOOL_VAR_H(tessedit_fix_fuzzy_spaces,
true,
"Try to improve fuzzy spaces");
840 "Don't bother with word plausibility");
841 BOOL_VAR_H(tessedit_fix_hyphens,
true,
"Crunch double hyphens?");
842 BOOL_VAR_H(tessedit_redo_xheight,
true,
"Check/Correct x-height");
844 "Add words to the document dictionary");
845 BOOL_VAR_H(tessedit_debug_fonts,
false,
"Output font info per char");
846 BOOL_VAR_H(tessedit_debug_block_rejection,
false,
"Block and Row stats");
847 BOOL_VAR_H(tessedit_enable_bigram_correction,
true,
848 "Enable correction based on the word bigram dictionary.");
849 BOOL_VAR_H(tessedit_enable_dict_correction,
false,
850 "Enable single word correction based on the dictionary.");
852 "Amount of debug output for bigram " 855 "Remove and conditionally reassign small outlines when they" 856 " confuse layout analysis, determining diacritics vs noise");
857 INT_VAR_H(debug_noise_removal, 0,
"Debug reassignment of small outlines");
860 double_VAR_H(noise_cert_basechar, -8.0,
"Hingepoint for base char certainty");
863 double_VAR_H(noise_cert_disjoint, -2.5,
"Hingepoint for disjoint certainty");
866 double_VAR_H(noise_cert_punc, -2.5,
"Threshold for new punc char certainty");
869 "Scaling on certainty diff from Hingepoint");
870 INT_VAR_H(noise_maxperblob, 8,
"Max diacritics to apply to a blob");
871 INT_VAR_H(noise_maxperword, 16,
"Max diacritics to apply to a word");
873 BOOL_VAR_H(debug_acceptable_wds,
false,
"Dump word pass/fail chk");
875 STRING_VAR_H(chs_trailing_punct1,
").,;:?!",
"1st Trailing punctuation");
876 STRING_VAR_H(chs_trailing_punct2,
")'`\"",
"2nd Trailing punctuation");
877 double_VAR_H(quality_rej_pc, 0.08,
"good_quality_doc lte rejection limit");
878 double_VAR_H(quality_blob_pc, 0.0,
"good_quality_doc gte good blobs limit");
880 "good_quality_doc lte outline error limit");
881 double_VAR_H(quality_char_pc, 0.95,
"good_quality_doc gte good char limit");
882 INT_VAR_H(quality_min_initial_alphas_reqd, 2,
"alphas in a good word");
883 INT_VAR_H(tessedit_tess_adaption_mode, 0x27,
884 "Adaptation decision algorithm for tess");
886 "Do minimal rejection on pass 1 output");
887 BOOL_VAR_H(tessedit_test_adaption,
false,
"Test adaption criteria");
888 BOOL_VAR_H(tessedit_matcher_log,
false,
"Log matcher activity");
889 INT_VAR_H(tessedit_test_adaption_mode, 3,
890 "Adaptation decision algorithm for tess");
894 INT_VAR_H(multilang_debug_level, 0,
"Print multilang debug info.");
895 INT_VAR_H(paragraph_debug_level, 0,
"Print paragraph debug info.");
897 "Run paragraph detection on the post-text-recognition " 899 BOOL_VAR_H(lstm_use_matrix, 1,
"Use ratings matrix/beam searct with lstm");
901 STRING_VAR_H(outlines_2,
"ij!?%\":;",
"Non standard number of outlines");
902 BOOL_VAR_H(docqual_excuse_outline_errs,
false,
903 "Allow outline errs in unrejection?");
905 "Reduce rejection on good docs");
906 BOOL_VAR_H(tessedit_use_reject_spaces,
true,
"Reject spaces?");
908 "%rej allowed before rej whole doc");
910 "%rej allowed before rej whole block");
912 "%rej allowed before rej whole row");
914 "Number of row rejects in whole word rejects" 915 "which prevents whole row rejection");
916 BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds,
true,
917 "Only rej partially rejected words in block rejection");
918 BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds,
true,
919 "Only rej partially rejected words in row rejection");
920 BOOL_VAR_H(tessedit_dont_blkrej_good_wds,
false,
921 "Use word segmentation quality metric");
922 BOOL_VAR_H(tessedit_dont_rowrej_good_wds,
false,
923 "Use word segmentation quality metric");
924 INT_VAR_H(tessedit_preserve_min_wd_len, 2,
925 "Only preserve wds longer than this");
927 "Apply row rejection to good docs");
929 "rej good doc wd if more than this fraction rejected");
930 BOOL_VAR_H(tessedit_reject_bad_qual_wds,
true,
"Reject all bad quality wds");
931 BOOL_VAR_H(tessedit_debug_doc_rejection,
false,
"Page stats");
932 BOOL_VAR_H(tessedit_debug_quality_metrics,
false,
933 "Output data to debug file");
934 BOOL_VAR_H(bland_unrej,
false,
"unrej potential with no checks");
935 double_VAR_H(quality_rowrej_pc, 1.1,
"good_quality_doc gte good char limit");
936 BOOL_VAR_H(unlv_tilde_crunching,
false,
"Mark v.bad words for tilde crunch");
937 BOOL_VAR_H(hocr_font_info,
false,
"Add font info to hocr output");
939 "Add coordinates for each character to hocr output");
940 BOOL_VAR_H(crunch_early_merge_tess_fails,
true,
"Before word crunch?");
941 BOOL_VAR_H(crunch_early_convert_bad_unlv_chs,
false,
"Take out ~^ early?");
944 double_VAR_H(crunch_poor_garbage_cert, -9.0,
"crunch garbage cert lt this");
945 double_VAR_H(crunch_poor_garbage_rate, 60,
"crunch garbage rating lt this");
946 double_VAR_H(crunch_pot_poor_rate, 40,
"POTENTIAL crunch rating lt this");
947 double_VAR_H(crunch_pot_poor_cert, -8.0,
"POTENTIAL crunch cert lt this");
948 BOOL_VAR_H(crunch_pot_garbage,
true,
"POTENTIAL crunch garbage");
949 double_VAR_H(crunch_del_rating, 60,
"POTENTIAL crunch rating lt this");
953 double_VAR_H(crunch_del_min_width, 3.0,
"Del if word width lt xht x this");
954 double_VAR_H(crunch_del_high_word, 1.5,
"Del if word gt xht x this above bl");
955 double_VAR_H(crunch_del_low_word, 0.5,
"Del if word gt xht x this below bl");
956 double_VAR_H(crunch_small_outlines_size, 0.6,
"Small if lt xht x this");
957 INT_VAR_H(crunch_rating_max, 10,
"For adj length in rating per ch");
958 INT_VAR_H(crunch_pot_indicators, 1,
"How many potential indicators needed");
959 BOOL_VAR_H(crunch_leave_ok_strings,
true,
"Don't touch sensible strings");
960 BOOL_VAR_H(crunch_accept_ok,
true,
"Use acceptability in okstring");
961 BOOL_VAR_H(crunch_leave_accept_strings,
false,
962 "Don't pot crunch sensible strings");
963 BOOL_VAR_H(crunch_include_numerals,
false,
"Fiddle alpha figures");
965 "Don't crunch words with long lower case strings");
967 "Don't crunch words with long lower case strings");
968 INT_VAR_H(crunch_long_repetitions, 3,
"Crunch words with long repetitions");
970 INT_VAR_H(fixsp_non_noise_limit, 1,
"How many non-noise blbs either side?");
971 double_VAR_H(fixsp_small_outlines_size, 0.28,
"Small if lt xht x this");
972 BOOL_VAR_H(tessedit_prefer_joined_punct,
false,
"Reward punctuation joins");
973 INT_VAR_H(fixsp_done_mode, 1,
"What constitues done for spacing");
974 INT_VAR_H(debug_fix_space_level, 0,
"Contextual fixspace debug");
975 STRING_VAR_H(numeric_punctuation,
".,",
"Punct. chs expected WITHIN numbers");
977 "Max allowed deviation of blob top outside of font data");
978 INT_VAR_H(x_ht_min_change, 8,
"Min change in xht before actually trying it");
979 INT_VAR_H(superscript_debug, 0,
"Debug level for sub & superscript fixer");
981 "How many times worse " 982 "certainty does a superscript position glyph need to be for us " 983 "to try classifying it as a char with a different baseline?");
986 "badness do we think sufficient to choose a superscript over " 987 "what we'd thought. For example, a value of 0.6 means we want " 988 "to reduce badness of certainty by 40%");
990 "A superscript scaled down more than this is unbelievably " 991 "small. For example, 0.3 means we expect the font size to " 992 "be no smaller than 30% of the text line font size.");
994 "Maximum top of a character measured as a multiple of x-height " 995 "above the baseline for us to reconsider whether it's a " 998 "Minimum bottom of a character measured as a multiple of " 999 "x-height above the baseline for us to reconsider whether it's " 1001 BOOL_VAR_H(tessedit_write_block_separators,
false,
1002 "Write block separators in output");
1003 BOOL_VAR_H(tessedit_write_rep_codes,
false,
"Write repetition char code");
1004 BOOL_VAR_H(tessedit_write_unlv,
false,
"Write .unlv output file");
1005 BOOL_VAR_H(tessedit_create_txt,
false,
"Write .txt output file");
1006 BOOL_VAR_H(tessedit_create_hocr,
false,
"Write .html hOCR output file");
1007 BOOL_VAR_H(tessedit_create_alto,
false,
"Write .xml ALTO output file");
1009 "Write .box file for LSTM training");
1010 BOOL_VAR_H(tessedit_create_tsv,
false,
"Write .tsv output file");
1011 BOOL_VAR_H(tessedit_create_wordstrbox,
false,
1012 "Write WordStr format .box output file");
1013 BOOL_VAR_H(tessedit_create_pdf,
false,
"Write .pdf output file");
1015 "Create PDF with only one invisible text layer");
1017 INT_VAR_H(user_defined_dpi, 0,
"Specify DPI for input image");
1019 "Specify minimum characters to try during OSD");
1020 STRING_VAR_H(unrecognised_char,
"|",
"Output char for unidentified blobs");
1022 INT_VAR_H(suspect_space_level, 100,
"Min suspect level for rejecting spaces");
1023 INT_VAR_H(suspect_short_words, 2,
"Don't Suspect dict wds longer than this");
1024 BOOL_VAR_H(suspect_constrain_1Il,
false,
"UNLV keep 1Il chars rejected");
1025 double_VAR_H(suspect_rating_per_ch, 999.9,
"Don't touch bad rating limit");
1027 BOOL_VAR_H(tessedit_minimal_rejection,
false,
"Only reject tess failures");
1028 BOOL_VAR_H(tessedit_zero_rejection,
false,
"Don't reject ANYTHING");
1030 "Make output have exactly one word per WERD");
1031 BOOL_VAR_H(tessedit_zero_kelvin_rejection,
false,
1032 "Don't reject ANYTHING AT ALL");
1033 BOOL_VAR_H(tessedit_consistent_reps,
true,
"Force all rep chars the same");
1037 double_VAR_H(tessedit_lower_flip_hyphen, 1.5,
"Aspect ratio dot/hyphen test");
1038 double_VAR_H(tessedit_upper_flip_hyphen, 1.8,
"Aspect ratio dot/hyphen test");
1039 BOOL_VAR_H(rej_trust_doc_dawg,
false,
"Use DOC dawg in 11l conf. detector");
1041 BOOL_VAR_H(rej_1Il_trust_permuter_type,
true,
"Don't double check");
1042 BOOL_VAR_H(rej_use_tess_accepted,
true,
"Individual rejection control");
1043 BOOL_VAR_H(rej_use_tess_blanks,
true,
"Individual rejection control");
1044 BOOL_VAR_H(rej_use_good_perm,
true,
"Individual rejection control");
1046 BOOL_VAR_H(rej_alphas_in_number_perm,
false,
"Extend permuter check");
1047 double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85,
"if >this fract");
1048 INT_VAR_H(tessedit_image_border, 2,
"Rej blbs near image edge limit");
1049 STRING_VAR_H(ok_repeated_ch_non_alphanum_wds,
"-?*\075",
"Allow NN to unrej");
1051 INT_VAR_H(min_sane_x_ht_pixels, 8,
"Reject any x-ht lt or eq than this");
1052 BOOL_VAR_H(tessedit_create_boxfile,
false,
"Output text with boxes");
1054 "-1 -> All pages, else specific page to process");
1055 BOOL_VAR_H(tessedit_write_images,
false,
"Capture the image from the IPE");
1056 BOOL_VAR_H(interactive_display_mode,
false,
"Run interactively?");
1058 BOOL_VAR_H(tessedit_override_permuter,
true,
"According to dict_word");
1060 "List of languages to load with this one");
1061 BOOL_VAR_H(tessedit_use_primary_params_model,
false,
1062 "In multilingual mode use params model of the primary language");
1066 "Min acceptable orientation margin");
1067 BOOL_VAR_H(textord_tabfind_show_vlines,
false,
"Debug line finding");
1068 BOOL_VAR_H(textord_use_cjk_fp_model,
false,
"Use CJK fixed pitch model");
1070 "Allow feature extractors to see the original outline");
1072 "Only initialize with the config file. Useful if the instance is " 1073 "not going to be used for OCR but say only for layout analysis.");
1074 BOOL_VAR_H(textord_equation_detect,
false,
"Turn on equation detector");
1075 BOOL_VAR_H(textord_tabfind_vertical_text,
true,
"Enable vertical detection");
1076 BOOL_VAR_H(textord_tabfind_force_vertical_text,
false,
1077 "Force using vertical text page mode");
1079 "Fraction of textlines deemed vertical to use vertical page " 1081 double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
1082 "Fraction of height used as a minimum gap for aligned blobs.");
1083 INT_VAR_H(tessedit_parallelize, 0,
"Run in parallel where possible");
1085 "Preserve multiple interword spaces");
1087 "Page separator (default is form feed control character)");
1089 "Allows to include alternative symbols choices in the hOCR " 1091 "Valid input values are 0, 1, 2 and 3. 0 is the default value. " 1092 "With 1 the alternative symbol choices per timestep are included. " 1093 "With 2 the alternative symbol choices are accumulated per " 1097 FILE* init_recog_training(
const STRING& fname);
1098 void recog_training_segmented(
const STRING& fname,
PAGE_RES* page_res,
1101 void ambigs_classify_and_output(
const char* label,
PAGE_RES_IT* pr_it,
1108 const char* backup_config_file_;
1119 Pix* pix_thresholds_;
1124 int source_resolution_;
1131 bool right_to_left_;
1143 int font_table_size_;
1149 int train_line_page_num_;
1154 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_ #define BOOL_VAR_H(name, val, comment)
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
void dont_allow_1Il(WERD_RES *word)
#define STRING_VAR_H(name, val, comment)
const FCOORD & reskew() const
bool write_results_empty_block
void set_pix_grey(Pix *grey_pix)
Tesseract * get_sub_lang(int index) const
void flip_hyphens(WERD_RES *word)
Pix ** mutable_pix_binary()
Pix * scaled_color() const
const Textord & textord() const
#define INT_VAR_H(name, val, comment)
int16_t doc_good_char_quality
void set_pix_thresholds(Pix *thresholds)
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
#define double_VAR_H(name, val, comment)
bool right_to_left() const
PointerVector< WERD_RES > lang_words
void set_pix_original(Pix *original_pix)
bool last_char_was_newline
Pix * pix_original() const
Textord * mutable_textord()
int num_sub_langs() const
int scaled_factor() const
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
bool non_0_digit(const char *str, int length)
int source_resolution() const
bool tilde_crunch_written
int32_t adaption_word_number
void set_source_resolution(int ppi)
WordData(const PAGE_RES_IT &page_res_it)
void SetScaledColor(int factor, Pix *color)
Assume a single uniform block of text. (Default.)
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
void flip_0O(WERD_RES *word)
int16_t word_blob_quality(WERD_RES *word, ROW *row)