tesseract  4.1.0
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil

Public Member Functions

 Tesseract ()
 
 ~Tesseract () override
 
Dict & getDict () override
 
void Clear ()
 
void ResetAdaptiveClassifier ()
 
void ResetDocumentDictionary ()
 
void SetEquationDetect (EquationDetect *detector)
 
const FCOORD & reskew () const
 
Pix ** mutable_pix_binary ()
 
Pix * pix_binary () const
 
Pix * pix_grey () const
 
void set_pix_grey (Pix *grey_pix)
 
Pix * pix_original () const
 
void set_pix_original (Pix *original_pix)
 
Pix * BestPix () const
 
void set_pix_thresholds (Pix *thresholds)
 
int source_resolution () const
 
void set_source_resolution (int ppi)
 
int ImageWidth () const
 
int ImageHeight () const
 
Pix * scaled_color () const
 
int scaled_factor () const
 
void SetScaledColor (int factor, Pix *color)
 
const Textord & textord () const
 
Textord * mutable_textord ()
 
bool right_to_left () const
 
int num_sub_langs () const
 
Tesseract * get_sub_lang (int index) const
 
bool AnyTessLang () const
 
bool AnyLSTMLang () const
 
void SetBlackAndWhitelist ()
 
void PrepareForPageseg ()
 
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
int SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
 
void SetupWordScripts (BLOCK_LIST *blocks)
 
int AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
 
ColumnFinder * SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
 
void PrerecAllWordsPar (const GenericVector< WordData > &words)
 
void TrainLineRecognizer (const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
 
void TrainFromBoxes (const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
 
ImageData * GetLineData (const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
 
ImageData * GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
 
void LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
 
void SearchWords (PointerVector< WERD_RES > *words)
 
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
 
void SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
 
void SetupWordPassN (int pass_n, WordData *word)
 
bool RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
 
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
 
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
 
void bigram_correction_pass (PAGE_RES *page_res)
 
void blamer_pass (PAGE_RES *page_res)
 
void script_pos_pass (PAGE_RES *page_res)
 
int RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
 
bool ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
 
void AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
 
void AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
 
bool SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
 
float ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
 
float ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
 
void classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
 
void classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
 
void fix_rep_char (PAGE_RES_IT *page_res_it)
 
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
 
void match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
 
void classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
 
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
 
bool recog_interactive (PAGE_RES_IT *pr_it)
 
void set_word_fonts (WERD_RES *word)
 
void font_recognition_pass (PAGE_RES *page_res)
 
void dictionary_correction_pass (PAGE_RES *page_res)
 
bool check_debug_pt (WERD_RES *word, int location)
 
bool SubAndSuperscriptFix (WERD_RES *word_res)
 
void GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
 
WERD_RES * TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
 
bool BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
 
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
 
void write_results (PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
 
void set_unlv_suspects (WERD_RES *word)
 
UNICHAR_ID get_rep_char (WERD_RES *word)
 
bool acceptable_number_string (const char *s, const char *lengths)
 
int16_t count_alphanums (const WERD_CHOICE &word)
 
int16_t count_alphas (const WERD_CHOICE &word)
 
void read_config_file (const char *filename, SetParamConstraint constraint)
 
int init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
int init_tesseract (const char *datapath, const char *language, OcrEngineMode oem)
 
int init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
void SetupUniversalFontIds ()
 
int init_tesseract_lm (const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
 
void recognize_page (STRING &image_name)
 
void end_tesseract ()
 
bool init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
void ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
 
SVMenuNode * build_menu_new ()
 
void pgeditor_main (int width, int height, PAGE_RES *page_res)
 
void process_image_event (const SVEvent &event)
 
bool process_cmd_win_event (int32_t cmd_event, char *new_value)
 
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
 
void do_re_display (bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
 
bool word_display (PAGE_RES_IT *pr_it)
 
bool word_bln_display (PAGE_RES_IT *pr_it)
 
bool word_blank_and_set_display (PAGE_RES_IT *pr_its)
 
bool word_set_display (PAGE_RES_IT *pr_it)
 
bool word_dumper (PAGE_RES_IT *pr_it)
 
void blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box)
 
void make_reject_map (WERD_RES *word, ROW *row, int16_t pass)
 
bool one_ell_conflict (WERD_RES *word_res, bool update_map)
 
int16_t first_alphanum_index (const char *word, const char *word_lengths)
 
int16_t first_alphanum_offset (const char *word, const char *word_lengths)
 
int16_t alpha_count (const char *word, const char *word_lengths)
 
bool word_contains_non_1_digit (const char *word, const char *word_lengths)
 
void dont_allow_1Il (WERD_RES *word)
 
int16_t count_alphanums (WERD_RES *word)
 
void flip_0O (WERD_RES *word)
 
bool non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
 
void nn_match_word (WERD_RES *word, ROW *row)
 
void nn_recover_rejects (WERD_RES *word, ROW *row)
 
void set_done (WERD_RES *word, int16_t pass)
 
int16_t safe_dict_word (const WERD_RES *werd_res)
 
void flip_hyphens (WERD_RES *word)
 
void reject_I_1_L (WERD_RES *word)
 
void reject_edge_blobs (WERD_RES *word)
 
void reject_mostly_rejects (WERD_RES *word)
 
bool word_adaptable (WERD_RES *word, uint16_t mode)
 
void recog_word_recursive (WERD_RES *word)
 
void recog_word (WERD_RES *word)
 
void split_and_recog_word (WERD_RES *word)
 
void split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
 
void join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
 
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
 
int16_t fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
 
void dump_words (WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
 
bool fixspace_thinks_word_done (WERD_RES *word)
 
GARBAGE_LEVEL garbage_word (WERD_RES *word, bool ok_dict_word)
 
bool potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
 
void tilde_crunch (PAGE_RES_IT &page_res_it)
 
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
 
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void quality_based_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void convert_bad_unlv_chs (WERD_RES *word_res)
 
void tilde_delete (PAGE_RES_IT &page_res_it)
 
int16_t word_blob_quality (WERD_RES *word, ROW *row)
 
void word_char_quality (WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
 
void unrej_good_chs (WERD_RES *word, ROW *row)
 
int16_t count_outline_errs (char c, int16_t outline_count)
 
int16_t word_outline_errs (WERD_RES *word)
 
bool terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
 
CRUNCH_MODE word_deletable (WERD_RES *word, int16_t &delete_mode)
 
int16_t failure_count (WERD_RES *word)
 
bool noise_outlines (TWERD *word)
 
void tess_segment_pass_n (int pass_n, WERD_RES *word)
 
PAGE_RES * ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
 
void PreenXHeights (BLOCK_LIST *block_list)
 
PAGE_RES * SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
 
void MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
 
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
void ReSegmentByClassification (PAGE_RES *page_res)
 
bool ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
 
bool FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
 
void SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
 
void TidyUp (PAGE_RES *page_res)
 
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
 
void CorrectClassifyWords (PAGE_RES *page_res)
 
void ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res)
 
int CountMisfitTops (WERD_RES *word_res)
 
float ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift)
 
FILE * init_recog_training (const STRING &fname)
 
void recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
 
void ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
 
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e. words that are done). If all words are contextually confirmed, the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

bool digit_or_numeric_punct (WERD_RES *word, int char_position)
 
int16_t eval_word_spacing (WERD_RES_LIST &word_res_list)
 
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
 
int16_t worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
 
float blob_noise_score (TBLOB *blob)
 
void break_noisiest_blob_word (WERD_RES_LIST &words)
 
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_fuzzy_spaces (ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
 
process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
 
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
 
tess_acceptable_word
Returns
true if the word is regarded as "good enough".
Parameters
word_choice: after context
raw_choice: before context
bool tess_acceptable_word (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Wordrec
 Wordrec ()
 
 ~Wordrec () override=default
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (SPLIT *split)
 
PRIORITY grade_sharpness (SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, int16_t num_blobs)
 
void get_fragment_lists (int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
void program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
void cc_recog (WERD_RES *word)
 
void program_editdown (int32_t elasped_time)
 
void set_pass1 ()
 
void set_pass2 ()
 
int end_recog ()
 
BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
int dict_word (const WERD_CHOICE &word)
 
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
PRIORITY point_priority (EDGEPT *point)
 
void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
void prioritize_points (TESSLINE *outline, PointHeap *points)
 
void new_min_point (EDGEPT *local_min, PointHeap *points)
 
void new_max_point (EDGEPT *local_max, PointHeap *points)
 
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
 
void chop_word_main (WERD_RES *word)
 
void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
 ~Classify () override
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
 ~CCStruct () override
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()=default
 
 ~CUtil () override
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool tessedit_resegment_from_boxes = false
 
bool tessedit_resegment_from_line_boxes = false
 
bool tessedit_train_from_boxes = false
 
bool tessedit_make_boxes_from_boxes = false
 
bool tessedit_train_line_recognizer = false
 
bool tessedit_dump_pageseg_images = false
 
int tessedit_pageseg_mode = PSM_SINGLE_BLOCK
 
int tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT
 
char * tessedit_char_blacklist = ""
 
char * tessedit_char_whitelist = ""
 
char * tessedit_char_unblacklist = ""
 
bool tessedit_ambigs_training = false
 
int pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
int ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
char * tessedit_write_params_to_file = ""
 
bool tessedit_adaption_debug = false
 
int bidi_debug = 0
 
int applybox_debug = 1
 
int applybox_page = 0
 
char * applybox_exposure_pattern = ".exp"
 
bool applybox_learn_chars_and_char_frags_mode = false
 
bool applybox_learn_ngrams_mode = false
 
bool tessedit_display_outwords = false
 
bool tessedit_dump_choices = false
 
bool tessedit_timing_debug = false
 
bool tessedit_fix_fuzzy_spaces = true
 
bool tessedit_unrej_any_wd = false
 
bool tessedit_fix_hyphens = true
 
bool tessedit_redo_xheight = true
 
bool tessedit_enable_doc_dict = true
 
bool tessedit_debug_fonts = false
 
bool tessedit_debug_block_rejection = false
 
bool tessedit_enable_bigram_correction = true
 
bool tessedit_enable_dict_correction = false
 
int tessedit_bigram_debug = 0
 
bool enable_noise_removal = true
 
int debug_noise_removal = 0
 
double noise_cert_basechar = -8.0
 
double noise_cert_disjoint = -2.5
 
double noise_cert_punc = -2.5
 
double noise_cert_factor = 0.375
 
int noise_maxperblob = 8
 
int noise_maxperword = 16
 
int debug_x_ht_level = 0
 
bool debug_acceptable_wds = false
 
char * chs_leading_punct = "('`\""
 
char * chs_trailing_punct1 = ").,;:?!"
 
char * chs_trailing_punct2 = ")'`\""
 
double quality_rej_pc = 0.08
 
double quality_blob_pc = 0.0
 
double quality_outline_pc = 1.0
 
double quality_char_pc = 0.95
 
int quality_min_initial_alphas_reqd = 2
 
int tessedit_tess_adaption_mode = 0x27
 
bool tessedit_minimal_rej_pass1 = false
 
bool tessedit_test_adaption = false
 
bool tessedit_matcher_log = false
 
int tessedit_test_adaption_mode = 3
 
bool test_pt = false
 
double test_pt_x = 99999.99
 
double test_pt_y = 99999.99
 
int multilang_debug_level = 0
 
int paragraph_debug_level = 0
 
bool paragraph_text_based = true
 
bool lstm_use_matrix = 1
 
char * outlines_odd = "%| "
 
char * outlines_2 = "ij!?%\":;"
 
bool docqual_excuse_outline_errs = false
 
bool tessedit_good_quality_unrej = true
 
bool tessedit_use_reject_spaces = true
 
double tessedit_reject_doc_percent = 65.00
 
double tessedit_reject_block_percent = 45.00
 
double tessedit_reject_row_percent = 40.00
 
double tessedit_whole_wd_rej_row_percent = 70.00
 
bool tessedit_preserve_blk_rej_perfect_wds = true
 
bool tessedit_preserve_row_rej_perfect_wds = true
 
bool tessedit_dont_blkrej_good_wds = false
 
bool tessedit_dont_rowrej_good_wds = false
 
int tessedit_preserve_min_wd_len = 2
 
bool tessedit_row_rej_good_docs = true
 
double tessedit_good_doc_still_rowrej_wd = 1.1
 
bool tessedit_reject_bad_qual_wds = true
 
bool tessedit_debug_doc_rejection = false
 
bool tessedit_debug_quality_metrics = false
 
bool bland_unrej = false
 
double quality_rowrej_pc = 1.1
 
bool unlv_tilde_crunching = false
 
bool hocr_font_info = false
 
bool hocr_char_boxes = false
 
bool crunch_early_merge_tess_fails = true
 
bool crunch_early_convert_bad_unlv_chs = false
 
double crunch_terrible_rating = 80.0
 
bool crunch_terrible_garbage = true
 
double crunch_poor_garbage_cert = -9.0
 
double crunch_poor_garbage_rate = 60
 
double crunch_pot_poor_rate = 40
 
double crunch_pot_poor_cert = -8.0
 
bool crunch_pot_garbage = true
 
double crunch_del_rating = 60
 
double crunch_del_cert = -10.0
 
double crunch_del_min_ht = 0.7
 
double crunch_del_max_ht = 3.0
 
double crunch_del_min_width = 3.0
 
double crunch_del_high_word = 1.5
 
double crunch_del_low_word = 0.5
 
double crunch_small_outlines_size = 0.6
 
int crunch_rating_max = 10
 
int crunch_pot_indicators = 1
 
bool crunch_leave_ok_strings = true
 
bool crunch_accept_ok = true
 
bool crunch_leave_accept_strings = false
 
bool crunch_include_numerals = false
 
int crunch_leave_lc_strings = 4
 
int crunch_leave_uc_strings = 4
 
int crunch_long_repetitions = 3
 
int crunch_debug = 0
 
int fixsp_non_noise_limit = 1
 
double fixsp_small_outlines_size = 0.28
 
bool tessedit_prefer_joined_punct = false
 
int fixsp_done_mode = 1
 
int debug_fix_space_level = 0
 
char * numeric_punctuation = ".,"
 
int x_ht_acceptance_tolerance = 8
 
int x_ht_min_change = 8
 
int superscript_debug = 0
 
double superscript_worse_certainty = 2.0
 
double superscript_bettered_certainty = 0.97
 
double superscript_scaledown_ratio = 0.4
 
double subscript_max_y_top = 0.5
 
double superscript_min_y_bottom = 0.3
 
bool tessedit_write_block_separators = false
 
bool tessedit_write_rep_codes = false
 
bool tessedit_write_unlv = false
 
bool tessedit_create_txt = false
 
bool tessedit_create_hocr = false
 
bool tessedit_create_alto = false
 
bool tessedit_create_lstmbox = false
 
bool tessedit_create_tsv = false
 
bool tessedit_create_wordstrbox = false
 
bool tessedit_create_pdf = false
 
bool textonly_pdf = false
 
int jpg_quality = 85
 
int user_defined_dpi = 0
 
int min_characters_to_try = 50
 
char * unrecognised_char = "|"
 
int suspect_level = 99
 
int suspect_space_level = 100
 
int suspect_short_words = 2
 
bool suspect_constrain_1Il = false
 
double suspect_rating_per_ch = 999.9
 
double suspect_accept_rating = -999.9
 
bool tessedit_minimal_rejection = false
 
bool tessedit_zero_rejection = false
 
bool tessedit_word_for_word = false
 
bool tessedit_zero_kelvin_rejection = false
 
bool tessedit_consistent_reps = true
 
int tessedit_reject_mode = 0
 
bool tessedit_rejection_debug = false
 
bool tessedit_flip_0O = true
 
double tessedit_lower_flip_hyphen = 1.5
 
double tessedit_upper_flip_hyphen = 1.8
 
bool rej_trust_doc_dawg = false
 
bool rej_1Il_use_dict_word = false
 
bool rej_1Il_trust_permuter_type = true
 
bool rej_use_tess_accepted = true
 
bool rej_use_tess_blanks = true
 
bool rej_use_good_perm = true
 
bool rej_use_sensible_wd = false
 
bool rej_alphas_in_number_perm = false
 
double rej_whole_of_mostly_reject_word_fract = 0.85
 
int tessedit_image_border = 2
 
char * ok_repeated_ch_non_alphanum_wds = "-?*\075"
 
char * conflict_set_I_l_1 = "Il1[]"
 
int min_sane_x_ht_pixels = 8
 
bool tessedit_create_boxfile = false
 
int tessedit_page_number = -1
 
bool tessedit_write_images = false
 
bool interactive_display_mode = false
 
char * file_type = ".tif"
 
bool tessedit_override_permuter = true
 
char * tessedit_load_sublangs = ""
 
bool tessedit_use_primary_params_model = false
 
double min_orientation_margin = 7.0
 
bool textord_tabfind_show_vlines = false
 
bool textord_use_cjk_fp_model = false
 
bool poly_allow_detailed_fx = false
 
bool tessedit_init_config_only = false
 
bool textord_equation_detect = false
 
bool textord_tabfind_vertical_text = true
 
bool textord_tabfind_force_vertical_text = false
 
double textord_tabfind_vertical_text_ratio = 0.5
 
double textord_tabfind_aligned_gap_fraction = 0.75
 
int tessedit_parallelize = 0
 
bool preserve_interword_spaces = false
 
char * page_separator = "\f"
 
int lstm_choice_mode = 0
 
- Public Attributes inherited from tesseract::Wordrec
bool merge_fragments_in_matrix = true
 
bool wordrec_no_block = false
 
bool wordrec_enable_assoc = true
 
bool force_word_assoc = false
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = false
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = false
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
std::unique_ptr< LanguageModel > language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = false
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = true
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = false
 
bool matcher_debug_separate_windows = false
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = false
 
bool use_ambigs_for_adaption = false
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 174 of file tesseractclass.h.

Constructor & Destructor Documentation

tesseract::Tesseract::Tesseract ( )

Definition at line 52 of file tesseractclass.cpp.

54  "Take segmentation and labeling from box file",
55  this->params()),
57  "Conversion of word/line box file to char box file",
58  this->params()),
60  "Generate training data from boxed chars", this->params()),
62  "Generate more boxes from boxed chars", this->params()),
64  "Break input into lines and remap boxes if present",
65  this->params()),
67  "Dump intermediate images made during page segmentation",
68  this->params()),
69  // The default for pageseg_mode is the old behaviour, so as not to
70  // upset anything that relies on that.
71  INT_MEMBER(
73  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, 4=column,"
74  " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
75  "11=sparse_text, 12=sparse_text+osd, 13=raw_line"
76  " (Values from PageSegMode enum in publictypes.h)",
77  this->params()),
79  "Which OCR engine(s) to run (Tesseract, LSTM, both)."
80  " Defaults to loading and running the most accurate"
81  " available.",
82  this->params()),
84  "Blacklist of chars not to recognize", this->params()),
86  "Whitelist of chars to recognize", this->params()),
88  "List of chars to override tessedit_char_blacklist",
89  this->params()),
91  "Perform training for ambiguities", this->params()),
94  "Whether to use the top-line splitting process for Devanagari "
95  "documents while performing page-segmentation.",
96  this->params()),
99  "Whether to use the top-line splitting process for Devanagari "
100  "documents while performing ocr.",
101  this->params()),
103  "Write all parameters to the given file.", this->params()),
105  "Generate and print debug"
106  " information for adaption",
107  this->params()),
108  INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
109  INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
110  INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
111  this->params()),
113  "Exposure value follows"
114  " this pattern in the image filename. The name of the image"
115  " files are expected to be in the form"
116  " [lang].[fontname].exp[num].tif",
117  this->params()),
119  "Learn both character fragments (as is done in the"
120  " special low exposure mode) as well as unfragmented"
121  " characters.",
122  this->params()),
124  "Each bounding box"
125  " is assumed to contain ngrams. Only learn the ngrams"
126  " whose outlines overlap horizontally.",
127  this->params()),
128  BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
129  this->params()),
130  BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
131  this->params()),
132  BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
133  this->params()),
135  "Try to improve fuzzy spaces", this->params()),
137  "Don't bother with word plausibility", this->params()),
138  BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
139  this->params()),
140  BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
141  this->params()),
143  "Add words to the document dictionary", this->params()),
144  BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
145  this->params()),
146  BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
147  this->params()),
149  "Enable correction based on the word bigram dictionary.",
150  this->params()),
152  "Enable single word correction based on the dictionary.",
153  this->params()),
155  "Amount of debug output for bigram correction.",
156  this->params()),
158  "Remove and conditionally reassign small outlines when they"
159  " confuse layout analysis, determining diacritics vs noise",
160  this->params()),
161  INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
162  this->params()),
163  // Worst (min) certainty, for which a diacritic is allowed to make the
164  // base
165  // character worse and still be included.
167  "Hingepoint for base char certainty", this->params()),
168  // Worst (min) certainty, for which a non-overlapping diacritic is allowed
169  // to make the base character worse and still be included.
171  "Hingepoint for disjoint certainty", this->params()),
172  // Worst (min) certainty, for which a diacritic is allowed to make a new
173  // stand-alone blob.
175  "Threshold for new punc char certainty", this->params()),
176  // Factor of certainty margin for adding diacritics to not count as worse.
178  "Scaling on certainty diff from Hingepoint",
179  this->params()),
180  INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
181  this->params()),
182  INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
183  this->params()),
184  INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
185  BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk",
186  this->params()),
187  STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
188  this->params()),
189  STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
190  this->params()),
191  STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
192  this->params()),
194  "good_quality_doc lte rejection limit", this->params()),
196  "good_quality_doc gte good blobs limit", this->params()),
198  "good_quality_doc lte outline error limit", this->params()),
200  "good_quality_doc gte good char limit", this->params()),
201  INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
202  this->params()),
204  "Adaptation decision algorithm for tess", this->params()),
206  "Do minimal rejection on pass 1 output", this->params()),
207  BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
208  this->params()),
209  BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity",
210  this->params()),
212  "Adaptation decision algorithm for tess", this->params()),
213  BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
214  double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
215  double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
216  INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
217  this->params()),
218  INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
219  this->params()),
221  "Run paragraph detection on the post-text-recognition "
222  "(more accurate)",
223  this->params()),
225  "Use ratings matrix/beam search with lstm", this->params()),
226  STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
227  this->params()),
228  STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
229  this->params()),
231  "Allow outline errs in unrejection?", this->params()),
233  "Reduce rejection on good docs", this->params()),
234  BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
235  this->params()),
237  "%rej allowed before rej whole doc", this->params()),
239  "%rej allowed before rej whole block", this->params()),
241  "%rej allowed before rej whole row", this->params()),
243  "Number of row rejects in whole word rejects"
244  " which prevents whole row rejection",
245  this->params()),
247  "Only rej partially rejected words in block rejection",
248  this->params()),
250  "Only rej partially rejected words in row rejection",
251  this->params()),
253  "Use word segmentation quality metric", this->params()),
255  "Use word segmentation quality metric", this->params()),
257  "Only preserve wds longer than this", this->params()),
259  "Apply row rejection to good docs", this->params()),
261  "rej good doc wd if more than this fraction rejected",
262  this->params()),
264  "Reject all bad quality wds", this->params()),
265  BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
266  this->params()),
268  "Output data to debug file", this->params()),
269  BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks",
270  this->params()),
272  "good_quality_doc gte good char limit", this->params()),
274  "Mark v.bad words for tilde crunch", this->params()),
275  BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
276  this->params()),
277  BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
278  this->params()),
279  BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
280  this->params()),
282  "Take out ~^ early?", this->params()),
283  double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
284  this->params()),
285  BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
287  "crunch garbage cert lt this", this->params()),
289  "crunch garbage rating lt this", this->params()),
290  double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
291  this->params()),
292  double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
293  this->params()),
294  BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage",
295  this->params()),
296  double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
297  this->params()),
298  double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
299  this->params()),
300  double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
301  this->params()),
302  double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
303  this->params()),
305  "Del if word width lt xht x this", this->params()),
307  "Del if word gt xht x this above bl", this->params()),
309  "Del if word gt xht x this below bl", this->params()),
310  double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
311  this->params()),
312  INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
313  this->params()),
315  "How many potential indicators needed", this->params()),
316  BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
317  this->params()),
318  BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
319  this->params()),
321  "Don't pot crunch sensible strings", this->params()),
322  BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
323  this->params()),
325  "Don't crunch words with long lower case strings",
326  this->params()),
328  "Don't crunch words with long lower case strings",
329  this->params()),
331  "Crunch words with long repetitions", this->params()),
332  INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
334  "How many non-noise blbs either side?", this->params()),
335  double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
336  this->params()),
338  "Reward punctuation joins", this->params()),
339  INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
340  this->params()),
341  INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
342  this->params()),
344  "Punct. chs expected WITHIN numbers", this->params()),
346  "Max allowed deviation of blob top outside of font data",
347  this->params()),
349  "Min change in xht before actually trying it", this->params()),
351  "Debug level for sub & superscript fixer", this->params()),
354  "How many times worse "
355  "certainty does a superscript position glyph need to be for "
356  "us to try classifying it as a char with a different "
357  "baseline?",
358  this->params()),
361  "What reduction in "
362  "badness do we think sufficient to choose a superscript "
363  "over what we'd thought. For example, a value of 0.6 means "
364  "we want to reduce badness of certainty by at least 40%",
365  this->params()),
367  "A superscript scaled down more than this is unbelievably "
368  "small. For example, 0.3 means we expect the font size to "
369  "be no smaller than 30% of the text line font size.",
370  this->params()),
372  "Maximum top of a character measured as a multiple of "
373  "x-height above the baseline for us to reconsider whether "
374  "it's a subscript.",
375  this->params()),
377  "Minimum bottom of a character measured as a multiple of "
378  "x-height above the baseline for us to reconsider whether "
379  "it's a superscript.",
380  this->params()),
382  "Write block separators in output", this->params()),
383  BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
384  this->params()),
385  BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
386  this->params()),
387  BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
388  this->params()),
389  BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
390  this->params()),
391  BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
392  this->params()),
393  BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
394  this->params()),
395  BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
396  this->params()),
397  BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
398  this->params()),
399  BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
400  this->params()),
401  BOOL_MEMBER(textonly_pdf, false,
402  "Create PDF with only one invisible text layer",
403  this->params()),
404  INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()),
405  INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image",
406  this->params()),
408  "Specify minimum characters to try during OSD",
409  this->params()),
411  "Output char for unidentified blobs", this->params()),
412  INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
414  "Min suspect level for rejecting spaces", this->params()),
416  "Don't suspect dict wds longer than this", this->params()),
417  BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
418  this->params()),
420  "Don't touch bad rating limit", this->params()),
421  double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
422  this->params()),
424  "Only reject tess failures", this->params()),
425  BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
426  this->params()),
428  "Make output have exactly one word per WERD", this->params()),
430  "Don't reject ANYTHING AT ALL", this->params()),
432  "Force all rep chars the same", this->params()),
433  INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
434  this->params()),
435  BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
436  this->params()),
437  BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
438  this->params()),
440  "Aspect ratio dot/hyphen test", this->params()),
442  "Aspect ratio dot/hyphen test", this->params()),
444  "Use DOC dawg in 11l conf. detector", this->params()),
445  BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
446  this->params()),
447  BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
448  this->params()),
449  BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
450  this->params()),
451  BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
452  this->params()),
453  BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
454  this->params()),
455  BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
456  this->params()),
457  BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
458  this->params()),
460  "if >this fract", this->params()),
461  INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
462  this->params()),
464  "Allow NN to unrej", this->params()),
465  STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
466  this->params()),
467  INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
468  this->params()),
469  BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
470  this->params()),
472  "-1 -> All pages"
473  " , else specific page to process",
474  this->params()),
476  "Capture the image from the IPE", this->params()),
477  BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
478  this->params()),
479  STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
480  BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
481  this->params()),
483  "List of languages to load with this one", this->params()),
485  "In multilingual mode use params model of the"
486  " primary language",
487  this->params()),
489  "Min acceptable orientation margin", this->params()),
490  BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
491  this->params()),
492  BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model",
493  this->params()),
495  "Allow feature extractors to see the original outline",
496  this->params()),
498  "Only initialize with the config file. Useful if the "
499  "instance is not going to be used for OCR but say only "
500  "for layout analysis.",
501  this->params()),
502  BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
503  this->params()),
505  "Enable vertical detection", this->params()),
507  "Force using vertical text page mode", this->params()),
510  "Fraction of textlines deemed vertical to use vertical page "
511  "mode",
512  this->params()),
515  "Fraction of height used as a minimum gap for aligned blobs.",
516  this->params()),
517  INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
518  this->params()),
520  "Preserve multiple interword spaces", this->params()),
522  "Page separator (default is form feed control character)",
523  this->params()),
525  "Allows to include alternative symbols choices in the hOCR output. "
526  "Valid input values are 0, 1, 2 and 3. 0 is the default value. "
527  "With 1 the alternative symbol choices per timestep are included. "
528  "With 2 the alternative symbol choices are accumulated per "
529  "character. ",
530  this->params()),
531 
532  backup_config_file_(nullptr),
533  pix_binary_(nullptr),
534  pix_grey_(nullptr),
535  pix_original_(nullptr),
536  pix_thresholds_(nullptr),
537  source_resolution_(0),
538  textord_(this),
539  right_to_left_(false),
540  scaled_color_(nullptr),
541  scaled_factor_(-1),
542  deskew_(1.0f, 0.0f),
543  reskew_(1.0f, 0.0f),
544  most_recently_used_(this),
545  font_table_size_(0),
546  equ_detect_(nullptr),
547 #ifndef ANDROID_BUILD
548  lstm_recognizer_(nullptr),
549 #endif
550  train_line_page_num_(0) {
551 }
ParamsVectors * params()
Definition: ccutil.h:65
char * tessedit_write_params_to_file
double tessedit_reject_row_percent
bool applybox_learn_chars_and_char_frags_mode
double rej_whole_of_mostly_reject_word_fract
double textord_tabfind_aligned_gap_fraction
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:324
double superscript_scaledown_ratio
double textord_tabfind_vertical_text_ratio
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:318
bool tessedit_preserve_blk_rej_perfect_wds
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:315
bool tessedit_enable_bigram_correction
bool tessedit_preserve_row_rej_perfect_wds
bool crunch_early_convert_bad_unlv_chs
double tessedit_good_doc_still_rowrej_wd
char * ok_repeated_ch_non_alphanum_wds
#define INT_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:327
double superscript_bettered_certainty
bool tessedit_resegment_from_line_boxes
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:321
double tessedit_reject_doc_percent
double tessedit_whole_wd_rej_row_percent
double superscript_worse_certainty
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:172
bool textord_tabfind_force_vertical_text
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:330
double tessedit_reject_block_percent
tesseract::Tesseract::~Tesseract ( )
override

Definition at line 553 of file tesseractclass.cpp.

553  {
554  Clear();
555  pixDestroy(&pix_original_);
556  end_tesseract();
557  sub_langs_.delete_data_pointers();
558 #ifndef ANDROID_BUILD
559  delete lstm_recognizer_;
560  lstm_recognizer_ = nullptr;
561 #endif
562 }

Member Function Documentation

bool tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 387 of file output.cpp.

388  {
389  bool prev_digit = false;
390 
391  if (*lengths == 1 && *s == '(')
392  s++;
393 
394  if (*lengths == 1 &&
395  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
396  s++;
397 
398  for (; *s != '\0'; s += *(lengths++)) {
399  if (unicharset.get_isdigit(s, *lengths))
400  prev_digit = true;
401  else if (prev_digit &&
402  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
403  prev_digit = false;
404  else if (prev_digit && *lengths == 1 &&
405  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
406  return true;
407  else if (prev_digit &&
408  *lengths == 1 && (*s == '%') &&
409  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
410  (*(s + *lengths + *(lengths + 1)) == '\0'))
411  return true;
412  else
413  return false;
414  }
415  return true;
416 }
UNICHARSET unicharset
Definition: ccutil.h:71
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET &  char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1759 of file control.cpp.

1760  {
1761  int i = 0;
1762  int offset = 0;
1763  int leading_punct_count;
1764  int upper_count = 0;
1765  int hyphen_pos = -1;
1766  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1767 
1768  if (strlen (lengths) > 20)
1769  return word_type;
1770 
1771  /* Single Leading punctuation char*/
1772 
1773  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1774  offset += lengths[i++];
1775  leading_punct_count = i;
1776 
1777  /* Initial cap */
1778  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1779  offset += lengths[i++];
1780  upper_count++;
1781  }
1782  if (upper_count > 1) {
1783  word_type = AC_UPPER_CASE;
1784  } else {
1785  /* Lower case word, possibly with an initial cap */
1786  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1787  offset += lengths[i++];
1788  }
1789  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1790  goto not_a_word;
1791  /*
1792  Allow a single hyphen in a lower case word
1793  - don't trust upper case - I've seen several cases of "H" -> "I-I"
1794  */
1795  if (lengths[i] == 1 && s[offset] == '-') {
1796  hyphen_pos = i;
1797  offset += lengths[i++];
1798  if (s[offset] != '\0') {
1799  while ((s[offset] != '\0') &&
1800  char_set.get_islower(s + offset, lengths[i])) {
1801  offset += lengths[i++];
1802  }
1803  if (i < hyphen_pos + 3)
1804  goto not_a_word;
1805  }
1806  } else {
1807  /* Allow "'s" in NON hyphenated lower case words */
1808  if (lengths[i] == 1 && (s[offset] == '\'') &&
1809  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1810  offset += lengths[i++];
1811  offset += lengths[i++];
1812  }
1813  }
1814  if (upper_count > 0)
1815  word_type = AC_INITIAL_CAP;
1816  else
1817  word_type = AC_LOWER_CASE;
1818  }
1819 
1820  /* Up to two different, constrained trailing punctuation chars */
1821  if (lengths[i] == 1 && s[offset] != '\0' &&
1822  STRING(chs_trailing_punct1).contains(s[offset]))
1823  offset += lengths[i++];
1824  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1825  s[offset - lengths[i - 1]] != s[offset] &&
1826  STRING(chs_trailing_punct2).contains (s[offset]))
1827  offset += lengths[i++];
1828 
1829  if (s[offset] != '\0')
1830  word_type = AC_UNACCEPTABLE;
1831 
1832  not_a_word:
1833 
1834  if (word_type == AC_UNACCEPTABLE) {
1835  /* Look for abbreviation string */
1836  i = 0;
1837  offset = 0;
1838  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1839  word_type = AC_UC_ABBREV;
1840  while (s[offset] != '\0' &&
1841  char_set.get_isupper(s + offset, lengths[i]) &&
1842  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1843  offset += lengths[i++];
1844  offset += lengths[i++];
1845  }
1846  }
1847  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1848  word_type = AC_LC_ABBREV;
1849  while (s[offset] != '\0' &&
1850  char_set.get_islower(s + offset, lengths[i]) &&
1851  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1852  offset += lengths[i++];
1853  offset += lengths[i++];
1854  }
1855  }
1856  if (s[offset] != '\0')
1857  word_type = AC_UNACCEPTABLE;
1858  }
1859 
1860  return word_type;
1861 }
Unacceptable word.
Definition: control.h:30
Definition: strngs.h:45
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
a.b.c.
Definition: control.h:34
ALL but initial lc.
Definition: control.h:33
A.B.C.
Definition: control.h:35
bool contains(char c) const
Definition: strngs.cpp:185
ALL upper case.
Definition: control.h:32
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
ALL lower case.
Definition: control.h:31
int16_t tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 496 of file reject.cpp.

497  {
498  int16_t i;
499  int16_t offset;
500  int16_t count = 0;
501 
502  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
503  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
504  count++;
505  }
506  return count;
507 }
UNICHARSET unicharset
Definition: ccutil.h:71
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
int count(LIST var_list)
Definition: oldlist.cpp:96
void tesseract::Tesseract::ambigs_classify_and_output ( const char *  label,
PAGE_RES_IT *  pr_it,
FILE *  output_file 
)

Definition at line 211 of file recogtraining.cpp.

213  {
214  // Classify word.
215  fflush(stdout);
216  WordData word_data(*pr_it);
217  SetupWordPassN(1, &word_data);
218  classify_word_and_language(1, pr_it, &word_data);
219  WERD_RES* werd_res = word_data.word;
220  WERD_CHOICE* best_choice = werd_res->best_choice;
221  ASSERT_HOST(best_choice != nullptr);
222 
223  // Compute the number of unichars in the label.
224  GenericVector<UNICHAR_ID> encoding;
225  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
226  tprintf("Not outputting illegal unichar %s\n", label);
227  return;
228  }
229 
230  // Dump all paths through the ratings matrix (which is normally small).
231  int dim = werd_res->ratings->dimension();
232  const auto** blob_choices = new const BLOB_CHOICE*[dim];
233  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset,
234  label, output_file);
235  delete[] blob_choices;
236 }
UNICHARSET unicharset
Definition: ccutil.h:71
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
WERD_CHOICE * best_choice
Definition: pageres.h:234
MATRIX * ratings
Definition: pageres.h:230
#define ASSERT_HOST(x)
Definition: errcode.h:88
int dimension() const
Definition: matrix.h:536
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:259
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1333
WERD * word
Definition: pageres.h:188
bool tesseract::Tesseract::AnyLSTMLang ( ) const
inline

Definition at line 295 of file tesseractclass.h.

295  {
296  if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY)
297  return true;
298  for (int i = 0; i < sub_langs_.size(); ++i) {
299  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
300  return true;
301  }
302  }
303  return false;
304  }
bool tesseract::Tesseract::AnyTessLang ( ) const
inline

Definition at line 285 of file tesseractclass.h.

285  {
286  if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY)
287  return true;
288  for (int i = 0; i < sub_langs_.size(); ++i) {
289  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY)
290  return true;
291  }
292  return false;
293  }
PAGE_RES * tesseract::Tesseract::ApplyBoxes ( const STRING &  fname,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

Definition at line 109 of file applybox.cpp.

111  {
112  GenericVector<TBOX> boxes;
113  GenericVector<STRING> texts, full_texts;
114  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
115  nullptr)) {
116  return nullptr; // Can't do it.
117  }
118 
119  const int box_count = boxes.size();
120  int box_failures = 0;
121 
122  // In word mode, we use the boxes to make a word for each box, but
123  // in blob mode we use the existing words and maximally chop them first.
124  PAGE_RES* page_res = find_segmentation ?
125  nullptr : SetupApplyBoxes(boxes, block_list);
126  clear_any_old_text(block_list);
127 
128  for (int i = 0; i < box_count; i++) {
129  bool foundit = false;
130  if (page_res != nullptr) {
131  foundit = ResegmentCharBox(page_res,
132  (i == 0) ? nullptr : &boxes[i - 1],
133  boxes[i],
134  (i == box_count - 1) ? nullptr : &boxes[i + 1],
135  full_texts[i].string());
136  } else {
137  foundit = ResegmentWordBox(block_list, boxes[i],
138  (i == box_count - 1) ? nullptr : &boxes[i + 1],
139  texts[i].string());
140  }
141  if (!foundit) {
142  box_failures++;
143  ReportFailedBox(i, boxes[i], texts[i].string(),
144  "FAILURE! Couldn't find a matching blob");
145  }
146  }
147 
148  if (page_res == nullptr) {
149  // In word/line mode, we now maximally chop all the words and resegment
150  // them with the classifier.
151  page_res = SetupApplyBoxes(boxes, block_list);
152  ReSegmentByClassification(page_res);
153  }
154  if (applybox_debug > 0) {
155  tprintf("APPLY_BOXES:\n");
156  tprintf(" Boxes read from boxfile: %6d\n", box_count);
157  if (box_failures > 0)
158  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
159  }
160  TidyUp(page_res);
161  return page_res;
162 }
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:435
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:510
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:333
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:207
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:712
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:53
int size() const
Definition: genericvector.h:70
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:772
void tesseract::Tesseract::ApplyBoxTraining ( const STRING &  fontname,
PAGE_RES *  page_res 
)

Calls LearnWord to extract features for labelled blobs within each word. Features are stored in an internal buffer.

Definition at line 807 of file applybox.cpp.

807  {
808  PAGE_RES_IT pr_it(page_res);
809  int word_count = 0;
810  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
811  word_res = pr_it.forward()) {
812  LearnWord(fontname.string(), word_res);
813  ++word_count;
814  }
815  tprintf("Generated training data for %d words\n", word_count);
816 }
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:250
WERD * word
Definition: pageres.h:188
void tesseract::Tesseract::AssignDiacriticsToNewBlobs ( const GenericVector< C_OUTLINE * > &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< C_BLOB * > *  target_blobs 
)

Definition at line 1069 of file control.cpp.

1072  {
1073 #ifndef DISABLED_LEGACY_ENGINE
1074  GenericVector<bool> blob_wanted;
1075  word_wanted->init_to_size(outlines.size(), false);
1076  target_blobs->init_to_size(outlines.size(), nullptr);
1077  // Check for outlines that need to be turned into stand-alone blobs.
1078  for (int i = 0; i < outlines.size(); ++i) {
1079  if (outlines[i] == nullptr) continue;
1080  // Get a set of adjacent outlines that don't overlap any existing blob.
1081  blob_wanted.init_to_size(outlines.size(), false);
1082  int num_blob_outlines = 0;
1083  TBOX total_ol_box(outlines[i]->bounding_box());
1084  while (i < outlines.size() && outlines[i] != nullptr) {
1085  blob_wanted[i] = true;
1086  total_ol_box += outlines[i]->bounding_box();
1087  ++i;
1088  ++num_blob_outlines;
1089  }
1090  // Find the insertion point.
1091  C_BLOB_IT blob_it(real_word->cblob_list());
1092  while (!blob_it.at_last() &&
1093  blob_it.data_relative(1)->bounding_box().left() <=
1094  total_ol_box.left()) {
1095  blob_it.forward();
1096  }
1097  // Choose which combination of them we actually want and where to put
1098  // them.
1099  if (debug_noise_removal)
1100  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1101  C_BLOB* left_blob = blob_it.data();
1102  TBOX left_box = left_blob->bounding_box();
1103  C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1104  if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1105  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1106  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1107  outlines, num_blob_outlines,
1108  &blob_wanted)) {
1109  if (debug_noise_removal) tprintf("Added to left blob\n");
1110  for (int j = 0; j < blob_wanted.size(); ++j) {
1111  if (blob_wanted[j]) {
1112  (*word_wanted)[j] = true;
1113  (*target_blobs)[j] = left_blob;
1114  }
1115  }
1116  } else if (right_blob != nullptr &&
1117  (!left_box.x_overlap(total_ol_box) ||
1118  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1119  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,
1120  right_blob, outlines,
1121  num_blob_outlines, &blob_wanted)) {
1122  if (debug_noise_removal) tprintf("Added to right blob\n");
1123  for (int j = 0; j < blob_wanted.size(); ++j) {
1124  if (blob_wanted[j]) {
1125  (*word_wanted)[j] = true;
1126  (*target_blobs)[j] = right_blob;
1127  }
1128  }
1129  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr,
1130  outlines, num_blob_outlines,
1131  &blob_wanted)) {
1132  if (debug_noise_removal) tprintf("Fitted between blobs\n");
1133  for (int j = 0; j < blob_wanted.size(); ++j) {
1134  if (blob_wanted[j]) {
1135  (*word_wanted)[j] = true;
1136  (*target_blobs)[j] = nullptr;
1137  }
1138  }
1139  }
1140  }
1141 #endif // ndef DISABLED_LEGACY_ENGINE
1142 }
Definition: rect.h:34
TBOX bounding_box() const
Definition: stepblob.cpp:253
void init_to_size(int size, const T &t)
bool x_overlap(const TBOX &box) const
Definition: rect.h:401
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int size() const
Definition: genericvector.h:70
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1147
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs ( const GenericVector< C_OUTLINE * > &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< bool > *  overlapped_any_blob,
GenericVector< C_BLOB * > *  target_blobs 
)

Definition at line 1014 of file control.cpp.

1018  {
1019 #ifndef DISABLED_LEGACY_ENGINE
1020  GenericVector<bool> blob_wanted;
1021  word_wanted->init_to_size(outlines.size(), false);
1022  overlapped_any_blob->init_to_size(outlines.size(), false);
1023  target_blobs->init_to_size(outlines.size(), nullptr);
1024  // For each real blob, find the outlines that seriously overlap it.
1025  // A single blob could be several merged characters, so there can be quite
1026  // a few outlines overlapping, and the full engine needs to be used to chop
1027  // and join to get a sensible result.
1028  C_BLOB_IT blob_it(real_word->cblob_list());
1029  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1030  C_BLOB* blob = blob_it.data();
1031  const TBOX blob_box = blob->bounding_box();
1032  blob_wanted.init_to_size(outlines.size(), false);
1033  int num_blob_outlines = 0;
1034  for (int i = 0; i < outlines.size(); ++i) {
1035  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
1036  !(*word_wanted)[i]) {
1037  blob_wanted[i] = true;
1038  (*overlapped_any_blob)[i] = true;
1039  ++num_blob_outlines;
1040  }
1041  }
1042  if (debug_noise_removal) {
1043  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1044  blob_box.print();
1045  }
1046  // If any outlines overlap the blob, and not too many, classify the blob
1047  // (using the full engine, languages and all), and choose the maximal
1048  // combination of outlines that doesn't hurt the end-result classification
1049  // by too much. Mark them as wanted.
1050  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1051  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1052  outlines, num_blob_outlines,
1053  &blob_wanted)) {
1054  for (int i = 0; i < blob_wanted.size(); ++i) {
1055  if (blob_wanted[i]) {
1056  // Claim the outline and record where it is going.
1057  (*word_wanted)[i] = true;
1058  (*target_blobs)[i] = blob;
1059  }
1060  }
1061  }
1062  }
1063  }
1064 #endif // ndef DISABLED_LEGACY_ENGINE
1065 }
Definition: rect.h:34
void print() const
Definition: rect.h:278
TBOX bounding_box() const
Definition: stepblob.cpp:253
void init_to_size(int size, const T &t)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int size() const
Definition: genericvector.h:70
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:412
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1147
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
int tesseract::Tesseract::AutoPageSeg ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
BLOBNBOX_LIST *  diacritic_blobs,
Tesseract *  osd_tess,
OSResults *  osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If diacritic_blobs is non-null, then diacritics/noise blobs, that would confuse layout analysis by causing textline overlap, are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 200 of file pagesegmain.cpp.

203  {
204  Pix* photomask_pix = nullptr;
205  Pix* musicmask_pix = nullptr;
206  // The blocks made by the ColumnFinder. Moved to blocks before return.
207  BLOCK_LIST found_blocks;
208  TO_BLOCK_LIST temp_blocks;
209 
210  ColumnFinder* finder = SetupPageSegAndDetectOrientation(
211  pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
212  &musicmask_pix);
213  int result = 0;
214  if (finder != nullptr) {
215  TO_BLOCK_IT to_block_it(&temp_blocks);
216  TO_BLOCK* to_block = to_block_it.data();
217  if (musicmask_pix != nullptr) {
218  // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
219  // blocks separately. For now combine with photomask_pix.
220  pixOr(photomask_pix, photomask_pix, musicmask_pix);
221  }
222  if (equ_detect_) {
223  finder->SetEquationDetect(equ_detect_);
224  }
225  result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
226  to_block, photomask_pix, pix_thresholds_,
227  pix_grey_, &pixa_debug_, &found_blocks,
228  diacritic_blobs, to_blocks);
229  if (result >= 0)
230  finder->GetDeskewVectors(&deskew_, &reskew_);
231  delete finder;
232  }
233  pixDestroy(&photomask_pix);
234  pixDestroy(&musicmask_pix);
235  if (result < 0) return result;
236 
237  blocks->clear();
238  BLOCK_IT block_it(blocks);
239  // Move the found blocks to the input/output blocks.
240  block_it.add_list_after(&found_blocks);
241  return result;
242 }
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
bool tesseract::Tesseract::BelievableSuperscript ( bool  debug,
const WERD_RES &  word,
float  certainty_threshold,
int *  left_ok,
int *  right_ok 
) const

Return whether this is believable superscript or subscript text.

We insist that:

  • there are no punctuation marks.
  • there are no italics.
  • no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
  • each character is at least as certain as certainty_threshold.
Parameters
[in] debug: If true, spew debug output.
[in] word: The word whose best_choice we're evaluating.
[in] certainty_threshold: If any of the characters have less certainty than this, reject.
[out] left_ok: How many left-side characters were ok?
[out] right_ok: How many right-side characters were ok?
Returns
Whether the complete best choice is believable as a superscript.

Definition at line 522 of file superscript.cpp.

526  {
527  int initial_ok_run_count = 0;
528  int ok_run_count = 0;
529  float worst_certainty = 0.0f;
530  const WERD_CHOICE &wc = *word.best_choice;
531 
532  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
533  for (int i = 0; i < wc.length(); i++) {
534  TBLOB *blob = word.rebuild_word->blobs[i];
535  UNICHAR_ID unichar_id = wc.unichar_id(i);
536  float char_certainty = wc.certainty(i);
537  bool bad_certainty = char_certainty < certainty_threshold;
538  bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
539  bool is_italic = word.fontinfo && word.fontinfo->is_italic();
540  BLOB_CHOICE *choice = word.GetBlobChoice(i);
541  if (choice && fontinfo_table.size() > 0) {
542  // Get better information from the specific choice, if available.
543  int font_id1 = choice->fontinfo_id();
544  bool font1_is_italic = font_id1 >= 0
545  ? fontinfo_table.get(font_id1).is_italic() : false;
546  int font_id2 = choice->fontinfo_id2();
547  is_italic = font1_is_italic &&
548  (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
549  }
550 
551  float height_fraction = 1.0f;
552  float char_height = blob->bounding_box().height();
553  float normal_height = char_height;
554  if (wc.unicharset()->top_bottom_useful()) {
555  int min_bot, max_bot, min_top, max_top;
556  wc.unicharset()->get_top_bottom(unichar_id,
557  &min_bot, &max_bot,
558  &min_top, &max_top);
559  float hi_height = max_top - max_bot;
560  float lo_height = min_top - min_bot;
561  normal_height = (hi_height + lo_height) / 2;
562  if (normal_height >= kBlnXHeight) {
563  // Only ding characters that we have decent information for because
564  // they're supposed to be normal sized, not tiny specks or dashes.
565  height_fraction = char_height / normal_height;
566  }
567  }
568  bool bad_height = height_fraction < superscript_scaledown_ratio;
569 
570  if (debug) {
571  if (is_italic) {
572  tprintf(" Rejecting: superscript is italic.\n");
573  }
574  if (is_punc) {
575  tprintf(" Rejecting: punctuation present.\n");
576  }
577  const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
578  if (bad_certainty) {
579  tprintf(" Rejecting: don't believe character %s with certainty %.2f "
580  "which is less than threshold %.2f\n", char_str,
581  char_certainty, certainty_threshold);
582  }
583  if (bad_height) {
584  tprintf(" Rejecting: character %s seems too small @ %.2f versus "
585  "expected %.2f\n", char_str, char_height, normal_height);
586  }
587  }
588  if (bad_certainty || bad_height || is_punc || is_italic) {
589  if (ok_run_count == i) {
590  initial_ok_run_count = ok_run_count;
591  }
592  ok_run_count = 0;
593  } else {
594  ok_run_count++;
595  }
596  if (char_certainty < worst_certainty) {
597  worst_certainty = char_certainty;
598  }
599  }
600  bool all_ok = ok_run_count == wc.length();
601  if (all_ok && debug) {
602  tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
603  }
604  if (!all_ok) {
605  if (left_ok) *left_ok = initial_ok_run_count;
606  if (right_ok) *right_ok = ok_run_count;
607  }
608  return all_ok;
609 }
const int kBlnXHeight
Definition: normalis.h:24
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519
TWERD * rebuild_word
Definition: pageres.h:259
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
double superscript_scaledown_ratio
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:754
bool is_italic() const
Definition: fontinfo.h:111
TBOX bounding_box() const
Definition: blobs.cpp:472
const T & get(int id) const
Return the object from an id.
int length() const
Definition: ratngs.h:303
Definition: blobs.h:263
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
int16_t height() const
Definition: rect.h:108
int size() const
Return the size used.
int16_t fontinfo_id() const
Definition: ratngs.h:86
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
const FontInfo * fontinfo
Definition: pageres.h:303
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool top_bottom_useful() const
Definition: unicharset.h:537
float certainty() const
Definition: ratngs.h:330
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
int16_t fontinfo_id2() const
Definition: ratngs.h:89
int UNICHAR_ID
Definition: unichar.h:34
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
Pix* tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 233 of file tesseractclass.h.

233  {
234  if (pixGetWidth(pix_original_) == ImageWidth()) {
235  return pix_original_;
236  } else if (pix_grey_ != nullptr) {
237  return pix_grey_;
238  } else {
239  return pix_binary_;
240  }
241  }
int ImageWidth() const
void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES *  page_res)

Definition at line 468 of file control.cpp.

468  {
469  PAGE_RES_IT word_it(page_res);
470 
471  WERD_RES *w_prev = nullptr;
472  WERD_RES *w = word_it.word();
473  while (true) {
474  w_prev = w;
475  while (word_it.forward() != nullptr &&
476  (!word_it.word() || word_it.word()->part_of_combo)) {
477  // advance word_it, skipping over parts of combos
478  }
479  if (!word_it.word()) break;
480  w = word_it.word();
481  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
482  continue;
483  }
484  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
485  if (tessedit_bigram_debug) {
486  tprintf("Skipping because one of the words is W_REP_CHAR\n");
487  }
488  continue;
489  }
490  // Two words sharing the same language model, excellent!
491  GenericVector<WERD_CHOICE *> overrides_word1;
492  GenericVector<WERD_CHOICE *> overrides_word2;
493 
494  const STRING orig_w1_str = w_prev->best_choice->unichar_string();
495  const STRING orig_w2_str = w->best_choice->unichar_string();
496  WERD_CHOICE prev_best(w->uch_set);
497  {
498  int w1start, w1end;
499  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
500  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
501  }
502  WERD_CHOICE this_best(w->uch_set);
503  {
504  int w2start, w2end;
505  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
506  this_best = w->best_choice->shallow_copy(w2start, w2end);
507  }
508 
509  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
510  if (tessedit_bigram_debug) {
511  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
512  orig_w1_str.string(), orig_w2_str.string());
513  }
514  continue;
515  }
516  if (tessedit_bigram_debug > 2) {
517  tprintf("Examining alt choices for \"%s %s\".\n",
518  orig_w1_str.string(), orig_w2_str.string());
519  }
520  if (tessedit_bigram_debug > 1) {
521  if (!w_prev->best_choices.singleton()) {
522  w_prev->PrintBestChoices();
523  }
524  if (!w->best_choices.singleton()) {
525  w->PrintBestChoices();
526  }
527  }
528  float best_rating = 0.0;
529  int best_idx = 0;
530  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
531  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
532  WERD_CHOICE *p1 = prev_it.data();
533  WERD_CHOICE strip1(w->uch_set);
534  {
535  int p1start, p1end;
536  p1->GetNonSuperscriptSpan(&p1start, &p1end);
537  strip1 = p1->shallow_copy(p1start, p1end);
538  }
539  WERD_CHOICE_IT w_it(&w->best_choices);
540  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
541  WERD_CHOICE *p2 = w_it.data();
542  WERD_CHOICE strip2(w->uch_set);
543  {
544  int p2start, p2end;
545  p2->GetNonSuperscriptSpan(&p2start, &p2end);
546  strip2 = p2->shallow_copy(p2start, p2end);
547  }
548  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
549  overrides_word1.push_back(p1);
550  overrides_word2.push_back(p2);
551  if (overrides_word1.size() == 1 ||
552  p1->rating() + p2->rating() < best_rating) {
553  best_rating = p1->rating() + p2->rating();
554  best_idx = overrides_word1.size() - 1;
555  }
556  }
557  }
558  }
559  if (!overrides_word1.empty()) {
560  // Excellent, we have some bigram matches.
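561  if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice,
562  *overrides_word1[best_idx]) &&
563  EqualIgnoringCaseAndTerminalPunct(*w->best_choice,
564  *overrides_word2[best_idx])) {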
565  if (tessedit_bigram_debug > 1) {
566  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
567  "model.\n", orig_w1_str.string(), orig_w2_str.string());
568  }
569  continue;
570  }
571  const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
572  const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
573  if (new_w1_str != orig_w1_str) {
574  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
575  }
576  if (new_w2_str != orig_w2_str) {
577  w->ReplaceBestChoice(overrides_word2[best_idx]);
578  }
579  if (tessedit_bigram_debug > 0) {
580  STRING choices_description;
581  int num_bigram_choices
582  = overrides_word1.size() * overrides_word2.size();
583  if (num_bigram_choices == 1) {
584  choices_description = "This was the unique bigram choice.";
585  } else {
586  if (tessedit_bigram_debug > 1) {
587  STRING bigrams_list;
588  const int kMaxChoicesToPrint = 20;
589  for (int i = 0; i < overrides_word1.size() &&
590  i < kMaxChoicesToPrint; i++) {
591  if (i > 0) { bigrams_list += ", "; }
592  WERD_CHOICE *p1 = overrides_word1[i];
593  WERD_CHOICE *p2 = overrides_word2[i];
594  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
595  }
596  choices_description = "There were many choices: {";
597  choices_description += bigrams_list;
598  choices_description += "}";
599  } else {
600  choices_description.add_str_int("There were ", num_bigram_choices);
601  choices_description += " compatible bigrams.";
602  }
603  }
604  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
605  orig_w1_str.string(), orig_w2_str.string(),
606  new_w1_str.string(), new_w2_str.string(),
607  choices_description.string());
608  }
609  }
610  }
611 }
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:799
Definition: strngs.h:45
void add_str_int(const char *str, int number)
Definition: strngs.cpp:377
const STRING & unichar_string() const
Definition: ratngs.h:541
WERD_CHOICE_LIST best_choices
Definition: pageres.h:242
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:805
tesseract::Tesseract * tesseract
Definition: pageres.h:281
repeated character
Definition: werd.h:38
const UNICHARSET * uch_set
Definition: pageres.h:205
float rating() const
Definition: ratngs.h:327
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool empty() const
Definition: genericvector.h:89
int push_back(T object)
Dict & getDict() override
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:397
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:822
int size() const
Definition: genericvector.h:70
void PrintBestChoices() const
Definition: pageres.cpp:721
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:414
WERD * word
Definition: pageres.h:188
void tesseract::Tesseract::blamer_pass ( PAGE_RES * page_res )

Definition at line 711 of file control.cpp.

711  {
712  if (!wordrec_run_blamer) return;
713  PAGE_RES_IT page_res_it(page_res);
714  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
715  page_res_it.forward()) {
716  WERD_RES *word = page_res_it.word();
719  }
720  tprintf("Blame reasons:\n");
721  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
723  static_cast<IncorrectResultReason>(bl)),
724  page_res->blame_reasons[bl]);
725  }
726  if (page_res->misadaption_log.length() > 0) {
727  tprintf("Misadaption log:\n");
728  for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
729  tprintf("%s\n", page_res->misadaption_log[i].string());
730  }
731  }
732 }
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:555
int length() const
Definition: genericvector.h:84
GenericVector< int > blame_reasons
Definition: pageres.h:86
bool wordrec_run_blamer
Definition: wordrec.h:237
GenericVector< STRING > misadaption_log
Definition: pageres.h:91
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool wordrec_debug_blamer
Definition: wordrec.h:236
WERD * word
Definition: pageres.h:188
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:61
BlamerBundle * blamer_bundle
Definition: pageres.h:245
void tesseract::Tesseract::blob_feature_display ( PAGE_RES * page_res,
const TBOX & selection_box
)

Definition at line 941 of file pgedit.cpp.

942  {
943 #ifndef DISABLED_LEGACY_ENGINE
944  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
945  if (it != nullptr) {
946  WERD_RES* word_res = it->word();
947  word_res->x_height = it->row()->row->x_height();
948  word_res->SetupForRecognition(unicharset, this, BestPix(),
949  tessedit_ocr_engine_mode, nullptr,
953  it->row()->row, it->block()->block);
954  TWERD* bln_word = word_res->chopped_word;
955  TBLOB* bln_blob = bln_word->blobs[0];
956  INT_FX_RESULT_STRUCT fx_info;
959  Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features,
960  &cn_features, &fx_info, nullptr);
961  // Display baseline features.
962  ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
964  for (int f = 0; f < bl_features.size(); ++f)
965  RenderIntFeature(bl_win, &bl_features[f], ScrollView::GREEN);
966  bl_win->Update();
967  // Display cn features.
968  ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
970  for (int f = 0; f < cn_features.size(); ++f)
971  RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
972  cn_win->Update();
973 
974  it->DeleteCurrentWord();
975  delete it;
976  }
977 #endif // ndef DISABLED_LEGACY_ENGINE
978 }
BLOCK * block
Definition: pageres.h:116
float x_height() const
Definition: ocrrow.h:64
Definition: blobs.h:397
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:442
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:306
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
ROW_RES * row() const
Definition: pageres.h:758
UNICHARSET unicharset
Definition: ccutil.h:71
BLOCK_RES * block() const
Definition: pageres.h:761
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:987
Definition: blobs.h:263
bool classify_nonlinear_norm
Definition: classify.h:456
static void Update()
Definition: scrollview.cpp:709
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1763
Pix * BestPix() const
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:35
WERD_RES * word() const
Definition: pageres.h:755
TWERD * chopped_word
Definition: pageres.h:214
int size() const
Definition: genericvector.h:70
ROW * row
Definition: pageres.h:142
float x_height
Definition: pageres.h:310
void DeleteCurrentWord()
Definition: pageres.cpp:1487
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1602
bool classify_bln_numeric_mode
Definition: classify.h:540
float tesseract::Tesseract::blob_noise_score ( TBLOB * blob )

Definition at line 787 of file fixspace.cpp.

787  {
788  TBOX box; // BB of outline
789  int16_t outline_count = 0;
790  int16_t max_dimension;
791  int16_t largest_outline_dimension = 0;
792 
793  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) {
794  outline_count++;
795  box = ol->bounding_box();
796  if (box.height() > box.width()) {
797  max_dimension = box.height();
798  } else {
799  max_dimension = box.width();
800  }
801 
802  if (largest_outline_dimension < max_dimension)
803  largest_outline_dimension = max_dimension;
804  }
805 
806  if (outline_count > 5) {
807  // penalise LOTS of blobs
808  largest_outline_dimension *= 2;
809  }
810 
811  box = blob->bounding_box();
812  if (box.bottom() > kBlnBaselineOffset * 4 ||
813  box.top() < kBlnBaselineOffset / 2) {
814  // Lax blob is if high or low
815  largest_outline_dimension /= 2;
816  }
817 
818  return largest_outline_dimension;
819 }
int16_t top() const
Definition: rect.h:58
TESSLINE * next
Definition: blobs.h:260
Definition: rect.h:34
TBOX bounding_box() const
Definition: blobs.cpp:472
TESSLINE * outlines
Definition: blobs.h:379
int16_t height() const
Definition: rect.h:108
const int kBlnBaselineOffset
Definition: normalis.h:25
int16_t width() const
Definition: rect.h:115
int16_t bottom() const
Definition: rect.h:65
void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

break_noisiest_blob_word(): Finds the word containing the blob that looks most like noise, then breaks that word in two, deleting the noise blob.

Definition at line 642 of file fixspace.cpp.

642  {
643  WERD_RES_IT word_it(&words);
644  WERD_RES_IT worst_word_it;
645  float worst_noise_score = 9999;
646  int worst_blob_index = -1; // Noisiest blob of noisiest wd
647  int blob_index; // of wds noisiest blob
648  float noise_score; // of wds noisiest blob
649  WERD_RES *word_res;
650  C_BLOB_IT blob_it;
651  C_BLOB_IT rej_cblob_it;
652  C_BLOB_LIST new_blob_list;
653  C_BLOB_IT new_blob_it;
654  C_BLOB_IT new_rej_cblob_it;
655  WERD *new_word;
656  int16_t start_of_noise_blob;
657  int16_t i;
658 
659  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
660  blob_index = worst_noise_blob(word_it.data(), &noise_score);
661  if (blob_index > -1 && worst_noise_score > noise_score) {
662  worst_noise_score = noise_score;
663  worst_blob_index = blob_index;
664  worst_word_it = word_it;
665  }
666  }
667  if (worst_blob_index < 0) {
668  words.clear(); // signal termination
669  return;
670  }
671 
672  /* Now split the worst_word_it */
673 
674  word_res = worst_word_it.data();
675 
676  /* Move blobs before noise blob to a new bloblist */
677 
678  new_blob_it.set_to_list(&new_blob_list);
679  blob_it.set_to_list(word_res->word->cblob_list());
680  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
681  new_blob_it.add_after_then_move(blob_it.extract());
682  }
683  start_of_noise_blob = blob_it.data()->bounding_box().left();
684  delete blob_it.extract(); // throw out noise blob
685 
686  new_word = new WERD(&new_blob_list, word_res->word);
687  new_word->set_flag(W_EOL, false);
688  word_res->word->set_flag(W_BOL, false);
689  word_res->word->set_blanks(1); // After break
690 
691  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
692  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
693  for (;
694  (!rej_cblob_it.empty() &&
695  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
696  rej_cblob_it.forward()) {
697  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
698  }
699 
700  auto* new_word_res = new WERD_RES(new_word);
701  new_word_res->combination = true;
702  worst_word_it.add_before_then_move(new_word_res);
703 
704  word_res->ClearResults();
705 }
Definition: werd.h:56
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:90
start of line
Definition: werd.h:32
end of line
Definition: werd.h:33
void set_blanks(uint8_t new_blanks)
Definition: werd.h:102
void ClearResults()
Definition: pageres.cpp:1151
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:707
WERD * word
Definition: pageres.h:188
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
SVMenuNode * tesseract::Tesseract::build_menu_new ( )

Definition at line 300 of file pgedit.cpp.

300  {
301  SVMenuNode* parent_menu;
302  auto* root_menu_item = new SVMenuNode();
303 
304  SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
305 
306  modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
307  modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
308  modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
309  modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
310  modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
311  modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
312  modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
313  modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
314 
315  parent_menu = root_menu_item->AddChild("DISPLAY");
316 
317  parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, false);
318  parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, false);
319  parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, false);
320  parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, false);
321  parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, false);
322  parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, true);
323  parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
324  parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
325  parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
326  parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
327  parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
328  parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
329  parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
330  parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
331  parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
332 
333 
334  parent_menu = root_menu_item->AddChild("OTHER");
335 
336  parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
337  parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, false);
338  parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, false);
339  parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, false);
340  parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
341  parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
342 
343  return root_menu_item;
344 }
SVMenuNode * AddChild(const char *txt)
Definition: svmnode.cpp:58
bool tesseract::Tesseract::check_debug_pt ( WERD_RES * word,
int  location 
)

Definition at line 1863 of file control.cpp.

1863  {
1864  bool show_map_detail = false;
1865  int16_t i;
1866 
1867  if (!test_pt)
1868  return false;
1869 
1870  tessedit_rejection_debug.set_value (false);
1871  debug_x_ht_level.set_value(0);
1872 
1873  if (word->word->bounding_box().contains(FCOORD (test_pt_x, test_pt_y))) {
1874  if (location < 0)
1875  return true; // For breakpoint use
1876  tessedit_rejection_debug.set_value(true);
1877  debug_x_ht_level.set_value(2);
1878  tprintf ("\n\nTESTWD::");
1879  switch (location) {
1880  case 0:
1881  tprintf ("classify_word_pass1 start\n");
1882  word->word->print();
1883  break;
1884  case 10:
1885  tprintf ("make_reject_map: initial map");
1886  break;
1887  case 20:
1888  tprintf ("make_reject_map: after NN");
1889  break;
1890  case 30:
1891  tprintf ("classify_word_pass2 - START");
1892  break;
1893  case 40:
1894  tprintf ("classify_word_pass2 - Pre Xht");
1895  break;
1896  case 50:
1897  tprintf ("classify_word_pass2 - END");
1898  show_map_detail = true;
1899  break;
1900  case 60:
1901  tprintf ("fixspace");
1902  break;
1903  case 70:
1904  tprintf ("MM pass START");
1905  break;
1906  case 80:
1907  tprintf ("MM pass END");
1908  break;
1909  case 90:
1910  tprintf ("After Poor quality rejection");
1911  break;
1912  case 100:
1913  tprintf ("unrej_good_quality_words - START");
1914  break;
1915  case 110:
1916  tprintf ("unrej_good_quality_words - END");
1917  break;
1918  case 120:
1919  tprintf ("Write results pass");
1920  show_map_detail = true;
1921  break;
1922  }
1923  if (word->best_choice != nullptr) {
1924  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
1925  word->reject_map.print(debug_fp);
1926  tprintf("\n");
1927  if (show_map_detail) {
1928  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
1929  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1930  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1931  word->reject_map[i].full_print(debug_fp);
1932  }
1933  }
1934  } else {
1935  tprintf("null best choice\n");
1936  }
1937  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1938  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1939  return true;
1940  } else {
1941  return false;
1942  }
1943 }
void full_print(FILE *fp)
Definition: rejctmap.cpp:333
bool done
Definition: pageres.h:297
Definition: points.h:188
FILE * debug_fp
Definition: tessvars.cpp:24
const STRING & unichar_string() const
Definition: ratngs.h:541
bool contains(const FCOORD pt) const
Definition: rect.h:333
void print(FILE *fp)
Definition: rejctmap.cpp:321
bool tess_accepted
Definition: pageres.h:295
REJMAP reject_map
Definition: pageres.h:286
void print()
Definition: werd.cpp:253
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
TBOX bounding_box() const
Definition: werd.cpp:148
WERD_CHOICE * best_choice
Definition: pageres.h:234
WERD * word
Definition: pageres.h:188
void tesseract::Tesseract::classify_word_and_language ( int  pass_n,
PAGE_RES_IT * pr_it,
WordData * word_data
)

Definition at line 1333 of file control.cpp.

1334  {
1335 #ifdef DISABLED_LEGACY_ENGINE
1337 #else
1338  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1340 #endif // def DISABLED_LEGACY_ENGINE
1341 
1342  // Best result so far.
1343  PointerVector<WERD_RES> best_words;
1344  // Points to the best result. May be word or in lang_words.
1345  const WERD_RES* word = word_data->word;
1346  clock_t start_t = clock();
1347  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1348  if (debug) {
1349  tprintf("%s word with lang %s at:",
1350  word->done ? "Already done" : "Processing",
1351  most_recently_used_->lang.string());
1352  word->word->bounding_box().print();
1353  }
1354  if (word->done) {
1355  // If done on pass1, leave it as-is.
1356  if (!word->tess_failed)
1357  most_recently_used_ = word->tesseract;
1358  return;
1359  }
1360  int sub = sub_langs_.size();
1361  if (most_recently_used_ != this) {
1362  // Get the index of the most_recently_used_.
1363  for (sub = 0; sub < sub_langs_.size() &&
1364  most_recently_used_ != sub_langs_[sub]; ++sub) {}
1365  }
1366  most_recently_used_->RetryWithLanguage(
1367  *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1368  Tesseract* best_lang_tess = most_recently_used_;
1369  if (!WordsAcceptable(best_words)) {
1370  // Try all the other languages to see if they are any better.
1371  if (most_recently_used_ != this &&
1372  this->RetryWithLanguage(*word_data, recognizer, debug,
1373  &word_data->lang_words[sub_langs_.size()],
1374  &best_words) > 0) {
1375  best_lang_tess = this;
1376  }
1377  for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1378  ++i) {
1379  if (most_recently_used_ != sub_langs_[i] &&
1380  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
1381  &word_data->lang_words[i],
1382  &best_words) > 0) {
1383  best_lang_tess = sub_langs_[i];
1384  }
1385  }
1386  }
1387  most_recently_used_ = best_lang_tess;
1388  if (!best_words.empty()) {
1389  if (best_words.size() == 1 && !best_words[0]->combination) {
1390  // Move the best single result to the main word.
1391  word_data->word->ConsumeWordResults(best_words[0]);
1392  } else {
1393  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1394  word_data->word = best_words.back();
1395  pr_it->ReplaceCurrentWord(&best_words);
1396  }
1397  ASSERT_HOST(word_data->word->box_word != nullptr);
1398  } else {
1399  tprintf("no best words!!\n");
1400  }
1401  clock_t ocr_t = clock();
1402  if (tessedit_timing_debug) {
1403  tprintf("%s (ocr took %.2f sec)\n",
1404  word_data->word->best_choice->unichar_string().string(),
1405  static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1406  }
1407 }
bool tess_failed
Definition: pageres.h:287
bool done
Definition: pageres.h:297
void print() const
Definition: rect.h:278
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
tesseract::Tesseract * tesseract
Definition: pageres.h:281
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
STRING lang
Definition: ccutil.h:69
TBOX bounding_box() const
Definition: werd.cpp:148
#define ASSERT_HOST(x)
Definition: errcode.h:88
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1415
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:905
WERD * word
Definition: pageres.h:188
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1586
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1380
void tesseract::Tesseract::classify_word_pass1 ( const WordData & word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 1415 of file control.cpp.

1417  {
1418  ROW* row = word_data.row;
1419  BLOCK* block = word_data.block;
1420  prev_word_best_choice_ = word_data.prev_word != nullptr
1421  ? word_data.prev_word->word->best_choice : nullptr;
1422 #ifndef ANDROID_BUILD
1423 #ifdef DISABLED_LEGACY_ENGINE
1425 #else
1428 #endif // def DISABLED_LEGACY_ENGINE
1429  if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1430  LSTMRecognizeWord(*block, row, *in_word, out_words);
1431  if (!out_words->empty())
1432  return; // Successful lstm recognition.
1433  }
1435  // No fallback allowed, so use a fake.
1436  (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1437  return;
1438  }
1439 
1440  #ifndef DISABLED_LEGACY_ENGINE
1441  // Fall back to tesseract for failed words or odd words.
1442  (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
1443  OEM_TESSERACT_ONLY, nullptr,
1446  poly_allow_detailed_fx, row, block);
1447 #endif // ndef DISABLED_LEGACY_ENGINE
1448  }
1449 #endif // ndef ANDROID_BUILD
1450 
1451 #ifndef DISABLED_LEGACY_ENGINE
1452  WERD_RES* word = *in_word;
1453  match_word_pass_n(1, word, row, block);
1454  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1455  word->tess_would_adapt = AdaptableWord(word);
1456  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1457 
1458  if (adapt_ok) {
1459  // Send word to adaptive classifier for training.
1460  word->BestChoiceToCorrectText();
1461  LearnWord(nullptr, word);
1462  // Mark misadaptions if running blamer.
1463  if (word->blamer_bundle != nullptr) {
1466  }
1467  }
1468 
1469  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1471  }
1472 #endif // ndef DISABLED_LEGACY_ENGINE
1473 }
bool tess_failed
Definition: pageres.h:287
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:821
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:34
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:222
const UNICHARSET & GetUnicharset() const
UNICHARSET unicharset
Definition: ccutil.h:71
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1644
repeated character
Definition: werd.h:38
bool tess_would_adapt
Definition: pageres.h:296
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:582
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:72
Pix * BestPix() const
bool empty() const
Definition: genericvector.h:89
void BestChoiceToCorrectText()
Definition: pageres.cpp:927
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:250
bool wordrec_debug_blamer
Definition: wordrec.h:236
Definition: ocrrow.h:36
WERD * word
Definition: pageres.h:188
bool IsAmbiguous()
Definition: pageres.cpp:456
Definition: ocrblock.h:29
bool classify_bln_numeric_mode
Definition: classify.h:540
BlamerBundle * blamer_bundle
Definition: pageres.h:245
void tesseract::Tesseract::classify_word_pass2 ( const WordData & word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1586 of file control.cpp.

1588  {
1589  // Return if we do not want to run Tesseract.
1591  return;
1592  }
1593 #ifndef DISABLED_LEGACY_ENGINE
1594  ROW* row = word_data.row;
1595  BLOCK* block = word_data.block;
1596  WERD_RES* word = *in_word;
1597  prev_word_best_choice_ = word_data.prev_word != nullptr
1598  ? word_data.prev_word->word->best_choice : nullptr;
1599 
1601  check_debug_pt(word, 30);
1602  if (!word->done) {
1603  word->caps_height = 0.0;
1604  if (word->x_height == 0.0f)
1605  word->x_height = row->x_height();
1606  match_word_pass_n(2, word, row, block);
1607  check_debug_pt(word, 40);
1608  }
1609 
1610  SubAndSuperscriptFix(word);
1611 
1612  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1614  block->classify_rotation().y() == 0.0f) {
1615  // Use the tops and bottoms since they are available.
1616  TrainedXheightFix(word, block, row);
1617  }
1618 
1620  }
1621 #ifndef GRAPHICS_DISABLED
1623  if (fx_win == nullptr)
1624  create_fx_win();
1625  clear_fx_win();
1626  word->rebuild_word->plot(fx_win);
1627  TBOX wbox = word->rebuild_word->bounding_box();
1628  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1629  wbox.right(), wbox.bottom());
1631  }
1632 #endif
1634  check_debug_pt(word, 50);
1635 #endif // ndef DISABLED_LEGACY_ENGINE
1636 }
bool tess_failed
Definition: pageres.h:287
int16_t top() const
Definition: rect.h:58
float x_height() const
Definition: ocrrow.h:64
bool done
Definition: pageres.h:297
void clear_fx_win()
Definition: drawfx.cpp:62
Definition: rect.h:34
TWERD * rebuild_word
Definition: pageres.h:259
FCOORD classify_rotation() const
Definition: ocrblock.h:141
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:757
UNICHARSET unicharset
Definition: ccutil.h:71
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1644
void create_fx_win()
Definition: drawfx.cpp:49
void plot(ScrollView *window)
Definition: blobs.cpp:901
repeated character
Definition: werd.h:38
ScrollView * fx_win
Definition: drawfx.cpp:40
bool script_has_xheight() const
Definition: unicharset.h:904
float y() const
Definition: points.h:210
static void Update()
Definition: scrollview.cpp:709
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1863
TBOX bounding_box() const
Definition: blobs.cpp:865
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1499
bool top_bottom_useful() const
Definition: unicharset.h:537
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:30
int16_t left() const
Definition: rect.h:72
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
float caps_height
Definition: pageres.h:311
Definition: ocrrow.h:36
float x_height
Definition: pageres.h:310
WERD * word
Definition: pageres.h:188
Definition: ocrblock.h:29
#define SUBLOC_NORM
Definition: errcode.h:58
float tesseract::Tesseract::ClassifyBlobAsWord ( int  pass_n,
PAGE_RES_IT * pr_it,
C_BLOB * blob,
STRING * best_str,
float *  c2 
)

Definition at line 1282 of file control.cpp.

1283  {
1284 #ifndef DISABLED_LEGACY_ENGINE
1285  WERD* real_word = pr_it->word()->word;
1286  WERD* word = real_word->ConstructFromSingleBlob(
1287  real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1288  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1289  // Get a new iterator that points to the new word.
1290  PAGE_RES_IT it(pr_it->page_res);
1291  while (it.word() != word_res && it.word() != nullptr) it.forward();
1292  ASSERT_HOST(it.word() == word_res);
1293  WordData wd(it);
1294  // Force full initialization.
1295  SetupWordPassN(1, &wd);
1296  classify_word_and_language(pass_n, &it, &wd);
1297  if (debug_noise_removal) {
1298  if (wd.word->raw_choice != nullptr) {
1299  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1300  wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1301  wd.word->raw_choice->max_x_height());
1302  } else {
1303  tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1304  wd.row->x_height());
1305  }
1306  }
1307  float cert = 0.0f;
1308  if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1309  cert = wd.word->raw_choice->certainty();
1310  float rat = wd.word->raw_choice->rating();
1311  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1312  *best_str = wd.word->raw_choice->unichar_string();
1313  } else {
1314  *c2 = 0.0f;
1315  *best_str = "";
1316  }
1317  it.DeleteCurrentWord();
1318  pr_it->ResetWordIterator();
1319  return cert;
1320 #else
1321  return 0.1;
1322 #endif // ndef DISABLED_LEGACY_ENGINE
1323 }
Definition: werd.h:56
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1260
void ResetWordIterator()
Definition: pageres.cpp:1570
start of line
Definition: werd.h:32
end of line
Definition: werd.h:33
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
WERD_RES * word() const
Definition: pageres.h:755
PAGE_RES * page_res
Definition: pageres.h:678
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:125
#define ASSERT_HOST(x)
Definition: errcode.h:88
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:119
float x_height
Definition: pageres.h:310
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1333
WERD * word
Definition: pageres.h:188
float tesseract::Tesseract::ClassifyBlobPlusOutlines ( const GenericVector< bool > &  ok_outlines,
const GenericVector< C_OUTLINE * > &  outlines,
int  pass_n,
PAGE_RES_IT * pr_it,
C_BLOB * blob,
STRING * best_str
)

Definition at line 1234 of file control.cpp.

1237  {
1238 #ifndef DISABLED_LEGACY_ENGINE
1239  C_OUTLINE_IT ol_it;
1240  C_OUTLINE* first_to_keep = nullptr;
1241  C_BLOB* local_blob = nullptr;
1242  if (blob != nullptr) {
1243  // Add the required outlines to the blob.
1244  ol_it.set_to_list(blob->out_list());
1245  first_to_keep = ol_it.data();
1246  }
1247  for (int i = 0; i < ok_outlines.size(); ++i) {
1248  if (ok_outlines[i]) {
1249  // This outline is to be added.
1250  if (blob == nullptr) {
1251  local_blob = new C_BLOB(outlines[i]);
1252  blob = local_blob;
1253  ol_it.set_to_list(blob->out_list());
1254  } else {
1255  ol_it.add_before_stay_put(outlines[i]);
1256  }
1257  }
1258  }
1259  float c2;
1260  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1261  ol_it.move_to_first();
1262  if (first_to_keep == nullptr) {
1263  // We created blob. Empty its outlines and delete it.
1264  for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1265  delete local_blob;
1266  cert = -c2;
1267  } else {
1268  // Remove the outlines that we put in.
1269  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1270  ol_it.extract();
1271  }
1272  }
1273  return cert;
1274 #else
1275  return 0.1;
1276 #endif // ndef DISABLED_LEGACY_ENGINE
1277 }
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1282
int size() const
Definition: genericvector.h:70
void tesseract::Tesseract::Clear ( )

Definition at line 577 of file tesseractclass.cpp.

577  {
578  STRING debug_name = imagebasename + "_debug.pdf";
579  pixa_debug_.WritePDF(debug_name.string());
580  pixDestroy(&pix_binary_);
581  pixDestroy(&pix_grey_);
582  pixDestroy(&pix_thresholds_);
583  pixDestroy(&scaled_color_);
584  deskew_ = FCOORD(1.0f, 0.0f);
585  reskew_ = FCOORD(1.0f, 0.0f);
586  splitter_.Clear();
587  scaled_factor_ = -1;
588  for (int i = 0; i < sub_langs_.size(); ++i)
589  sub_langs_[i]->Clear();
590 }
Definition: strngs.h:45
Definition: points.h:188
const char * string() const
Definition: strngs.cpp:194
void WritePDF(const char *filename)
Definition: debugpixa.h:36
STRING imagebasename
Definition: ccutil.h:68
float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES * word_res,
float *  baseline_shift 
)

Definition at line 102 of file fixxht.cpp.

103  {
104  STATS top_stats(0, UINT8_MAX);
105  STATS shift_stats(-UINT8_MAX, UINT8_MAX);
106  int bottom_shift = 0;
107  int num_blobs = word_res->rebuild_word->NumBlobs();
108  do {
109  top_stats.clear();
110  shift_stats.clear();
111  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
112  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
113  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
114  if (unicharset.get_isalpha(class_id) ||
115  unicharset.get_isdigit(class_id)) {
116  int top = blob->bounding_box().top() + bottom_shift;
117  // Clip the top to the limit of normalized feature space.
118  if (top >= INT_FEAT_RANGE)
119  top = INT_FEAT_RANGE - 1;
120  int bottom = blob->bounding_box().bottom() + bottom_shift;
121  int min_bottom, max_bottom, min_top, max_top;
122  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
123  &min_top, &max_top);
124  // Chars with a wild top range would mess up the result so ignore them.
125  if (max_top - min_top > kMaxCharTopRange)
126  continue;
127  int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
128  top - (max_top + x_ht_acceptance_tolerance));
129  int height = top - kBlnBaselineOffset;
130  if (debug_x_ht_level >= 2) {
131  tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
132  unicharset.id_to_unichar(class_id),
133  height, min_bottom, max_bottom, min_top, max_top,
134  bottom, top);
135  }
136  // Use only chars that fit in the expected bottom range, and where
137  // the range of tops is sensibly near the xheight.
138  if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
139  bottom - x_ht_acceptance_tolerance <= max_bottom &&
140  min_top > kBlnBaselineOffset &&
141  max_top - kBlnBaselineOffset >= kBlnXHeight &&
142  misfit_dist > 0) {
143  // Compute the x-height position using proportionality between the
144  // actual height and expected height.
145  int min_xht = DivRounded(height * kBlnXHeight,
146  max_top - kBlnBaselineOffset);
147  int max_xht = DivRounded(height * kBlnXHeight,
148  min_top - kBlnBaselineOffset);
149  if (debug_x_ht_level >= 2) {
150  tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
151  }
152  // The range of expected heights gets a vote equal to the distance
153  // of the actual top from the expected top.
154  for (int y = min_xht; y <= max_xht; ++y)
155  top_stats.add(y, misfit_dist);
156  } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
157  bottom - x_ht_acceptance_tolerance > max_bottom) &&
158  bottom_shift == 0) {
159  // Get the range of required bottom shift.
160  int min_shift = min_bottom - bottom;
161  int max_shift = max_bottom - bottom;
162  if (debug_x_ht_level >= 2) {
163  tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
164  }
165  // The range of expected shifts gets a vote equal to the min distance
166  // of the actual bottom from the expected bottom, spread over the
167  // range of its acceptance.
168  int misfit_weight = abs(min_shift);
169  if (max_shift > min_shift)
170  misfit_weight /= max_shift - min_shift;
171  for (int y = min_shift; y <= max_shift; ++y)
172  shift_stats.add(y, misfit_weight);
173  } else {
174  if (bottom_shift == 0) {
175  // Things with bottoms that are already ok need to say so, on the
176  // 1st iteration only.
177  shift_stats.add(0, kBlnBaselineOffset);
178  }
179  if (debug_x_ht_level >= 2) {
180  tprintf(" already OK\n");
181  }
182  }
183  }
184  }
185  if (shift_stats.get_total() > top_stats.get_total()) {
186  bottom_shift = IntCastRounded(shift_stats.median());
187  if (debug_x_ht_level >= 2) {
188  tprintf("Applying bottom shift=%d\n", bottom_shift);
189  }
190  }
191  } while (bottom_shift != 0 &&
192  top_stats.get_total() < shift_stats.get_total());
193  // Baseline shift is opposite sign to the bottom shift.
194  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
195  if (debug_x_ht_level >= 2) {
196  tprintf("baseline shift=%g\n", *baseline_shift);
197  }
198  if (top_stats.get_total() == 0)
199  return bottom_shift != 0 ? word_res->x_height : 0.0f;
200  // The new xheight is just the median vote, which is then scaled out
201  // of BLN space back to pixel space to get the x-height in pixel space.
202  float new_xht = top_stats.median();
203  if (debug_x_ht_level >= 2) {
204  tprintf("Median xht=%f\n", new_xht);
205  tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
206  new_xht, new_xht / word_res->denorm.y_scale());
207  }
208  // The xheight must change by at least x_ht_min_change to be used.
209  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
210  return new_xht / word_res->denorm.y_scale();
211  else
212  return bottom_shift != 0 ? word_res->x_height : 0.0f;
213 }
int IntCastRounded(double x)
Definition: helpers.h:175
const int kBlnXHeight
Definition: normalis.h:24
int16_t top() const
Definition: rect.h:58
TWERD * rebuild_word
Definition: pageres.h:259
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
TBOX bounding_box() const
Definition: blobs.cpp:472
UNICHARSET unicharset
Definition: ccutil.h:71
Definition: blobs.h:263
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
#define INT_FEAT_RANGE
Definition: float2int.h:27
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
const int kBlnBaselineOffset
Definition: normalis.h:25
float y_scale() const
Definition: normalis.h:270
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
const int kMaxCharTopRange
Definition: fixxht.cpp:67
int16_t bottom() const
Definition: rect.h:65
int NumBlobs() const
Definition: blobs.h:427
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
int UNICHAR_ID
Definition: unichar.h:34
Definition: statistc.h:31
float x_height
Definition: pageres.h:310
DENORM denorm
Definition: pageres.h:203
int DivRounded(int a, int b)
Definition: helpers.h:167
void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES *  word_res)

Definition at line 659 of file docqual.cpp.

659  {
660  int i;
661  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
662  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
663  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
664  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
665  for (i = 0; i < word_res->reject_map.length(); ++i) {
666  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
667  word_res->best_choice->set_unichar_id(unichar_dash, i);
668  if (word_res->reject_map[i].accepted ())
669  word_res->reject_map[i].setrej_unlv_rej ();
670  }
671  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
672  word_res->best_choice->set_unichar_id(unichar_space, i);
673  if (word_res->reject_map[i].accepted ())
674  word_res->reject_map[i].setrej_unlv_rej ();
675  }
676  }
677 }
int32_t length() const
Definition: rejctmap.h:223
REJMAP reject_map
Definition: pageres.h:286
const UNICHARSET * uch_set
Definition: pageres.h:205
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
int UNICHAR_ID
Definition: unichar.h:34
bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
GenericVector< UNICHAR_ID > *  class_ids 
)

Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.

Returns
false if an invalid UNICHAR_ID is encountered.

Definition at line 538 of file applybox.cpp.

539  {
540  for (int step = 0; *utf8 != '\0'; utf8 += step) {
541  const char* next_space = strchr(utf8, ' ');
542  if (next_space == nullptr)
543  next_space = utf8 + strlen(utf8);
544  step = next_space - utf8;
545  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
546  if (class_id == INVALID_UNICHAR_ID) {
547  return false;
548  }
549  while (utf8[step] == ' ')
550  ++step;
551  class_ids->push_back(class_id);
552  }
553  return true;
554 }
UNICHARSET unicharset
Definition: ccutil.h:71
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
int push_back(T object)
int UNICHAR_ID
Definition: unichar.h:34
void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES *  page_res)

Creates a fake best_choice entry in each WERD_RES with the correct text.

Definition at line 780 of file applybox.cpp.

780  {
781  PAGE_RES_IT pr_it(page_res);
782  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
783  word_res = pr_it.forward()) {
784  auto* choice = new WERD_CHOICE(word_res->uch_set,
785  word_res->correct_text.size());
786  for (int i = 0; i < word_res->correct_text.size(); ++i) {
787  // The part before the first space is the real ground truth, and the
788  // rest is the bounding box location and page number.
789  GenericVector<STRING> tokens;
790  word_res->correct_text[i].split(' ', &tokens);
791  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
792  choice->append_unichar_id_space_allocated(char_id,
793  word_res->best_state[i],
794  0.0f, 0.0f);
795  }
796  word_res->ClearWordChoices();
797  word_res->LogNewRawChoice(choice);
798  word_res->LogNewCookedChoice(1, false, choice);
799  }
800 }
UNICHARSET unicharset
Definition: ccutil.h:71
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
int UNICHAR_ID
Definition: unichar.h:34
WERD * word
Definition: pageres.h:188
int16_t tesseract::Tesseract::count_alphanums ( const WERD_CHOICE &  word)

Definition at line 376 of file output.cpp.

376  {
377  int count = 0;
378  for (int i = 0; i < word.length(); ++i) {
379  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
380  word.unicharset()->get_isdigit(word.unichar_id(i)))
381  count++;
382  }
383  return count;
384 }
int length() const
Definition: ratngs.h:303
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int count(LIST var_list)
Definition: oldlist.cpp:96
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
int16_t tesseract::Tesseract::count_alphanums ( WERD_RES *  word)

Definition at line 559 of file reject.cpp.

559  {
560  int count = 0;
561  const WERD_CHOICE *best_choice = word_res->best_choice;
562  for (int i = 0; i < word_res->reject_map.length(); ++i) {
563  if ((word_res->reject_map[i].accepted()) &&
564  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
565  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
566  count++;
567  }
568  }
569  return count;
570 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int count(LIST var_list)
Definition: oldlist.cpp:96
int16_t tesseract::Tesseract::count_alphas ( const WERD_CHOICE &  word)

Definition at line 366 of file output.cpp.

366  {
367  int count = 0;
368  for (int i = 0; i < word.length(); ++i) {
369  if (word.unicharset()->get_isalpha(word.unichar_id(i)))
370  count++;
371  }
372  return count;
373 }
int length() const
Definition: ratngs.h:303
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int count(LIST var_list)
Definition: oldlist.cpp:96
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
int16_t tesseract::Tesseract::count_outline_errs ( char  c,
int16_t  outline_count 
)

Definition at line 126 of file docqual.cpp.

126  {
127  int expected_outline_count;
128 
129  if (STRING (outlines_odd).contains (c))
130  return 0; // Don't use this char
131  else if (STRING (outlines_2).contains (c))
132  expected_outline_count = 2;
133  else
134  expected_outline_count = 1;
135  return abs (outline_count - expected_outline_count);
136 }
Definition: strngs.h:45
int tesseract::Tesseract::CountMisfitTops ( WERD_RES *  word_res)

Definition at line 70 of file fixxht.cpp.

70  {
71  int bad_blobs = 0;
72  int num_blobs = word_res->rebuild_word->NumBlobs();
73  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
74  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
75  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
76  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
77  int top = blob->bounding_box().top();
78  if (top >= INT_FEAT_RANGE)
79  top = INT_FEAT_RANGE - 1;
80  int min_bottom, max_bottom, min_top, max_top;
81  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
82  &min_top, &max_top);
83  if (max_top - min_top > kMaxCharTopRange)
84  continue;
85  bool bad = top < min_top - x_ht_acceptance_tolerance ||
86  top > max_top + x_ht_acceptance_tolerance;
87  if (bad)
88  ++bad_blobs;
89  if (debug_x_ht_level >= 1) {
90  tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
91  unicharset.id_to_unichar(class_id),
92  bad ? "Misfit" : "OK", top, min_top, max_top,
93  static_cast<int>(x_ht_acceptance_tolerance));
94  }
95  }
96  }
97  return bad_blobs;
98 }
int16_t top() const
Definition: rect.h:58
TWERD * rebuild_word
Definition: pageres.h:259
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
TBOX bounding_box() const
Definition: blobs.cpp:472
UNICHARSET unicharset
Definition: ccutil.h:71
Definition: blobs.h:263
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
#define INT_FEAT_RANGE
Definition: float2int.h:27
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
const int kMaxCharTopRange
Definition: fixxht.cpp:67
int NumBlobs() const
Definition: blobs.h:427
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
int UNICHAR_ID
Definition: unichar.h:34
void tesseract::Tesseract::debug_word ( PAGE_RES *  page_res,
const TBOX &  selection_box 
)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 667 of file pgedit.cpp.

667  {
668 #ifndef DISABLED_LEGACY_ENGINE
670 #endif
671  recog_all_words(page_res, nullptr, &selection_box, word_config_.string(), 0);
672 }
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:303
const char * string() const
Definition: strngs.cpp:194
void tesseract::Tesseract::dictionary_correction_pass ( PAGE_RES *  page_res)

Definition at line 2112 of file control.cpp.

2112  {
2113  PAGE_RES_IT word_it(page_res);
2114  for (WERD_RES* word = word_it.word(); word != nullptr;
2115  word = word_it.forward()) {
2116  if (word->best_choices.singleton())
2117  continue; // There are no alternates.
2118 
2119  const WERD_CHOICE* best = word->best_choice;
2120  if (word->tesseract->getDict().valid_word(*best) != 0)
2121  continue; // The best choice is in the dictionary.
2122 
2123  WERD_CHOICE_IT choice_it(&word->best_choices);
2124  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2125  choice_it.forward()) {
2126  WERD_CHOICE* alternate = choice_it.data();
2127  if (word->tesseract->getDict().valid_word(*alternate)) {
2128  // The alternate choice is in the dictionary.
2129  if (tessedit_bigram_debug) {
2130  tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2131  best->unichar_string().string(),
2132  alternate->unichar_string().string());
2133  }
2134  // Replace the 'best' choice with a better choice.
2135  word->ReplaceBestChoice(alternate);
2136  break;
2137  }
2138  }
2139  }
2140 }
const STRING & unichar_string() const
Definition: ratngs.h:541
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
WERD * word
Definition: pageres.h:188
bool tesseract::Tesseract::digit_or_numeric_punct ( WERD_RES *  word,
int  char_position 
)

Definition at line 370 of file fixspace.cpp.

370  {
371  int i;
372  int offset;
373 
374  for (i = 0, offset = 0; i < char_position;
375  offset += word->best_choice->unichar_lengths()[i++]);
376  return (
377  word->uch_set->get_isdigit(
378  word->best_choice->unichar_string().string() + offset,
379  word->best_choice->unichar_lengths()[i]) ||
380  (word->best_choice->permuter() == NUMBER_PERM &&
382  word->best_choice->unichar_string().string()[offset])));
383 }
Definition: strngs.h:45
const STRING & unichar_string() const
Definition: ratngs.h:541
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
const UNICHARSET * uch_set
Definition: pageres.h:205
const char * string() const
Definition: strngs.cpp:194
bool contains(char c) const
Definition: strngs.cpp:185
WERD_CHOICE * best_choice
Definition: pageres.h:234
const STRING & unichar_lengths() const
Definition: ratngs.h:548
uint8_t permuter() const
Definition: ratngs.h:346
void tesseract::Tesseract::do_re_display ( bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_painter)

do_re_display()

Redisplay page

Definition at line 351 of file pgedit.cpp.

352  {
353  int block_count = 1;
354 
355  image_win->Clear();
356  if (display_image) {
357  image_win->Image(pix_binary_, 0, 0);
358  }
359 
360  image_win->Brush(ScrollView::NONE);
361  PAGE_RES_IT pr_it(current_page_res);
362  for (WERD_RES* word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
363  (this->*word_painter)(&pr_it);
364  if (display_baselines && pr_it.row() != pr_it.prev_row())
365  pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
366  if (display_blocks && pr_it.block() != pr_it.prev_block())
367  pr_it.block()->block->pdblk.plot(image_win, block_count++, ScrollView::RED);
368  }
369  image_win->Update();
370 }
void Image(struct Pix *image, int x_pos, int y_pos)
Definition: scrollview.cpp:765
void Brush(Color color)
Definition: scrollview.cpp:725
static void Update()
Definition: scrollview.cpp:709
void Clear()
Definition: scrollview.cpp:589
WERD * word
Definition: pageres.h:188
void tesseract::Tesseract::doc_and_block_rejection ( PAGE_RES_IT &  page_res_it,
bool  good_quality_doc 
)

Definition at line 232 of file docqual.cpp.

234  {
235  int16_t block_no = 0;
236  int16_t row_no = 0;
237  BLOCK_RES *current_block;
238  ROW_RES *current_row;
239 
240  bool rej_word;
241  bool prev_word_rejected;
242  int16_t char_quality = 0;
243  int16_t accepted_char_quality;
244 
245  if (page_res_it.page_res->rej_count * 100.0 /
247  reject_whole_page(page_res_it);
249  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
250  page_res_it.page_res->char_count,
251  page_res_it.page_res->rej_count);
252  }
253  } else {
255  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
256  page_res_it.page_res->char_count,
257  page_res_it.page_res->rej_count);
258  }
259 
260  /* Walk blocks testing for block rejection */
261 
262  page_res_it.restart_page();
263  WERD_RES* word;
264  while ((word = page_res_it.word()) != nullptr) {
265  current_block = page_res_it.block();
266  block_no = current_block->block->pdblk.index();
267  if (current_block->char_count > 0 &&
268  (current_block->rej_count * 100.0 / current_block->char_count) >
271  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
272  block_no, current_block->char_count,
273  current_block->rej_count);
274  }
275  prev_word_rejected = false;
276  while ((word = page_res_it.word()) != nullptr &&
277  (page_res_it.block() == current_block)) {
279  rej_word = word->reject_map.reject_count() > 0 ||
281  if (rej_word && tessedit_dont_blkrej_good_wds &&
284  *word->uch_set,
285  word->best_choice->unichar_string().string(),
286  word->best_choice->unichar_lengths().string()) !=
287  AC_UNACCEPTABLE) {
288  word_char_quality(word, page_res_it.row()->row,
289  &char_quality,
290  &accepted_char_quality);
291  rej_word = char_quality != word->reject_map.length();
292  }
293  } else {
294  rej_word = true;
295  }
296  if (rej_word) {
297  /*
298  Reject spacing if both current and prev words are rejected.
299  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
300  generated more space errors.
301  */
303  prev_word_rejected &&
304  page_res_it.prev_row() == page_res_it.row() &&
305  word->word->space() == 1)
306  word->reject_spaces = true;
308  }
309  prev_word_rejected = rej_word;
310  page_res_it.forward();
311  }
312  } else {
314  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
315  block_no, page_res_it.block()->char_count,
316  page_res_it.block()->rej_count);
317  }
318 
319  /* Walk rows in block testing for row rejection */
320  row_no = 0;
321  while (page_res_it.word() != nullptr &&
322  page_res_it.block() == current_block) {
323  current_row = page_res_it.row();
324  row_no++;
325  /* Reject whole row if:
326  fraction of chars on row which are rejected exceed a limit AND
327  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
328  limit
329  */
330  if (current_row->char_count > 0 &&
331  (current_row->rej_count * 100.0 / current_row->char_count) >
333  (current_row->whole_word_rej_count * 100.0 /
334  current_row->rej_count) <
337  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
338  row_no, current_row->char_count,
339  current_row->rej_count);
340  }
341  prev_word_rejected = false;
342  while ((word = page_res_it.word()) != nullptr &&
343  page_res_it.row () == current_row) {
344  /* Preserve words on good docs unless they are mostly rejected*/
345  if (!tessedit_row_rej_good_docs && good_quality_doc) {
346  rej_word = word->reject_map.reject_count() /
347  static_cast<float>(word->reject_map.length()) >
350  /* Preserve perfect words anyway */
351  rej_word = word->reject_map.reject_count() > 0 ||
353  if (rej_word && tessedit_dont_rowrej_good_wds &&
356  word->best_choice->unichar_string().string(),
357  word->best_choice->unichar_lengths().string()) !=
358  AC_UNACCEPTABLE) {
359  word_char_quality(word, page_res_it.row()->row,
360  &char_quality,
361  &accepted_char_quality);
362  rej_word = char_quality != word->reject_map.length();
363  }
364  } else {
365  rej_word = true;
366  }
367  if (rej_word) {
368  /*
369  Reject spacing if both current and prev words are rejected.
370  NOTE - this is NOT restricted to FUZZY spaces. - When tried
371  this generated more space errors.
372  */
374  prev_word_rejected &&
375  page_res_it.prev_row() == page_res_it.row() &&
376  word->word->space () == 1)
377  word->reject_spaces = true;
379  }
380  prev_word_rejected = rej_word;
381  page_res_it.forward();
382  }
383  } else {
385  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
386  row_no, current_row->char_count, current_row->rej_count);
387  }
388  while (page_res_it.word() != nullptr &&
389  page_res_it.row() == current_row)
390  page_res_it.forward();
391  }
392  }
393  }
394  }
395  }
396 }
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:92
BLOCK * block
Definition: pageres.h:116
double tessedit_reject_row_percent
WERD_RES * restart_page()
Definition: pageres.h:702
void rej_word_row_rej()
Definition: rejctmap.cpp:442
Unacceptable word.
Definition: control.h:30
int32_t length() const
Definition: rejctmap.h:223
int32_t char_count
Definition: pageres.h:143
ROW_RES * prev_row() const
Definition: pageres.h:749
const STRING & unichar_string() const
Definition: ratngs.h:541
ROW_RES * row() const
Definition: pageres.h:758
int32_t whole_word_rej_count
Definition: pageres.h:145
bool tessedit_preserve_blk_rej_perfect_wds
BLOCK_RES * block() const
Definition: pageres.h:761
bool tessedit_preserve_row_rej_perfect_wds
REJMAP reject_map
Definition: pageres.h:286
const UNICHARSET * uch_set
Definition: pageres.h:205
int32_t rej_count
Definition: pageres.h:118
double tessedit_good_doc_still_rowrej_wd
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1759
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int32_t char_count
Definition: pageres.h:78
WERD_RES * word() const
Definition: pageres.h:755
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:406
PAGE_RES * page_res
Definition: pageres.h:678
WERD_RES * forward()
Definition: pageres.h:735
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:191
uint8_t space()
Definition: werd.h:99
int32_t rej_count
Definition: pageres.h:79
WERD_CHOICE * best_choice
Definition: pageres.h:234
int16_t reject_count()
Definition: rejctmap.h:229
double tessedit_reject_doc_percent
double tessedit_whole_wd_rej_row_percent
int32_t char_count
Definition: pageres.h:117
int32_t rej_count
Definition: pageres.h:144
const STRING & unichar_lengths() const
Definition: ratngs.h:548
ROW * row
Definition: pageres.h:142
int index() const
Definition: pdblock.h:68
WERD * word
Definition: pageres.h:188
bool reject_spaces
Definition: pageres.h:335
void rej_word_block_rej()
Definition: rejctmap.cpp:433
double tessedit_reject_block_percent
void tesseract::Tesseract::dont_allow_1Il ( WERD_RES *  word)

Definition at line 527 of file reject.cpp.

527  {
528  int i = 0;
529  int offset;
530  int word_len = word->reject_map.length();
531  const char *s = word->best_choice->unichar_string().string();
532  const char *lengths = word->best_choice->unichar_lengths().string();
533  bool accepted_1Il = false;
534 
535  for (i = 0, offset = 0; i < word_len;
536  offset += word->best_choice->unichar_lengths()[i++]) {
537  if (word->reject_map[i].accepted()) {
538  if (STRING(conflict_set_I_l_1).contains(s[offset])) {
539  accepted_1Il = true;
540  } else {
541  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
542  word->uch_set->get_isdigit(s + offset, lengths[i]))
543  return; // >=1 non 1Il ch accepted
544  }
545  }
546  }
547  if (!accepted_1Il)
548  return; //Nothing to worry about
549 
550  for (i = 0, offset = 0; i < word_len;
551  offset += word->best_choice->unichar_lengths()[i++]) {
552  if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
553  word->reject_map[i].accepted())
554  word->reject_map[i].setrej_postNN_1Il();
555  }
556 }
int32_t length() const
Definition: rejctmap.h:223
Definition: strngs.h:45
const STRING & unichar_string() const
Definition: ratngs.h:541
REJMAP reject_map
Definition: pageres.h:286
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
const UNICHARSET * uch_set
Definition: pageres.h:205
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
const char * string() const
Definition: strngs.cpp:194
bool contains(char c) const
Definition: strngs.cpp:185
WERD_CHOICE * best_choice
Definition: pageres.h:234
const STRING & unichar_lengths() const
Definition: ratngs.h:548
void tesseract::Tesseract::dump_words ( WERD_RES_LIST &  perm,
int16_t  score,
int16_t  mode,
bool  improved 
)

Definition at line 476 of file fixspace.cpp.

477  {
478  WERD_RES_IT word_res_it(&perm);
479 
480  if (debug_fix_space_level > 0) {
481  if (mode == 1) {
482  stats_.dump_words_str = "";
483  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
484  word_res_it.forward()) {
485  if (!word_res_it.data()->part_of_combo) {
486  stats_.dump_words_str +=
487  word_res_it.data()->best_choice->unichar_string();
488  stats_.dump_words_str += ' ';
489  }
490  }
491  }
492 
493  if (debug_fix_space_level > 1) {
494  switch (mode) {
495  case 1:
496  tprintf("EXTRACTED (%d): \"", score);
497  break;
498  case 2:
499  tprintf("TESTED (%d): \"", score);
500  break;
501  case 3:
502  tprintf("RETURNED (%d): \"", score);
503  break;
504  }
505 
506  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
507  word_res_it.forward()) {
508  if (!word_res_it.data()->part_of_combo) {
509  tprintf("%s/%1d ",
510  word_res_it.data()->best_choice->unichar_string().string(),
511  static_cast<int>(word_res_it.data()->best_choice->permuter()));
512  }
513  }
514  tprintf("\"\n");
515  } else if (improved) {
516  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
517  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
518  word_res_it.forward()) {
519  if (!word_res_it.data()->part_of_combo) {
520  tprintf("%s/%1d ",
521  word_res_it.data()->best_choice->unichar_string().string(),
522  static_cast<int>(word_res_it.data()->best_choice->permuter()));
523  }
524  }
525  tprintf("\"\n");
526  }
527  }
528 }
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void tesseract::Tesseract::end_tesseract ( )

Definition at line 463 of file tessedit.cpp.

463 { end_recog(); }
int16_t tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 266 of file fixspace.cpp.

266  {
267  WERD_RES_IT word_res_it(&word_res_list);
268  int16_t total_score = 0;
269  int16_t word_count = 0;
270  int16_t done_word_count = 0;
271  int16_t word_len;
272  int16_t i;
273  int16_t offset;
274  WERD_RES *word; // current word
275  int16_t prev_word_score = 0;
276  bool prev_word_done = false;
277  bool prev_char_1 = false; // prev ch a "1/I/l"?
278  bool prev_char_digit = false; // prev ch 2..9 or 0
279  bool current_char_1 = false;
280  bool current_word_ok_so_far;
281  STRING punct_chars = "!\"`',.:;";
282  bool prev_char_punct = false;
283  bool current_char_punct = false;
284  bool word_done = false;
285 
286  do {
287  word = word_res_it.data();
288  word_done = fixspace_thinks_word_done(word);
289  word_count++;
290  if (word->tess_failed) {
291  total_score += prev_word_score;
292  if (prev_word_done)
293  done_word_count++;
294  prev_word_score = 0;
295  prev_char_1 = false;
296  prev_char_digit = false;
297  prev_word_done = false;
298  } else {
299  /*
300  Can we add the prev word score and potentially count this word?
301  Yes IF it didn't end in a 1 when the first char of this word is a digit
302  AND it didn't end in a digit when the first char of this word is a 1
303  */
304  word_len = word->reject_map.length();
305  current_word_ok_so_far = false;
306  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
307  (prev_char_digit && (
308  (word_done &&
309  word->best_choice->unichar_lengths().string()[0] == 1 &&
310  word->best_choice->unichar_string()[0] == '1') ||
311  (!word_done && STRING(conflict_set_I_l_1).contains(
312  word->best_choice->unichar_string()[0])))))) {
313  total_score += prev_word_score;
314  if (prev_word_done)
315  done_word_count++;
316  current_word_ok_so_far = word_done;
317  }
318 
319  if (current_word_ok_so_far) {
320  prev_word_done = true;
321  prev_word_score = word_len;
322  } else {
323  prev_word_done = false;
324  prev_word_score = 0;
325  }
326 
327  /* Add 1 to total score for every joined 1 regardless of context and
328  rejtn */
329  for (i = 0, prev_char_1 = false; i < word_len; i++) {
330  current_char_1 = word->best_choice->unichar_string()[i] == '1';
331  if (prev_char_1 || (current_char_1 && (i > 0)))
332  total_score++;
333  prev_char_1 = current_char_1;
334  }
335 
336  /* Add 1 to total score for every joined punctuation regardless of context
337  and rejtn */
339  for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
340  offset += word->best_choice->unichar_lengths()[i++]) {
341  current_char_punct =
342  punct_chars.contains(word->best_choice->unichar_string()[offset]);
343  if (prev_char_punct || (current_char_punct && i > 0))
344  total_score++;
345  prev_char_punct = current_char_punct;
346  }
347  }
348  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
349  for (i = 0, offset = 0; i < word_len - 1;
350  offset += word->best_choice->unichar_lengths()[i++]);
351  prev_char_1 =
352  ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
353  || (!word_done && STRING(conflict_set_I_l_1).contains(
354  word->best_choice->unichar_string()[offset])));
355  }
356  /* Find next word */
357  do {
358  word_res_it.forward();
359  } while (word_res_it.data()->part_of_combo);
360  } while (!word_res_it.at_first());
361  total_score += prev_word_score;
362  if (prev_word_done)
363  done_word_count++;
364  if (done_word_count == word_count)
365  return PERFECT_WERDS;
366  else
367  return total_score;
368 }
bool tess_failed
Definition: pageres.h:287
int32_t length() const
Definition: rejctmap.h:223
Definition: strngs.h:45
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:370
const STRING & unichar_string() const
Definition: ratngs.h:541
#define PERFECT_WERDS
Definition: fixspace.cpp:44
REJMAP reject_map
Definition: pageres.h:286
const char * string() const
Definition: strngs.cpp:194
bool contains(char c) const
Definition: strngs.cpp:185
WERD_CHOICE * best_choice
Definition: pageres.h:234
const STRING & unichar_lengths() const
Definition: ratngs.h:548
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:530
int16_t tesseract::Tesseract::failure_count ( WERD_RES * word)

Definition at line 968 of file docqual.cpp.

968  {
969  const char *str = word->best_choice->unichar_string().string();
970  int tess_rejs = 0;
971 
972  for (; *str != '\0'; str++) {
973  if (*str == ' ')
974  tess_rejs++;
975  }
976  return tess_rejs;
977 }
const STRING & unichar_string() const
Definition: ratngs.h:541
const char * string() const
Definition: strngs.cpp:194
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool tesseract::Tesseract::FindSegmentation ( const GenericVector< UNICHAR_ID > &  target_text,
WERD_RES * word_res 
)

Resegments the word to achieve the target_text from the classifier. Returns false if the re-segmentation fails. Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and applies a full search on the classifier results to find the best classified segmentation. As a compromise to obtain better recall, 1-1 ambiguity substitutions ARE used.

Definition at line 565 of file applybox.cpp.

566  {
567  // Classify all required combinations of blobs and save results in choices.
568  const int word_length = word_res->box_word->length();
569  auto* choices =
570  new GenericVector<BLOB_CHOICE_LIST*>[word_length];
571  for (int i = 0; i < word_length; ++i) {
572  for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
573  BLOB_CHOICE_LIST* match_result = classify_piece(
574  word_res->seam_array, i, i + j - 1, "Applybox",
575  word_res->chopped_word, word_res->blamer_bundle);
576  if (applybox_debug > 2) {
577  tprintf("%d+%d:", i, j);
578  print_ratings_list("Segment:", match_result, unicharset);
579  }
580  choices[i].push_back(match_result);
581  }
582  }
583  // Search the segmentation graph for the target text. Must be an exact
584  // match. Using wildcards makes it difficult to find the correct
585  // segmentation even when it is there.
586  word_res->best_state.clear();
587  GenericVector<int> search_segmentation;
588  float best_rating = 0.0f;
589  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
590  &search_segmentation, &best_rating, &word_res->best_state);
591  for (int i = 0; i < word_length; ++i)
592  choices[i].delete_data_pointers();
593  delete [] choices;
594  if (word_res->best_state.empty()) {
595  // Build the original segmentation and if it is the same length as the
596  // truth, assume it will do.
597  int blob_count = 1;
598  for (int s = 0; s < word_res->seam_array.size(); ++s) {
599  SEAM* seam = word_res->seam_array[s];
600  if (!seam->HasAnySplits()) {
601  word_res->best_state.push_back(blob_count);
602  blob_count = 1;
603  } else {
604  ++blob_count;
605  }
606  }
607  word_res->best_state.push_back(blob_count);
608  if (word_res->best_state.size() != target_text.size()) {
609  word_res->best_state.clear(); // No good. Original segmentation bad size.
610  return false;
611  }
612  }
613  word_res->correct_text.clear();
614  for (int i = 0; i < target_text.size(); ++i) {
615  word_res->correct_text.push_back(
616  STRING(unicharset.id_to_unichar(target_text[i])));
617  }
618  return true;
619 }
Definition: strngs.h:45
GenericVector< STRING > correct_text
Definition: pageres.h:274
UNICHARSET unicharset
Definition: ccutil.h:71
GenericVector< SEAM * > seam_array
Definition: pageres.h:216
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
int length() const
Definition: boxword.h:83
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:635
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:833
bool HasAnySplits() const
Definition: seam.h:61
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:50
Definition: seam.h:38
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool empty() const
Definition: genericvector.h:89
tesseract::BoxWord * box_word
Definition: pageres.h:265
int push_back(T object)
TWERD * chopped_word
Definition: pageres.h:214
GenericVector< int > best_state
Definition: pageres.h:270
const int kMaxGroupSize
Definition: applybox.cpp:31
int size() const
Definition: genericvector.h:70
BlamerBundle * blamer_bundle
Definition: pageres.h:245
int16_t tesseract::Tesseract::first_alphanum_index ( const char *  word,
const char *  word_lengths 
)

Definition at line 470 of file reject.cpp.

471  {
472  int16_t i;
473  int16_t offset;
474 
475  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
476  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
477  unicharset.get_isdigit(word + offset, word_lengths[i]))
478  return i;
479  }
480  return -1;
481 }
UNICHARSET unicharset
Definition: ccutil.h:71
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
int16_t tesseract::Tesseract::first_alphanum_offset ( const char *  word,
const char *  word_lengths 
)

Definition at line 483 of file reject.cpp.

484  {
485  int16_t i;
486  int16_t offset;
487 
488  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
490  unicharset.get_isdigit(word + offset, word_lengths[i]))
491  return offset;
492  }
493  return -1;
494 }
UNICHARSET unicharset
Definition: ccutil.h:71
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
void tesseract::Tesseract::fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block 
)

Definition at line 172 of file fixspace.cpp.

174  {
175  int16_t best_score;
176  WERD_RES_LIST current_perm;
177  int16_t current_score;
178  bool improved = false;
179 
180  best_score = eval_word_spacing(best_perm); // default score
181  dump_words(best_perm, best_score, 1, improved);
182 
183  if (best_score != PERFECT_WERDS)
184  initialise_search(best_perm, current_perm);
185 
186  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
187  match_current_words(current_perm, row, block);
188  current_score = eval_word_spacing(current_perm);
189  dump_words(current_perm, current_score, 2, improved);
190  if (current_score > best_score) {
191  best_perm.clear();
192  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
193  best_score = current_score;
194  improved = true;
195  }
196  if (current_score < PERFECT_WERDS)
197  transform_to_next_perm(current_perm);
198  }
199  dump_words(best_perm, best_score, 3, improved);
200 }
#define PERFECT_WERDS
Definition: fixspace.cpp:44
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:399
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:204
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:650
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:266
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:223
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:476
void tesseract::Tesseract::fix_fuzzy_spaces ( ETEXT_DESC * monitor,
int32_t  word_count,
PAGE_RES * page_res 
)

Definition at line 75 of file fixspace.cpp.

77  {
78  BLOCK_RES_IT block_res_it;
79  ROW_RES_IT row_res_it;
80  WERD_RES_IT word_res_it_from;
81  WERD_RES_IT word_res_it_to;
82  WERD_RES *word_res;
83  WERD_RES_LIST fuzzy_space_words;
84  int16_t new_length;
85  bool prevent_null_wd_fixsp; // DON'T process blobless wds
86  int32_t word_index; // current word
87 
88  block_res_it.set_to_list(&page_res->block_res_list);
89  word_index = 0;
90  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
91  block_res_it.forward()) {
92  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
93  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
94  row_res_it.forward()) {
95  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
96  while (!word_res_it_from.at_last()) {
97  word_res = word_res_it_from.data();
98  while (!word_res_it_from.at_last() &&
99  !(word_res->combination ||
100  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
101  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
102  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
103  block_res_it.data()->block);
104  word_res = word_res_it_from.forward();
105  word_index++;
106  if (monitor != nullptr) {
107  monitor->ocr_alive = true;
108  monitor->progress = 90 + 5 * word_index / word_count;
109  if (monitor->deadline_exceeded() ||
110  (monitor->cancel != nullptr &&
111  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
112  return;
113  }
114  }
115 
116  if (!word_res_it_from.at_last()) {
117  word_res_it_to = word_res_it_from;
118  prevent_null_wd_fixsp =
119  word_res->word->cblob_list()->empty();
120  if (check_debug_pt(word_res, 60))
121  debug_fix_space_level.set_value(10);
122  word_res_it_to.forward();
123  word_index++;
124  if (monitor != nullptr) {
125  monitor->ocr_alive = true;
126  monitor->progress = 90 + 5 * word_index / word_count;
127  if (monitor->deadline_exceeded() ||
128  (monitor->cancel != nullptr &&
129  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
130  return;
131  }
132  while (!word_res_it_to.at_last () &&
133  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
134  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
135  if (check_debug_pt(word_res, 60))
136  debug_fix_space_level.set_value(10);
137  if (word_res->word->cblob_list()->empty())
138  prevent_null_wd_fixsp = true;
139  word_res = word_res_it_to.forward();
140  }
141  if (check_debug_pt(word_res, 60))
142  debug_fix_space_level.set_value(10);
143  if (word_res->word->cblob_list()->empty())
144  prevent_null_wd_fixsp = true;
145  if (prevent_null_wd_fixsp) {
146  word_res_it_from = word_res_it_to;
147  } else {
148  fuzzy_space_words.assign_to_sublist(&word_res_it_from,
149  &word_res_it_to);
150  fix_fuzzy_space_list(fuzzy_space_words,
151  row_res_it.data()->row,
152  block_res_it.data()->block);
153  new_length = fuzzy_space_words.length();
154  word_res_it_from.add_list_before(&fuzzy_space_words);
155  for (;
156  !word_res_it_from.at_last() && new_length > 0;
157  new_length--) {
158  word_res_it_from.forward();
159  }
160  }
161  if (test_pt)
162  debug_fix_space_level.set_value(0);
163  }
164  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
165  block_res_it.data()->block);
166  // Last word in row
167  }
168  }
169  }
170 }
W_FUZZY_NON: fuzzy nonspace
Definition: werd.h:40
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:172
bool combination
Definition: pageres.h:333
W_FUZZY_SP: fuzzy space
Definition: werd.h:39
int16_t progress
percent complete, increasing (0-100)
Definition: ocrclass.h:105
volatile int8_t ocr_alive
set to true by the OCR engine to signal that it is still running
Definition: ocrclass.h:110
void * cancel_this
opaque data passed as the first argument to the cancel callback
Definition: ocrclass.h:116
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1863
bool deadline_exceeded() const
Definition: ocrclass.h:138
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:562
BLOCK_RES_LIST block_res_list
Definition: pageres.h:80
CANCEL_FUNC cancel
callback that returns true to cancel recognition
Definition: ocrclass.h:112
void tesseract::Tesseract::fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block 
)

Definition at line 596 of file fixspace.cpp.

597  {
598  int16_t best_score;
599  WERD_RES_IT best_perm_it(&best_perm);
600  WERD_RES_LIST current_perm;
601  WERD_RES_IT current_perm_it(&current_perm);
602  WERD_RES *old_word_res;
603  int16_t current_score;
604  bool improved = false;
605 
606  best_score = fp_eval_word_spacing(best_perm); // default score
607 
608  dump_words(best_perm, best_score, 1, improved);
609 
610  old_word_res = best_perm_it.data();
611  // Even deep_copy doesn't copy the underlying WERD unless its combination
612  // flag is true!.
613  old_word_res->combination = true; // Kludge to force deep copy
614  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
615  old_word_res->combination = false; // Undo kludge
616 
617  break_noisiest_blob_word(current_perm);
618 
619  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
620  match_current_words(current_perm, row, block);
621  current_score = fp_eval_word_spacing(current_perm);
622  dump_words(current_perm, current_score, 2, improved);
623  if (current_score > best_score) {
624  best_perm.clear();
625  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
626  best_score = current_score;
627  improved = true;
628  }
629  if (current_score < PERFECT_WERDS) {
630  break_noisiest_blob_word(current_perm);
631  }
632  }
633  dump_words(best_perm, best_score, 3, improved);
634 }
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:642
#define PERFECT_WERDS
Definition: fixspace.cpp:44
bool combination
Definition: pageres.h:333
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:857
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:650
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:223
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:476
void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT * page_res_it)

fix_rep_char(): The word is a repeated character (a leader). Find the repeated character. Create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.

Definition at line 1720 of file control.cpp.

1720  {
1721  WERD_RES *word_res = page_res_it->word();
1722  const WERD_CHOICE &word = *(word_res->best_choice);
1723 
1724  // Find the frequency of each unique character in the word.
1725  SortHelper<UNICHAR_ID> rep_ch(word.length());
1726  for (int i = 0; i < word.length(); ++i) {
1727  rep_ch.Add(word.unichar_id(i), 1);
1728  }
1729 
1730  // Find the most frequent result.
1731  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1732  int max_count = rep_ch.MaxCount(&maxch_id);
1733  // Find the best exemplar of a classifier result for maxch_id.
1734  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1735  if (best_choice == nullptr) {
1736  tprintf("Failed to find a choice for %s, occurring %d times\n",
1737  word_res->uch_set->debug_str(maxch_id).string(), max_count);
1738  return;
1739  }
1740  word_res->done = true;
1741 
1742  // Measure the mean space.
1743  int gap_count = 0;
1744  WERD* werd = word_res->word;
1745  C_BLOB_IT blob_it(werd->cblob_list());
1746  C_BLOB* prev_blob = blob_it.data();
1747  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1748  C_BLOB* blob = blob_it.data();
1749  int gap = blob->bounding_box().left();
1750  gap -= prev_blob->bounding_box().right();
1751  ++gap_count;
1752  prev_blob = blob;
1753  }
1754  // Just correct existing classification.
1755  CorrectRepcharChoices(best_choice, word_res);
1756  word_res->reject_map.initialise(word.length());
1757 }
Definition: werd.h:56
bool done
Definition: pageres.h:297
int length() const
Definition: ratngs.h:303
TBOX bounding_box() const
Definition: stepblob.cpp:253
REJMAP reject_map
Definition: pageres.h:286
const UNICHARSET * uch_set
Definition: pageres.h:205
void initialise(int16_t length)
Definition: rejctmap.cpp:273
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void Add(T value, int count)
Definition: sorthelper.h:65
WERD_RES * word() const
Definition: pageres.h:755
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
int16_t left() const
Definition: rect.h:72
int UNICHAR_ID
Definition: unichar.h:34
WERD * word
Definition: pageres.h:188
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
void tesseract::Tesseract::fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW * row,
BLOCK * block 
)

Definition at line 562 of file fixspace.cpp.

563  {
564  WERD_RES *word_res;
565  WERD_RES_LIST sub_word_list;
566  WERD_RES_IT sub_word_list_it(&sub_word_list);
567  int16_t blob_index;
568  int16_t new_length;
569  float junk;
570 
571  word_res = word_res_it.data();
572  if (word_res->word->flag(W_REP_CHAR) ||
573  word_res->combination ||
574  word_res->part_of_combo ||
575  !word_res->word->flag(W_DONT_CHOP))
576  return;
577 
578  blob_index = worst_noise_blob(word_res, &junk);
579  if (blob_index < 0)
580  return;
581 
582  if (debug_fix_space_level > 1) {
583  tprintf("FP fixspace working on \"%s\"\n",
584  word_res->best_choice->unichar_string().string());
585  }
586  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
587  sub_word_list_it.add_after_stay_put(word_res_it.extract());
588  fix_noisy_space_list(sub_word_list, row, block);
589  new_length = sub_word_list.length();
590  word_res_it.add_list_before(&sub_word_list);
591  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
592  word_res_it.forward();
593  }
594 }
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:596
const STRING & unichar_string() const
Definition: ratngs.h:541
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:90
bool combination
Definition: pageres.h:333
W_REP_CHAR: repeated character
Definition: werd.h:38
W_DONT_CHOP: fixed pitch chopped
Definition: werd.h:37
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool part_of_combo
Definition: pageres.h:334
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:707
WERD * word
Definition: pageres.h:188
bool tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES * word)

Definition at line 530 of file fixspace.cpp.

530  {
531  if (word->done)
532  return true;
533 
534  /*
535  Use all the standard pass 2 conditions for mode 5 in set_done() in
536  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
537  CARE WHETHER WE HAVE of/at on/an etc.
538  */
539  if (fixsp_done_mode > 0 &&
540  (word->tess_accepted ||
541  (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
542  fixsp_done_mode == 3) &&
543  (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr) &&
544  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
545  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
546  (word->best_choice->permuter() == USER_DAWG_PERM) ||
547  (word->best_choice->permuter() == NUMBER_PERM))) {
548  return true;
549  } else {
550  return false;
551  }
552 }
bool done
Definition: pageres.h:297
const STRING & unichar_string() const
Definition: ratngs.h:541
bool tess_accepted
Definition: pageres.h:295
REJMAP reject_map
Definition: pageres.h:286
const char * string() const
Definition: strngs.cpp:194
WERD_CHOICE * best_choice
Definition: pageres.h:234
int16_t reject_count()
Definition: rejctmap.h:229
uint8_t permuter() const
Definition: ratngs.h:346
void tesseract::Tesseract::flip_0O ( WERD_RES * word)

Definition at line 674 of file reject.cpp.

674  {
675  WERD_CHOICE *best_choice = word_res->best_choice;
676  int i;
677  TBOX out_box;
678 
679  if (!tessedit_flip_0O)
680  return;
681 
682  int num_blobs = word_res->rebuild_word->NumBlobs();
683  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
684  TBLOB* blob = word_res->rebuild_word->blobs[i];
685  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
686  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
687  out_box = blob->bounding_box();
688  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
689  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
690  return; //Beware words with sub/superscripts
691  }
692  }
693  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
694  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
695  if (unichar_0 == INVALID_UNICHAR_ID ||
696  !word_res->uch_set->get_enabled(unichar_0) ||
697  unichar_O == INVALID_UNICHAR_ID ||
698  !word_res->uch_set->get_enabled(unichar_O)) {
699  return; // 0 or O are not present/enabled in unicharset
700  }
701  for (i = 1; i < best_choice->length(); ++i) {
702  if (best_choice->unichar_id(i) == unichar_0 ||
703  best_choice->unichar_id(i) == unichar_O) {
704  /* A0A */
705  if ((i+1) < best_choice->length() &&
706  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
707  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
708  best_choice->set_unichar_id(unichar_O, i);
709  }
710  /* A00A */
711  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
712  (i+1) < best_choice->length() &&
713  (best_choice->unichar_id(i+1) == unichar_0 ||
714  best_choice->unichar_id(i+1) == unichar_O) &&
715  (i+2) < best_choice->length() &&
716  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
717  best_choice->set_unichar_id(unichar_O, i);
718  i++;
719  }
720  /* AA0<non digit or end of word> */
721  if ((i > 1) &&
722  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
723  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
724  (((i+1) < best_choice->length() &&
725  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
726  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
727  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
728  (i == best_choice->length() - 1))) {
729  best_choice->set_unichar_id(unichar_O, i);
730  }
731  /* 9O9 */
732  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
733  (i+1) < best_choice->length() &&
734  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
735  best_choice->set_unichar_id(unichar_0, i);
736  }
737  /* 9OOO */
738  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
739  (i+2) < best_choice->length() &&
740  (best_choice->unichar_id(i+1) == unichar_0 ||
741  best_choice->unichar_id(i+1) == unichar_O) &&
742  (best_choice->unichar_id(i+2) == unichar_0 ||
743  best_choice->unichar_id(i+2) == unichar_O)) {
744  best_choice->set_unichar_id(unichar_0, i);
745  best_choice->set_unichar_id(unichar_0, i+1);
746  best_choice->set_unichar_id(unichar_0, i+2);
747  i += 2;
748  }
749  /* 9OO<non upper> */
750  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
751  (i+2) < best_choice->length() &&
752  (best_choice->unichar_id(i+1) == unichar_0 ||
753  best_choice->unichar_id(i+1) == unichar_O) &&
754  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
755  best_choice->set_unichar_id(unichar_0, i);
756  best_choice->set_unichar_id(unichar_0, i+1);
757  i++;
758  }
759  /* 9O<non upper> */
760  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
761  (i+1) < best_choice->length() &&
762  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
763  best_choice->set_unichar_id(unichar_0, i);
764  }
765  /* 9[.,]OOO.. */
766  if ((i > 1) &&
767  (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
768  word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
769  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
770  best_choice->unichar_id(i-2) == unichar_O)) {
771  if (best_choice->unichar_id(i-2) == unichar_O) {
772  best_choice->set_unichar_id(unichar_0, i-2);
773  }
774  while (i < best_choice->length() &&
775  (best_choice->unichar_id(i) == unichar_O ||
776  best_choice->unichar_id(i) == unichar_0)) {
777  best_choice->set_unichar_id(unichar_0, i);
778  i++;
779  }
780  i--;
781  }
782  }
783  }
784 }
const int kBlnXHeight
Definition: normalis.h:24
int16_t top() const
Definition: rect.h:58
Definition: rect.h:34
TBOX bounding_box() const
Definition: blobs.cpp:472
int length() const
Definition: ratngs.h:303
Definition: blobs.h:263
const int kBlnBaselineOffset
Definition: normalis.h:25
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:786
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
int16_t bottom() const
Definition: rect.h:65
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int UNICHAR_ID
Definition: unichar.h:34
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:790
void tesseract::Tesseract::flip_hyphens ( WERD_RES * word)

Definition at line 617 of file reject.cpp.

617  {
618  WERD_CHOICE *best_choice = word_res->best_choice;
619  int i;
620  int prev_right = -9999;
621  int next_left;
622  TBOX out_box;
623  float aspect_ratio;
624 
625  if (tessedit_lower_flip_hyphen <= 1)
626  return;
627 
628  int num_blobs = word_res->rebuild_word->NumBlobs();
629  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
630  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
631  TBLOB* blob = word_res->rebuild_word->blobs[i];
632  out_box = blob->bounding_box();
633  if (i + 1 == num_blobs)
634  next_left = 9999;
635  else
636  next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
637  // Don't touch small or touching blobs - it is too dangerous.
638  if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
639  (out_box.left() > prev_right) && (out_box.right() < next_left)) {
640  aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
641  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
642  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
643  word_res->uch_set->contains_unichar_id(unichar_dash) &&
644  word_res->uch_set->get_enabled(unichar_dash)) {
645  /* Certain HYPHEN */
646  best_choice->set_unichar_id(unichar_dash, i);
647  if (word_res->reject_map[i].rejected())
648  word_res->reject_map[i].setrej_hyphen_accept();
649  }
650  if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
651  word_res->reject_map[i].accepted())
652  //Suspected HYPHEN
653  word_res->reject_map[i].setrej_hyphen ();
654  }
655  else if (best_choice->unichar_id(i) == unichar_dash) {
656  if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
657  (word_res->reject_map[i].rejected()))
658  word_res->reject_map[i].setrej_hyphen_accept();
659  //Certain HYPHEN
660 
661  if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
662  (word_res->reject_map[i].accepted()))
663  //Suspected HYPHEN
664  word_res->reject_map[i].setrej_hyphen();
665  }
666  }
667  prev_right = out_box.right();
668  }
669 }
Definition: rect.h:34
TBOX bounding_box() const
Definition: blobs.cpp:472
int length() const
Definition: ratngs.h:303
Definition: blobs.h:263
int16_t height() const
Definition: rect.h:108
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
int16_t width() const
Definition: rect.h:115
int16_t right() const
Definition: rect.h:79
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int16_t left() const
Definition: rect.h:72
int UNICHAR_ID
Definition: unichar.h:34
void tesseract::Tesseract::font_recognition_pass ( PAGE_RES * page_res)

font_recognition_pass

Smooth the fonts for the document.

Definition at line 2055 of file control.cpp.

2055  {
2056  PAGE_RES_IT page_res_it(page_res);
2057  WERD_RES *word; // current word
2058  STATS doc_fonts(0, font_table_size_); // font counters
2059 
2060  // Gather font id statistics.
2061  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2062  page_res_it.forward()) {
2063  word = page_res_it.word();
2064  if (word->fontinfo != nullptr) {
2065  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2066  }
2067  if (word->fontinfo2 != nullptr) {
2068  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2069  }
2070  }
2071  int16_t doc_font; // modal font
2072  int8_t doc_font_count; // modal font
2073  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2074  if (doc_font_count == 0)
2075  return;
2076  // Get the modal font pointer.
2077  const FontInfo* modal_font = nullptr;
2078  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2079  page_res_it.forward()) {
2080  word = page_res_it.word();
2081  if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2082  modal_font = word->fontinfo;
2083  break;
2084  }
2085  if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2086  modal_font = word->fontinfo2;
2087  break;
2088  }
2089  }
2090  ASSERT_HOST(modal_font != nullptr);
2091 
2092  // Assign modal font to weak words.
2093  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2094  page_res_it.forward()) {
2095  word = page_res_it.word();
2096  const int length = word->best_choice->length();
2097 
2098  const int count = word->fontinfo_id_count;
2099  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2100  word->fontinfo = modal_font;
2101  // Counts only get 1 as it came from the doc.
2102  word->fontinfo_id_count = 1;
2103  word->italic = modal_font->is_italic() ? 1 : -1;
2104  word->bold = modal_font->is_bold() ? 1 : -1;
2105  }
2106  }
2107 }
bool is_italic() const
Definition: fontinfo.h:111
int length() const
Definition: ratngs.h:303
int8_t bold
Definition: pageres.h:301
const FontInfo * fontinfo2
Definition: pageres.h:304
int8_t fontinfo_id2_count
Definition: pageres.h:306
const FontInfo * fontinfo
Definition: pageres.h:303
int32_t universal_id
Definition: fontinfo.h:123
int8_t fontinfo_id_count
Definition: pageres.h:305
WERD_CHOICE * best_choice
Definition: pageres.h:234
#define ASSERT_HOST(x)
Definition: errcode.h:88
Definition: statistc.h:31
bool is_bold() const
Definition: fontinfo.h:112
WERD * word
Definition: pageres.h:188
int count(LIST var_list)
Definition: oldlist.cpp:96
int8_t italic
Definition: pageres.h:300
int16_t tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 857 of file fixspace.cpp.

857  {
858  WERD_RES_IT word_it(&word_res_list);
859  WERD_RES *word;
860  int16_t score = 0;
861  int16_t i;
862  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
863 
864  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
865  word = word_it.data();
866  if (word->rebuild_word == nullptr)
867  continue; // Can't handle cube words.
868  if (word->done ||
869  word->tess_accepted ||
870  word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
871  word->best_choice->permuter() == FREQ_DAWG_PERM ||
872  word->best_choice->permuter() == USER_DAWG_PERM ||
873  safe_dict_word(word) > 0) {
874  int num_blobs = word->rebuild_word->NumBlobs();
875  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
876  for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
877  TBLOB* blob = word->rebuild_word->blobs[i];
878  if (word->best_choice->unichar_id(i) == space ||
879  blob_noise_score(blob) < small_limit) {
880  score -= 1; // penalise possibly erroneous non-space
881  } else if (word->reject_map[i].accepted()) {
882  score++;
883  }
884  }
885  }
886  }
887  if (score < 0)
888  score = 0;
889  return score;
890 }
const int kBlnXHeight
Definition: normalis.h:24
bool done
Definition: pageres.h:297
TWERD * rebuild_word
Definition: pageres.h:259
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608
int length() const
Definition: ratngs.h:303
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:787
Definition: blobs.h:263
bool tess_accepted
Definition: pageres.h:295
REJMAP reject_map
Definition: pageres.h:286
const UNICHARSET * uch_set
Definition: pageres.h:205
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
int NumBlobs() const
Definition: blobs.h:427
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
int UNICHAR_ID
Definition: unichar.h:34
uint8_t permuter() const
Definition: ratngs.h:346
GARBAGE_LEVEL tesseract::Tesseract::garbage_word ( WERD_RES *  word,
bool  ok_dict_word 
)

Definition at line 679 of file docqual.cpp.

679  {
680  enum STATES
681  {
682  JUNK,
683  FIRST_UPPER,
684  FIRST_LOWER,
685  FIRST_NUM,
686  SUBSEQUENT_UPPER,
687  SUBSEQUENT_LOWER,
688  SUBSEQUENT_NUM
689  };
690  const char *str = word->best_choice->unichar_string().string();
691  const char *lengths = word->best_choice->unichar_lengths().string();
692  STATES state = JUNK;
693  int len = 0;
694  int isolated_digits = 0;
695  int isolated_alphas = 0;
696  int bad_char_count = 0;
697  int tess_rejs = 0;
698  int dodgy_chars = 0;
699  int ok_chars;
700  UNICHAR_ID last_char = -1;
701  int alpha_repetition_count = 0;
702  int longest_alpha_repetition_count = 0;
703  int longest_lower_run_len = 0;
704  int lower_string_count = 0;
705  int longest_upper_run_len = 0;
706  int upper_string_count = 0;
707  int total_alpha_count = 0;
708  int total_digit_count = 0;
709 
710  for (; *str != '\0'; str += *(lengths++)) {
711  len++;
712  if (word->uch_set->get_isupper (str, *lengths)) {
713  total_alpha_count++;
714  switch (state) {
715  case SUBSEQUENT_UPPER:
716  case FIRST_UPPER:
717  state = SUBSEQUENT_UPPER;
718  upper_string_count++;
719  if (longest_upper_run_len < upper_string_count)
720  longest_upper_run_len = upper_string_count;
721  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
722  alpha_repetition_count++;
723  if (longest_alpha_repetition_count < alpha_repetition_count) {
724  longest_alpha_repetition_count = alpha_repetition_count;
725  }
726  }
727  else {
728  last_char = word->uch_set->unichar_to_id(str, *lengths);
729  alpha_repetition_count = 1;
730  }
731  break;
732  case FIRST_NUM:
733  isolated_digits++;
734  // Fall through.
735  default:
736  state = FIRST_UPPER;
737  last_char = word->uch_set->unichar_to_id(str, *lengths);
738  alpha_repetition_count = 1;
739  upper_string_count = 1;
740  break;
741  }
742  }
743  else if (word->uch_set->get_islower (str, *lengths)) {
744  total_alpha_count++;
745  switch (state) {
746  case SUBSEQUENT_LOWER:
747  case FIRST_LOWER:
748  state = SUBSEQUENT_LOWER;
749  lower_string_count++;
750  if (longest_lower_run_len < lower_string_count)
751  longest_lower_run_len = lower_string_count;
752  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
753  alpha_repetition_count++;
754  if (longest_alpha_repetition_count < alpha_repetition_count) {
755  longest_alpha_repetition_count = alpha_repetition_count;
756  }
757  }
758  else {
759  last_char = word->uch_set->unichar_to_id(str, *lengths);
760  alpha_repetition_count = 1;
761  }
762  break;
763  case FIRST_NUM:
764  isolated_digits++;
765  // Fall through.
766  default:
767  state = FIRST_LOWER;
768  last_char = word->uch_set->unichar_to_id(str, *lengths);
769  alpha_repetition_count = 1;
770  lower_string_count = 1;
771  break;
772  }
773  }
774  else if (word->uch_set->get_isdigit (str, *lengths)) {
775  total_digit_count++;
776  switch (state) {
777  case FIRST_NUM:
778  state = SUBSEQUENT_NUM;
779  case SUBSEQUENT_NUM:
780  break;
781  case FIRST_UPPER:
782  case FIRST_LOWER:
783  isolated_alphas++;
784  // Fall through.
785  default:
786  state = FIRST_NUM;
787  break;
788  }
789  }
790  else {
791  if (*lengths == 1 && *str == ' ')
792  tess_rejs++;
793  else
794  bad_char_count++;
795  switch (state) {
796  case FIRST_NUM:
797  isolated_digits++;
798  break;
799  case FIRST_UPPER:
800  case FIRST_LOWER:
801  isolated_alphas++;
802  default:
803  break;
804  }
805  state = JUNK;
806  }
807  }
808 
809  switch (state) {
810  case FIRST_NUM:
811  isolated_digits++;
812  break;
813  case FIRST_UPPER:
814  case FIRST_LOWER:
815  isolated_alphas++;
816  default:
817  break;
818  }
819 
821  total_alpha_count += total_digit_count - isolated_digits;
822  }
823 
824  if (crunch_leave_ok_strings && len >= 4 &&
825  2 * (total_alpha_count - isolated_alphas) > len &&
826  longest_alpha_repetition_count < crunch_long_repetitions) {
827  if ((crunch_accept_ok &&
828  acceptable_word_string(*word->uch_set, str, lengths) !=
829  AC_UNACCEPTABLE) ||
830  longest_lower_run_len > crunch_leave_lc_strings ||
831  longest_upper_run_len > crunch_leave_uc_strings)
832  return G_NEVER_CRUNCH;
833  }
834  if (word->reject_map.length() > 1 &&
835  strpbrk(str, " ") == nullptr &&
836  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
837  word->best_choice->permuter() == FREQ_DAWG_PERM ||
838  word->best_choice->permuter() == USER_DAWG_PERM ||
839  word->best_choice->permuter() == NUMBER_PERM ||
840  acceptable_word_string(*word->uch_set, str, lengths) !=
841  AC_UNACCEPTABLE || ok_dict_word))
842  return G_OK;
843 
844  ok_chars = len - bad_char_count - isolated_digits -
845  isolated_alphas - tess_rejs;
846 
847  if (crunch_debug > 3) {
848  tprintf("garbage_word: \"%s\"\n",
849  word->best_choice->unichar_string().string());
850  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
851  len,
852  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
853  }
854  if (bad_char_count == 0 &&
855  tess_rejs == 0 &&
856  (len > isolated_digits + isolated_alphas || len <= 2))
857  return G_OK;
858 
859  if (tess_rejs > ok_chars ||
860  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
861  return G_TERRIBLE;
862 
863  if (len > 4) {
864  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
865  isolated_alphas;
866  if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
867  return G_DODGY;
868  else
869  return G_OK;
870  } else {
871  dodgy_chars = 2 * tess_rejs + bad_char_count;
872  if ((len == 4 && dodgy_chars > 2) ||
873  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
874  return G_DODGY;
875  else
876  return G_OK;
877  }
878 }
Unacceptable word.
Definition: control.h:30
int32_t length() const
Definition: rejctmap.h:223
const STRING & unichar_string() const
Definition: ratngs.h:541
REJMAP reject_map
Definition: pageres.h:286
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
const UNICHARSET * uch_set
Definition: pageres.h:205
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1759
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
Definition: docqual.h:32
WERD_CHOICE * best_choice
Definition: pageres.h:234
int UNICHAR_ID
Definition: unichar.h:34
const STRING & unichar_lengths() const
Definition: ratngs.h:548
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
uint8_t permuter() const
Definition: ratngs.h:346
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES *  word)

Definition at line 251 of file output.cpp.

251  { // what char is repeated?
252  int i;
253  for (i = 0; ((i < word->reject_map.length()) &&
254  (word->reject_map[i].rejected())); ++i);
255 
256  if (i < word->reject_map.length()) {
257  return word->best_choice->unichar_id(i);
258  } else {
259  return word->uch_set->unichar_to_id(unrecognised_char.string());
260  }
261 }
int32_t length() const
Definition: rejctmap.h:223
REJMAP reject_map
Definition: pageres.h:286
const UNICHARSET * uch_set
Definition: pageres.h:205
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
Tesseract* tesseract::Tesseract::get_sub_lang ( int  index) const
inline

Definition at line 281 of file tesseractclass.h.

281  {
282  return sub_langs_[index];
283  }
Dict & tesseract::Tesseract::getDict ( )
overridevirtual

Reimplemented from tesseract::Classify.

Definition at line 564 of file tesseractclass.cpp.

565 {
566  if (0 == Classify::getDict().NumDawgs() && AnyLSTMLang())
567  {
568  if (lstm_recognizer_ && lstm_recognizer_->GetDict())
569  {
570  return *const_cast<Dict*>(lstm_recognizer_->GetDict());
571  }
572  }
573  return Classify::getDict();
574  }
const Dict * GetDict() const
virtual Dict & getDict()
Definition: classify.h:107
bool AnyLSTMLang() const
ImageData * tesseract::Tesseract::GetLineData ( const TBOX &  line_box,
const GenericVector< TBOX > &  boxes,
const GenericVector< STRING > &  texts,
int  start_box,
int  end_box,
const BLOCK &  block 
)

Definition at line 129 of file linerec.cpp.

133  {
134  TBOX revised_box;
135  ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
136  &revised_box);
137  if (image_data == nullptr) return nullptr;
138  image_data->set_page_number(applybox_page);
139  // Copy the boxes and shift them so they are relative to the image.
140  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
141  ICOORD shift = -revised_box.botleft();
142  GenericVector<TBOX> line_boxes;
143  GenericVector<STRING> line_texts;
144  for (int b = start_box; b < end_box; ++b) {
145  TBOX box = boxes[b];
146  box.rotate(block_rotation);
147  box.move(shift);
148  line_boxes.push_back(box);
149  line_texts.push_back(texts[b]);
150  }
151  GenericVector<int> page_numbers;
152  page_numbers.init_to_size(line_boxes.size(), applybox_page);
153  image_data->AddBoxes(line_boxes, line_texts, page_numbers);
154  return image_data;
155 }
Definition: rect.h:34
FCOORD re_rotation() const
Definition: ocrblock.h:135
float x() const
Definition: points.h:207
Definition: points.h:188
integer coordinate
Definition: points.h:31
void init_to_size(int size, const T &t)
const ICOORD & botleft() const
Definition: rect.h:92
float y() const
Definition: points.h:210
int push_back(T object)
void rotate(const FCOORD &vec)
Definition: rect.h:197
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:163
void move(const ICOORD vec)
Definition: rect.h:157
int size() const
Definition: genericvector.h:70
const int kImagePadding
Definition: imagedata.h:39
ImageData * tesseract::Tesseract::GetRectImage ( const TBOX &  box,
const BLOCK &  block,
int  padding,
TBOX *  revised_box 
) const

Definition at line 163 of file linerec.cpp.

164  {
165  TBOX wbox = box;
166  wbox.pad(padding, padding);
167  *revised_box = wbox;
168  // Number of clockwise 90 degree rotations needed to get back to tesseract
169  // coords from the clipped image.
170  int num_rotations = 0;
171  if (block.re_rotation().y() > 0.0f)
172  num_rotations = 1;
173  else if (block.re_rotation().x() < 0.0f)
174  num_rotations = 2;
175  else if (block.re_rotation().y() < 0.0f)
176  num_rotations = 3;
177  // Handle two cases automatically: 1 the box came from the block, 2 the box
178  // came from a box file, and refers to the image, which the block may not.
179  if (block.pdblk.bounding_box().major_overlap(*revised_box))
180  revised_box->rotate(block.re_rotation());
181  // Now revised_box always refers to the image.
182  // BestPix is never colormapped, but may be of any depth.
183  Pix* pix = BestPix();
184  int width = pixGetWidth(pix);
185  int height = pixGetHeight(pix);
186  TBOX image_box(0, 0, width, height);
187  // Clip to image bounds;
188  *revised_box &= image_box;
189  if (revised_box->null_box()) return nullptr;
190  Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
191  revised_box->width(), revised_box->height());
192  Pix* box_pix = pixClipRectangle(pix, clip_box, nullptr);
193  if (box_pix == nullptr) return nullptr;
194  boxDestroy(&clip_box);
195  if (num_rotations > 0) {
196  Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
197  pixDestroy(&box_pix);
198  box_pix = rot_pix;
199  }
200  // Convert sub-8-bit images to 8 bit.
201  int depth = pixGetDepth(box_pix);
202  if (depth < 8) {
203  Pix* grey;
204  grey = pixConvertTo8(box_pix, false);
205  pixDestroy(&box_pix);
206  box_pix = grey;
207  }
208  bool vertical_text = false;
209  if (num_rotations > 0) {
210  // Rotated the clipped revised box back to internal coordinates.
211  FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
212  revised_box->rotate(rotation);
213  if (num_rotations != 2)
214  vertical_text = true;
215  }
216  return new ImageData(vertical_text, box_pix);
217 }
int16_t top() const
Definition: rect.h:58
Definition: rect.h:34
FCOORD re_rotation() const
Definition: ocrblock.h:135
float x() const
Definition: points.h:207
Definition: points.h:188
int16_t height() const
Definition: rect.h:108
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
float y() const
Definition: points.h:210
Pix * BestPix() const
int16_t width() const
Definition: rect.h:115
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:191
void rotate(const FCOORD &vec)
Definition: rect.h:197
int16_t left() const
Definition: rect.h:72
bool null_box() const
Definition: rect.h:50
void pad(int xpad, int ypad)
Definition: rect.h:131
void tesseract::Tesseract::GetSubAndSuperscriptCandidates ( const WERD_RES *  word,
int *  num_rebuilt_leading,
ScriptPos *  leading_pos,
float *  leading_certainty,
int *  num_rebuilt_trailing,
ScriptPos *  trailing_pos,
float *  trailing_certainty,
float *  avg_certainty,
float *  unlikely_threshold 
)

Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.

Parameters
[in] word: The word to examine.
[out] num_rebuilt_leading: The number of rebuilt blobs at the start of the word which are all up or down and seem badly classified.
[out] leading_pos: "super" or "sub" (for debugging).
[out] leading_certainty: The worst certainty in the leading blobs.
[out] num_rebuilt_trailing: The number of rebuilt blobs at the end of the word which are all up or down and seem badly classified.
[out] trailing_pos: "super" or "sub" (for debugging).
[out] trailing_certainty: The worst certainty in the trailing blobs.
[out] avg_certainty: The average certainty of "normal" blobs in the word.
[out] unlikely_threshold: The threshold (on certainty) we used to select "bad enough" outlier characters.

Definition at line 254 of file superscript.cpp.

262  {
263  *avg_certainty = *unlikely_threshold = 0.0f;
264  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
265  *leading_certainty = *trailing_certainty = 0.0f;
266 
267  int super_y_bottom =
269  int sub_y_top =
271 
272  // Step one: Get an average certainty for "normally placed" characters.
273 
274  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
275  *leading_pos = *trailing_pos = SP_NORMAL;
276  int leading_outliers = 0;
277  int trailing_outliers = 0;
278  int num_normal = 0;
279  float normal_certainty_total = 0.0f;
280  float worst_normal_certainty = 0.0f;
281  ScriptPos last_pos = SP_NORMAL;
282  int num_blobs = word->rebuild_word->NumBlobs();
283  for (int b = 0; b < num_blobs; ++b) {
284  TBOX box = word->rebuild_word->blobs[b]->bounding_box();
285  ScriptPos pos = SP_NORMAL;
286  if (box.bottom() >= super_y_bottom) {
287  pos = SP_SUPERSCRIPT;
288  } else if (box.top() <= sub_y_top) {
289  pos = SP_SUBSCRIPT;
290  }
291  if (pos == SP_NORMAL) {
292  if (word->best_choice->unichar_id(b) != 0) {
293  float char_certainty = word->best_choice->certainty(b);
294  if (char_certainty < worst_normal_certainty) {
295  worst_normal_certainty = char_certainty;
296  }
297  num_normal++;
298  normal_certainty_total += char_certainty;
299  }
300  if (trailing_outliers == b) {
301  leading_outliers = trailing_outliers;
302  *leading_pos = last_pos;
303  }
304  trailing_outliers = 0;
305  } else {
306  if (last_pos == pos) {
307  trailing_outliers++;
308  } else {
309  trailing_outliers = 1;
310  }
311  }
312  last_pos = pos;
313  }
314  *trailing_pos = last_pos;
315  if (num_normal >= 3) { // throw out the worst as an outlier.
316  num_normal--;
317  normal_certainty_total -= worst_normal_certainty;
318  }
319  if (num_normal > 0) {
320  *avg_certainty = normal_certainty_total / num_normal;
321  *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
322  }
323  if (num_normal == 0 ||
324  (leading_outliers == 0 && trailing_outliers == 0)) {
325  return;
326  }
327 
328  // Step two: Try to split off bits of the word that are both outliers
329  // and have much lower certainty than average
330  // Calculate num_leading and leading_certainty.
331  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
332  *num_rebuilt_leading < leading_outliers;
333  (*num_rebuilt_leading)++) {
334  float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
335  if (char_certainty > *unlikely_threshold) {
336  break;
337  }
338  if (char_certainty < *leading_certainty) {
339  *leading_certainty = char_certainty;
340  }
341  }
342 
343  // Calculate num_trailing and trailing_certainty.
344  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
345  *num_rebuilt_trailing < trailing_outliers;
346  (*num_rebuilt_trailing)++) {
347  int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
348  float char_certainty = word->best_choice->certainty(blob_idx);
349  if (char_certainty > *unlikely_threshold) {
350  break;
351  }
352  if (char_certainty < *trailing_certainty) {
353  *trailing_certainty = char_certainty;
354  }
355  }
356 }
const int kBlnXHeight
Definition: normalis.h:24
int16_t top() const
Definition: rect.h:58
Definition: rect.h:34
TWERD * rebuild_word
Definition: pageres.h:259
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
const int kBlnBaselineOffset
Definition: normalis.h:25
int16_t bottom() const
Definition: rect.h:65
int NumBlobs() const
Definition: blobs.h:427
float certainty() const
Definition: ratngs.h:330
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
double superscript_worse_certainty
int tesseract::Tesseract::ImageHeight ( ) const
inline

Definition at line 255 of file tesseractclass.h.

255  {
256  return pixGetHeight(pix_binary_);
257  }
int tesseract::Tesseract::ImageWidth ( ) const
inline

Definition at line 252 of file tesseractclass.h.

252  {
253  return pixGetWidth(pix_binary_);
254  }
FILE * tesseract::Tesseract::init_recog_training ( const STRING fname)

Definition at line 36 of file recogtraining.cpp.

36  {
38  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39  tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
40  // Explore all segmentations.
42  }
43 
44  STRING output_fname = fname;
45  const char* lastdot = strrchr(output_fname.string(), '.');
46  if (lastdot != nullptr)
47  output_fname[lastdot - output_fname.string()] = '\0';
48  output_fname += ".txt";
49  FILE* output_file = fopen(output_fname.string(), "a+");
50  if (output_file == nullptr) {
51  tprintf("Error: Could not open file %s\n", output_fname.string());
52  ASSERT_HOST(output_file);
53  }
54  return output_file;
55 }
Definition: strngs.h:45
bool stopper_no_acceptable_choices
Definition: dict.h:631
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
Dict & getDict() override
#define ASSERT_HOST(x)
Definition: errcode.h:88
int tesseract::Tesseract::init_tesseract ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager *  mgr 
)

Definition at line 284 of file tessedit.cpp.

290  {
291  GenericVector<STRING> langs_to_load;
292  GenericVector<STRING> langs_not_to_load;
293  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
294 
295  sub_langs_.delete_data_pointers();
296  sub_langs_.clear();
297  // Find the first loadable lang and load into this.
298  // Add any languages that this language requires
299  bool loaded_primary = false;
300  // Load the rest into sub_langs_.
301  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
302  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
303  const char* lang_str = langs_to_load[lang_index].string();
304  Tesseract* tess_to_init;
305  if (!loaded_primary) {
306  tess_to_init = this;
307  } else {
308  tess_to_init = new Tesseract;
309  }
310 
311  int result = tess_to_init->init_tesseract_internal(
312  arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
313  vars_values, set_only_non_debug_params, mgr);
314  // Forget that language, but keep any reader we were given.
315  mgr->Clear();
316 
317  if (!loaded_primary) {
318  if (result < 0) {
319  tprintf("Failed loading language '%s'\n", lang_str);
320  } else {
321  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
322  &langs_to_load, &langs_not_to_load);
323  loaded_primary = true;
324  }
325  } else {
326  if (result < 0) {
327  tprintf("Failed loading language '%s'\n", lang_str);
328  delete tess_to_init;
329  } else {
330  sub_langs_.push_back(tess_to_init);
331  // Add any languages that this language requires
332  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
333  &langs_to_load, &langs_not_to_load);
334  }
335  }
336  }
337  }
338  if (!loaded_primary) {
339  tprintf("Tesseract couldn't load any languages!\n");
340  return -1; // Couldn't load any language!
341  }
342 #ifndef DISABLED_LEGACY_ENGINE
343  if (!sub_langs_.empty()) {
344  // In multilingual mode word ratings have to be directly comparable,
345  // so use the same language model weights for all languages:
346  // use the primary language's params model if
347  // tessedit_use_primary_params_model is set,
348  // otherwise use default language model weights.
350  for (int s = 0; s < sub_langs_.size(); ++s) {
351  sub_langs_[s]->language_model_->getParamsModel().Copy(
352  this->language_model_->getParamsModel());
353  }
354  tprintf("Using params model of the primary language\n");
355  } else {
356  this->language_model_->getParamsModel().Clear();
357  for (int s = 0; s < sub_langs_.size(); ++s) {
358  sub_langs_[s]->language_model_->getParamsModel().Clear();
359  }
360  }
361  }
362 
364 #endif // ndef DISABLED_LEGACY_ENGINE
365  return 0;
366 }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
void SetupUniversalFontIds()
Definition: tessedit.cpp:429
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:252
int size() const
Definition: genericvector.h:70
int tesseract::Tesseract::init_tesseract ( const char *  datapath,
const char *  language,
OcrEngineMode  oem 
)
inline

Definition at line 511 of file tesseractclass.h.

512  {
513  TessdataManager mgr;
514  return init_tesseract(datapath, nullptr, language, oem, nullptr, 0, nullptr,
515  nullptr, false, &mgr);
516  }
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:284
int tesseract::Tesseract::init_tesseract_internal ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager *  mgr 
)

Definition at line 384 of file tessedit.cpp.

390  {
391  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
392  configs_size, vars_vec, vars_values,
393  set_only_non_debug_params, mgr)) {
394  return -1;
395  }
397  return 0;
398  }
399  // If only LSTM will be used, skip loading Tesseract classifier's
400  // pre-trained templates and dictionary.
402  program_editup(textbase, init_tesseract ? mgr : nullptr,
403  init_tesseract ? mgr : nullptr);
404  return 0; // Normal exit
405 }
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:80
void program_editup(const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:40
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:284
bool tesseract::Tesseract::init_tesseract_lang_data ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager *  mgr 
)

Definition at line 80 of file tessedit.cpp.

85  {
86  // Set the basename, compute the data directory.
87  main_setup(arg0, textbase);
88 
89  // Set the language data path prefix
90  lang = language != nullptr ? language : "eng";
94 
95  // Initialize TessdataManager.
96  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
97  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
98  tprintf("Error opening data file %s\n", tessdata_path.string());
99  tprintf(
100  "Please make sure the TESSDATA_PREFIX environment variable is set"
101  " to your \"tessdata\" directory.\n");
102  return false;
103  }
104 #ifndef DISABLED_LEGACY_ENGINE
105  if (oem == OEM_DEFAULT) {
106  // Set the engine mode from availability, which can then be overridden by
107  // the config file when we read it below.
108  if (!mgr->IsLSTMAvailable()) {
110  } else if (!mgr->IsBaseAvailable()) {
112  } else {
114  }
115  }
116 #endif // ndef DISABLED_LEGACY_ENGINE
117 
118  // If a language specific config file (lang.config) exists, load it in.
119  TFile fp;
120  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
122  this->params());
123  }
124 
125  SetParamConstraint set_params_constraint =
126  set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
128  // Load tesseract variables from config files. This is done after loading
129  // language-specific variables from [lang].traineddata file, so that custom
130  // config files can override values in [lang].traineddata file.
131  for (int i = 0; i < configs_size; ++i) {
132  read_config_file(configs[i], set_params_constraint);
133  }
134 
135  // Set params specified in vars_vec (done after setting params from config
136  // files, so that params in vars_vec can override those from files).
137  if (vars_vec != nullptr && vars_values != nullptr) {
138  for (int i = 0; i < vars_vec->size(); ++i) {
139  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
140  (*vars_values)[i].string(),
141  set_params_constraint, this->params())) {
142  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
143  exit(1);
144  }
145  }
146  }
147 
148  if (!tessedit_write_params_to_file.empty()) {
149  FILE* params_file = fopen(tessedit_write_params_to_file.string(), "wb");
150  if (params_file != nullptr) {
151  ParamUtils::PrintParams(params_file, this->params());
152  fclose(params_file);
153  } else {
154  tprintf("Failed to open %s for writing params.\n",
156  }
157  }
158 
159  // Determine which ocr engine(s) should be loaded and used for recognition.
160  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
161 
162  // If we are only loading the config file (and so not planning on doing any
163  // recognition) then there's nothing else do here.
165  return true;
166  }
167 
168 // The various OcrEngineMode settings (see publictypes.h) determine which
169 // engine-specific data files need to be loaded.
170 // If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
171 #ifndef ANDROID_BUILD
172 # ifdef DISABLED_LEGACY_ENGINE
174 # else
177 # endif // ndef DISABLED_LEGACY_ENGINE
178  if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
179  lstm_recognizer_ = new LSTMRecognizer;
180  ASSERT_HOST(lstm_recognizer_->Load(
181  this->params(), lstm_use_matrix ? language : nullptr, mgr));
182  } else {
183  tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
185  }
186  }
187 #endif // ndef ANDROID_BUILD
188 
189  // Load the unicharset
191  // Avoid requiring a unicharset when we aren't running base tesseract.
192 #ifndef ANDROID_BUILD
193  unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
194 #endif // ndef ANDROID_BUILD
195  }
196 #ifndef DISABLED_LEGACY_ENGINE
197  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
198  !unicharset.load_from_file(&fp, false)) {
199  return false;
200  }
201 #endif // ndef DISABLED_LEGACY_ENGINE
202  if (unicharset.size() > MAX_NUM_CLASSES) {
203  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
204  return false;
205  }
206  right_to_left_ = unicharset.major_right_to_left();
207 
208  // Setup initial unichar ambigs table and read universal ambigs.
209  UNICHARSET encoder_unicharset;
210  encoder_unicharset.CopyFrom(unicharset);
212  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
213 
214  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
215  unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
218  }
219 #ifndef DISABLED_LEGACY_ENGINE
220  // Init ParamsModel.
221  // Load pass1 and pass2 weights (for now these two sets are the same, but in
222  // the future separate sets of weights can be generated).
224  ++p) {
225  language_model_->getParamsModel().SetPass(
226  static_cast<ParamsModel::PassEnum>(p));
227  if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
228  if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) {
229  return false;
230  }
231  }
232  }
233 #endif // ndef DISABLED_LEGACY_ENGINE
234 
235  return true;
236 }
ParamsVectors * params()
Definition: ccutil.h:65
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:44
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:70
char * tessedit_write_params_to_file
SetParamConstraint
Definition: params.h:35
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:448
Definition: strngs.h:45
int ambigs_debug_level
Definition: ccutil.h:83
const UNICHARSET & GetUnicharset() const
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:180
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:72
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:63
UNICHARSET unicharset
Definition: ccutil.h:71
int size() const
Definition: unicharset.h:341
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:63
bool major_right_to_left() const
Definition: unicharset.cpp:992
bool Load(const ParamsVectors *params, const char *lang, TessdataManager *mgr)
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
STRING lang
Definition: ccutil.h:69
STRING datadir
Definition: ccutil.h:67
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:92
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388
#define ASSERT_HOST(x)
Definition: errcode.h:88
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:49
int size() const
Definition: genericvector.h:70
bool use_ambigs_for_adaption
Definition: ccutil.h:87
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:49
STRING language_data_path_prefix
Definition: ccutil.h:70
int tesseract::Tesseract::init_tesseract_lm ( const char *  arg0,
const char *  textbase,
const char *  language,
TessdataManager mgr 
)

Definition at line 450 of file tessedit.cpp.

451  {
452  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
453  nullptr, 0, nullptr, nullptr, false, mgr))
454  return -1;
456  getDict().Load(lang, mgr);
457  getDict().FinishLoad();
458  return 0;
459 }
bool FinishLoad()
Definition: dict.cpp:360
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:80
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:201
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:219
Dict & getDict() override
STRING lang
Definition: ccutil.h:69
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:193
void tesseract::Tesseract::join_words ( WERD_RES word,
WERD_RES word2,
BlamerBundle orig_bb 
) const

Definition at line 234 of file tfacepp.cpp.

236  {
237  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
238  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
239  // Tack the word2 outputs onto the end of the word outputs.
240  word->chopped_word->blobs += word2->chopped_word->blobs;
241  word->rebuild_word->blobs += word2->rebuild_word->blobs;
242  word2->chopped_word->blobs.clear();
243  word2->rebuild_word->blobs.clear();
244  TPOINT split_pt;
245  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
246  split_pt.y = (prev_box.top() + prev_box.bottom() +
247  blob_box.top() + blob_box.bottom()) / 4;
248  // Move the word2 seams onto the end of the word1 seam_array.
249  // Since the seam list is one element short, an empty seam marking the
250  // end of the last blob in the first word is needed first.
251  word->seam_array.push_back(new SEAM(0.0f, split_pt));
252  word->seam_array += word2->seam_array;
253  word2->seam_array.truncate(0);
254  // Fix widths and gaps.
255  word->blob_widths += word2->blob_widths;
256  word->blob_gaps += word2->blob_gaps;
257  // Fix the ratings matrix.
258  int rat1 = word->ratings->dimension();
259  int rat2 = word2->ratings->dimension();
260  word->ratings->AttachOnCorner(word2->ratings);
261  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
262  word->best_state += word2->best_state;
263  // Append the word choices.
264  *word->raw_choice += *word2->raw_choice;
265 
266  // How many alt choices from each should we try to get?
267  const int kAltsPerPiece = 2;
268  // When do we start throwing away extra alt choices?
269  const int kTooManyAltChoices = 100;
270 
271  // Construct the cartesian product of the best_choices of word(1) and word2.
272  WERD_CHOICE_LIST joined_choices;
273  WERD_CHOICE_IT jc_it(&joined_choices);
274  WERD_CHOICE_IT bc1_it(&word->best_choices);
275  WERD_CHOICE_IT bc2_it(&word2->best_choices);
276  int num_word1_choices = word->best_choices.length();
277  int total_joined_choices = num_word1_choices;
278  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
279  // word2 choices, and put them in the joined_choices list. The 1st word2
280  // choice gets added to the original word1 choices in-place after we have
281  // finished with them.
282  int bc2_index = 1;
283  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
284  if (total_joined_choices >= kTooManyAltChoices &&
285  bc2_index > kAltsPerPiece)
286  break;
287  int bc1_index = 0;
288  for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
289  ++bc1_index, bc1_it.forward()) {
290  if (total_joined_choices >= kTooManyAltChoices &&
291  bc1_index > kAltsPerPiece)
292  break;
293  auto *wc = new WERD_CHOICE(*bc1_it.data());
294  *wc += *bc2_it.data();
295  jc_it.add_after_then_move(wc);
296  ++total_joined_choices;
297  }
298  }
299  // Now that we've filled in as many alternates as we want, paste the best
300  // choice for word2 onto the original word alt_choices.
301  bc1_it.move_to_first();
302  bc2_it.move_to_first();
303  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
304  *bc1_it.data() += *bc2_it.data();
305  }
306  bc1_it.move_to_last();
307  bc1_it.add_list_after(&joined_choices);
308 
309  // Restore the pointer to original blamer bundle and combine blamer
310  // information recorded in the splits.
311  if (orig_bb != nullptr) {
312  orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
314  delete word->blamer_bundle;
315  word->blamer_bundle = orig_bb;
316  }
317  word->SetupBoxWord();
318  word->reject_map.initialise(word->box_word->length());
319  delete word2;
320 }
int16_t top() const
Definition: rect.h:58
Definition: rect.h:34
TWERD * rebuild_word
Definition: pageres.h:259
int16_t x
Definition: blobs.h:73
GenericVector< int > blob_widths
Definition: pageres.h:218
void SetupBoxWord()
Definition: pageres.cpp:853
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
void AttachOnCorner(BandTriMatrix< T > *array2)
Definition: matrix.h:553
TBOX bounding_box() const
Definition: blobs.cpp:472
GenericVector< SEAM * > seam_array
Definition: pageres.h:216
WERD_CHOICE_LIST best_choices
Definition: pageres.h:242
int length() const
Definition: boxword.h:83
REJMAP reject_map
Definition: pageres.h:286
int16_t y
Definition: blobs.h:74
Definition: blobs.h:52
Definition: seam.h:38
void initialise(int16_t length)
Definition: rejctmap.cpp:273
void truncate(int size)
tesseract::BoxWord * box_word
Definition: pageres.h:265
int push_back(T object)
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
MATRIX * ratings
Definition: pageres.h:230
#define ASSERT_HOST(x)
Definition: errcode.h:88
int dimension() const
Definition: matrix.h:536
int16_t left() const
Definition: rect.h:72
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:230
TWERD * chopped_word
Definition: pageres.h:214
bool wordrec_debug_blamer
Definition: wordrec.h:236
GenericVector< int > best_state
Definition: pageres.h:270
GenericVector< int > blob_gaps
Definition: pageres.h:221
WERD_CHOICE * raw_choice
Definition: pageres.h:239
T & back() const
BlamerBundle * blamer_bundle
Definition: pageres.h:245
void tesseract::Tesseract::LSTMRecognizeWord ( const BLOCK block,
ROW row,
WERD_RES word,
PointerVector< WERD_RES > *  words 
)

Definition at line 222 of file linerec.cpp.

223  {
224  TBOX word_box = word->word->bounding_box();
225  // Get the word image - no frills.
228  // In single word mode, use the whole image without any other row/word
229  // interpretation.
230  word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
231  } else {
232  float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
233  if (baseline + row->descenders() < word_box.bottom())
234  word_box.set_bottom(baseline + row->descenders());
235  if (baseline + row->x_height() + row->ascenders() > word_box.top())
236  word_box.set_top(baseline + row->x_height() + row->ascenders());
237  }
238  ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
239  if (im_data == nullptr) return;
240  lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0,
242  word_box, words, lstm_choice_mode);
243  delete im_data;
244  SearchWords(words);
245 }
const float kCertaintyScale
Definition: linerec.cpp:36
int16_t top() const
Definition: rect.h:58
float x_height() const
Definition: ocrrow.h:64
void set_top(int y)
Definition: rect.h:61
Definition: rect.h:34
const float kWorstDictCertainty
Definition: linerec.cpp:38
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:250
int ImageWidth() const
hacks that are Tesseract-specific.
Definition: publictypes.h:179
float descenders() const
Definition: ocrrow.h:85
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0)
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
TBOX bounding_box() const
Definition: werd.cpp:148
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:163
float base_line(float xpos) const
Definition: ocrrow.h:59
int16_t left() const
Definition: rect.h:72
void set_bottom(int y)
Definition: rect.h:68
int ImageHeight() const
WERD * word
Definition: pageres.h:188
const int kImagePadding
Definition: imagedata.h:39
float ascenders() const
Definition: ocrrow.h:82
Treat the image as a single word.
Definition: publictypes.h:174
void tesseract::Tesseract::make_reject_map ( WERD_RES word,
ROW row,
int16_t  pass 
)
void tesseract::Tesseract::match_current_words ( WERD_RES_LIST &  words,
ROW row,
BLOCK block 
)

Definition at line 223 of file fixspace.cpp.

224  {
225  WERD_RES_IT word_it(&words);
226  WERD_RES *word;
227  // Since we are not using PAGE_RES to iterate over words, we need to update
228  // prev_word_best_choice_ before calling classify_word_pass2().
229  prev_word_best_choice_ = nullptr;
230  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
231  word = word_it.data();
232  if ((!word->part_of_combo) && (word->box_word == nullptr)) {
233  WordData word_data(block, row, word);
234  SetupWordPassN(2, &word_data);
235  classify_word_and_language(2, nullptr, &word_data);
236  }
238  }
239 }
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
tesseract::BoxWord * box_word
Definition: pageres.h:265
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool part_of_combo
Definition: pageres.h:334
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1333
void tesseract::Tesseract::match_word_pass_n ( int  pass_n,
WERD_RES word,
ROW row,
BLOCK block 
)

match_word_pass_n

Baseline normalize the word and pass it to Tess.

Definition at line 1644 of file control.cpp.

1645  {
1646  if (word->tess_failed) return;
1647  tess_segment_pass_n(pass_n, word);
1648 
1649  if (!word->tess_failed) {
1650  if (!word->word->flag (W_REP_CHAR)) {
1651  word->fix_quotes();
1653  word->fix_hyphens();
1654  /* Don't trust fix_quotes! - though I think I've fixed the bug */
1655  if (word->best_choice->length() != word->box_word->length()) {
1656  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1657  " #Blobs=%d\n",
1658  word->best_choice->debug_string().string(),
1659  word->best_choice->length(),
1660  word->box_word->length());
1661 
1662  }
1663  word->tess_accepted = tess_acceptable_word(word);
1664 
1665  // Also sets word->done flag
1666  make_reject_map(word, row, pass_n);
1667  }
1668  }
1669  set_word_fonts(word);
1670 
1671  ASSERT_HOST(word->raw_choice != nullptr);
1672 }
bool tess_failed
Definition: pageres.h:287
int length() const
Definition: ratngs.h:303
bool tess_accepted
Definition: pageres.h:295
repeated character
Definition: werd.h:38
int length() const
Definition: boxword.h:83
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:62
void fix_quotes()
Definition: pageres.cpp:1022
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1976
void fix_hyphens()
Definition: pageres.cpp:1051
tesseract::BoxWord * box_word
Definition: pageres.h:265
WERD_CHOICE * best_choice
Definition: pageres.h:234
#define ASSERT_HOST(x)
Definition: errcode.h:88
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
const STRING debug_string() const
Definition: ratngs.h:505
WERD_CHOICE * raw_choice
Definition: pageres.h:239
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
WERD * word
Definition: pageres.h:188
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:32
void tesseract::Tesseract::MaximallyChopWord ( const GenericVector< TBOX > &  boxes,
BLOCK block,
ROW row,
WERD_RES word_res 
)

Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.

Definition at line 243 of file applybox.cpp.

245  {
246  if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
247  tessedit_ocr_engine_mode, nullptr,
251  row, block)) {
252  word_res->CloneChoppedToRebuild();
253  return;
254  }
255  if (chop_debug) {
256  tprintf("Maximally chopping word at:");
257  word_res->word->bounding_box().print();
258  }
259  GenericVector<BLOB_CHOICE*> blob_choices;
260  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
261  auto rating = static_cast<float>(INT8_MAX);
262  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
263  // The rating and certainty are not quite arbitrary. Since
264  // select_blob_to_chop uses the worst certainty to choose, they all have
265  // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
266  // in here, and then divide by e each time they are chopped, which
267  // should guarantee a set of unequal values for the whole tree of blobs
268  // produced, however much chopping is required. The chops are thus only
269  // limited by the ability of the chopper to find suitable chop points,
270  // and not by the value of the certainties.
271  auto* choice =
272  new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
273  blob_choices.push_back(choice);
274  rating -= 0.125f;
275  }
276  const double e = exp(1.0); // The base of natural logs.
277  int blob_number;
278  int right_chop_index = 0;
280  // We only chop if the language is not fixed pitch like CJK.
281  SEAM* seam = nullptr;
282  while ((seam = chop_one_blob(boxes, blob_choices, word_res,
283  &blob_number)) != nullptr) {
284  word_res->InsertSeam(blob_number, seam);
285  BLOB_CHOICE* left_choice = blob_choices[blob_number];
286  rating = left_choice->rating() / e;
287  left_choice->set_rating(rating);
288  left_choice->set_certainty(-rating);
289  // combine confidence w/ serial #
290  auto* right_choice = new BLOB_CHOICE(++right_chop_index,
291  rating - 0.125f, -rating, -1,
292  0.0f, 0.0f, 0.0f, BCC_FAKE);
293  blob_choices.insert(right_choice, blob_number + 1);
294  }
295  }
296  word_res->CloneChoppedToRebuild();
297  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
298 }
void print() const
Definition: rect.h:278
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:306
float rating() const
Definition: ratngs.h:80
void set_rating(float newrat)
Definition: ratngs.h:148
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:230
UNICHARSET unicharset
Definition: ccutil.h:71
Pix * BestPix() const
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:881
Definition: seam.h:38
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool empty() const
Definition: genericvector.h:89
int push_back(T object)
void insert(const T &t, int index)
TBOX bounding_box() const
Definition: werd.cpp:148
int NumBlobs() const
Definition: blobs.h:427
#define ASSERT_HOST(x)
Definition: errcode.h:88
void CloneChoppedToRebuild()
Definition: pageres.cpp:839
TWERD * chopped_word
Definition: pageres.h:214
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:371
int size() const
Definition: genericvector.h:70
WERD * word
Definition: pageres.h:188
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:422
bool classify_bln_numeric_mode
Definition: classify.h:540
void set_certainty(float newrat)
Definition: ratngs.h:151
Pix** tesseract::Tesseract::mutable_pix_binary ( )
inline

Definition at line 198 of file tesseractclass.h.

198  {
199  pixDestroy(&pix_binary_);
200  return &pix_binary_;
201  }
Textord* tesseract::Tesseract::mutable_textord ( )
inline

Definition at line 271 of file tesseractclass.h.

271  {
272  return &textord_;
273  }
void tesseract::Tesseract::nn_match_word ( WERD_RES word,
ROW row 
)
void tesseract::Tesseract::nn_recover_rejects ( WERD_RES word,
ROW row 
)
bool tesseract::Tesseract::noise_outlines ( TWERD word)

Definition at line 980 of file docqual.cpp.

980  {
981  TBOX box; // BB of outline
982  int16_t outline_count = 0;
983  int16_t small_outline_count = 0;
984  int16_t max_dimension;
985  float small_limit = kBlnXHeight * crunch_small_outlines_size;
986 
987  for (int b = 0; b < word->NumBlobs(); ++b) {
988  TBLOB* blob = word->blobs[b];
989  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
990  outline_count++;
991  box = ol->bounding_box();
992  if (box.height() > box.width())
993  max_dimension = box.height();
994  else
995  max_dimension = box.width();
996  if (max_dimension < small_limit)
997  small_outline_count++;
998  }
999  }
1000  return small_outline_count >= outline_count;
1001 }
const int kBlnXHeight
Definition: normalis.h:24
TESSLINE * next
Definition: blobs.h:260
Definition: rect.h:34
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
TESSLINE * outlines
Definition: blobs.h:379
Definition: blobs.h:263
int16_t height() const
Definition: rect.h:108
int16_t width() const
Definition: rect.h:115
int NumBlobs() const
Definition: blobs.h:427
bool tesseract::Tesseract::non_0_digit ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 790 of file reject.cpp.

790  {
791  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
792 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
bool tesseract::Tesseract::non_O_upper ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 786 of file reject.cpp.

786  {
787  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
788 }
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
int tesseract::Tesseract::num_sub_langs ( ) const
inline

Definition at line 278 of file tesseractclass.h.

278  {
279  return sub_langs_.size();
280  }
bool tesseract::Tesseract::one_ell_conflict ( WERD_RES word_res,
bool  update_map 
)

Definition at line 293 of file reject.cpp.

293  {
294  const char *word;
295  const char *lengths;
296  int16_t word_len; //its length
297  int16_t first_alphanum_index_;
298  int16_t first_alphanum_offset_;
299  int16_t i;
300  int16_t offset;
301  bool non_conflict_set_char; //non conf set a/n?
302  bool conflict = false;
303  bool allow_1s;
304  ACCEPTABLE_WERD_TYPE word_type;
305  bool dict_perm_type;
306  bool dict_word_ok;
307  int dict_word_type;
308 
309  word = word_res->best_choice->unichar_string().string ();
310  lengths = word_res->best_choice->unichar_lengths().string();
311  word_len = strlen(lengths);
312  /*
313  If there are no occurrences of the conflict set characters then the word
314  is OK.
315  */
316  if (strpbrk(word, conflict_set_I_l_1.string ()) == nullptr)
317  return false;
318 
319  /*
320  There is a conflict if there are NO other (confirmed) alphanumerics apart
321  from those in the conflict set.
322  */
323 
324  for (i = 0, offset = 0, non_conflict_set_char = false;
325  (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
326  non_conflict_set_char =
327  (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
328  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
329  !STRING (conflict_set_I_l_1).contains (word[offset]);
330  if (!non_conflict_set_char) {
331  if (update_map)
332  reject_I_1_L(word_res);
333  return true;
334  }
335 
336  /*
337  If the word is accepted by a dawg permuter, and the first alpha character
338  is "I" or "l", check to see if the alternative is also a dawg word. If it
339  is, then there is a potential error otherwise the word is ok.
340  */
341 
342  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
343  (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
345  (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
346  (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
347  dict_word_type = dict_word(*(word_res->best_choice));
348  dict_word_ok = (dict_word_type > 0) &&
349  (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
350 
351  if ((rej_1Il_use_dict_word && dict_word_ok) ||
352  (rej_1Il_trust_permuter_type && dict_perm_type) ||
353  (dict_perm_type && dict_word_ok)) {
354  first_alphanum_index_ = first_alphanum_index (word, lengths);
355  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
356  if (lengths[first_alphanum_index_] == 1 &&
357  word[first_alphanum_offset_] == 'I') {
358  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
359  if (safe_dict_word(word_res) > 0) {
360  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
361  if (update_map)
362  word_res->reject_map[first_alphanum_index_].
363  setrej_1Il_conflict();
364  return true;
365  }
366  else {
367  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
368  return false;
369  }
370  }
371 
372  if (lengths[first_alphanum_index_] == 1 &&
373  word[first_alphanum_offset_] == 'l') {
374  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
375  if (safe_dict_word(word_res) > 0) {
376  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
377  if (update_map)
378  word_res->reject_map[first_alphanum_index_].
379  setrej_1Il_conflict();
380  return true;
381  }
382  else {
383  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
384  return false;
385  }
386  }
387  return false;
388  }
389 
390  /*
391  NEW 1Il code. The old code relied on permuter types too much. In fact,
392  tess will use TOP_CHOICE permute for good things like "palette".
393  In this code the string is examined independently to see if it looks like
394  a well formed word.
395  */
396 
397  /*
398  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
399  dictionary word.
400  */
401  first_alphanum_index_ = first_alphanum_index (word, lengths);
402  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
403  if (lengths[first_alphanum_index_] == 1 &&
404  word[first_alphanum_offset_] == 'l') {
405  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
406  if (safe_dict_word(word_res) > 0)
407  return false;
408  else
409  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
410  }
411  else if (lengths[first_alphanum_index_] == 1 &&
412  word[first_alphanum_offset_] == 'I') {
413  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
414  if (safe_dict_word(word_res) > 0)
415  return false;
416  else
417  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
418  }
419  /*
420  For strings containing digits:
421  If there are no alphas OR the numeric permuter liked the word,
422  reject any non 1 conflict chs
423  Else reject all conflict chs
424  */
425  if (word_contains_non_1_digit (word, lengths)) {
426  allow_1s = (alpha_count (word, lengths) == 0) ||
427  (word_res->best_choice->permuter () == NUMBER_PERM);
428 
429  int16_t offset;
430  conflict = false;
431  for (i = 0, offset = 0; word[offset] != '\0';
432  offset += word_res->best_choice->unichar_lengths()[i++]) {
433  if ((!allow_1s || (word[offset] != '1')) &&
434  STRING (conflict_set_I_l_1).contains (word[offset])) {
435  if (update_map)
436  word_res->reject_map[i].setrej_1Il_conflict ();
437  conflict = true;
438  }
439  }
440  return conflict;
441  }
442  /*
443  For anything else. See if it conforms to an acceptable word type. If so,
444  treat accordingly.
445  */
446  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
447  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
448  first_alphanum_index_ = first_alphanum_index (word, lengths);
449  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
450  if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
451  if (update_map)
452  word_res->reject_map[first_alphanum_index_].
453  setrej_1Il_conflict ();
454  return true;
455  }
456  else
457  return false;
458  }
459  else if (word_type == AC_UPPER_CASE) {
460  return false;
461  }
462  else {
463  if (update_map)
464  reject_I_1_L(word_res);
465  return true;
466  }
467 }
Definition: strngs.h:45
const STRING & unichar_string() const
Definition: ratngs.h:541
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:470
REJMAP reject_map
Definition: pageres.h:286
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
const UNICHARSET * uch_set
Definition: pageres.h:205
ALL but initial lc.
Definition: control.h:33
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1759
const char * string() const
Definition: strngs.cpp:194
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:510
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:496
bool contains(char c) const
Definition: strngs.cpp:185
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:483
WERD_CHOICE * best_choice
Definition: pageres.h:234
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:194
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:89
ALL upper case.
Definition: control.h:32
const STRING & unichar_lengths() const
Definition: ratngs.h:548
uint8_t permuter() const
Definition: ratngs.h:346
ALL lower case.
Definition: control.h:31
void tesseract::Tesseract::output_pass ( PAGE_RES_IT page_res_it,
const TBOX target_word_box 
)

Definition at line 36 of file output.cpp.

38  {
39  BLOCK_RES *block_of_last_word;
40  bool force_eol; //During output
41  BLOCK *nextblock; //block of next word
42  WERD *nextword; //next word
43 
44  page_res_it.restart_page ();
45  block_of_last_word = nullptr;
46  while (page_res_it.word () != nullptr) {
47  check_debug_pt (page_res_it.word (), 120);
48 
49  if (target_word_box) {
50  TBOX current_word_box = page_res_it.word()->word->bounding_box();
51  FCOORD center_pt(
52  (current_word_box.right() + current_word_box.left()) / 2,
53  (current_word_box.bottom() + current_word_box.top()) / 2);
54  if (!target_word_box->contains(center_pt)) {
55  page_res_it.forward();
56  continue;
57  }
58  }
60  block_of_last_word != page_res_it.block ()) {
61  block_of_last_word = page_res_it.block ();
62  }
63 
64  force_eol = (tessedit_write_block_separators &&
65  (page_res_it.block () != page_res_it.next_block ())) ||
66  (page_res_it.next_word () == nullptr);
67 
68  if (page_res_it.next_word () != nullptr)
69  nextword = page_res_it.next_word ()->word;
70  else
71  nextword = nullptr;
72  if (page_res_it.next_block () != nullptr)
73  nextblock = page_res_it.next_block ()->block;
74  else
75  nextblock = nullptr;
76  //regardless of tilde crunching
77  write_results(page_res_it,
78  determine_newline_type(page_res_it.word()->word,
79  page_res_it.block()->block,
80  nextword, nextblock), force_eol);
81  page_res_it.forward();
82  }
83 }
Definition: werd.h:56
BLOCK * block
Definition: pageres.h:116
int16_t top() const
Definition: rect.h:58
WERD_RES * restart_page()
Definition: pageres.h:702
Definition: rect.h:34
Definition: points.h:188
BLOCK_RES * block() const
Definition: pageres.h:761
bool contains(const FCOORD pt) const
Definition: rect.h:333
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:213
BLOCK_RES * next_block() const
Definition: pageres.h:770
WERD_RES * next_word() const
Definition: pageres.h:764
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1863
WERD_RES * word() const
Definition: pageres.h:755
WERD_RES * forward()
Definition: pageres.h:735
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
TBOX bounding_box() const
Definition: werd.cpp:148
int16_t left() const
Definition: rect.h:72
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
Definition: output.cpp:98
WERD * word
Definition: pageres.h:188
Definition: ocrblock.h:29
void tesseract::Tesseract::ParseLanguageString ( const char *  lang_str,
GenericVector< STRING > *  to_load,
GenericVector< STRING > *  not_to_load 
)

Definition at line 252 of file tessedit.cpp.

254  {
255  STRING remains(lang_str);
256  while (remains.length() > 0) {
257  // Find the start of the lang code and which vector to add to.
258  const char* start = remains.string();
259  while (*start == '+') ++start;
260  GenericVector<STRING>* target = to_load;
261  if (*start == '~') {
262  target = not_to_load;
263  ++start;
264  }
265  // Find the index of the end of the lang code in string start.
266  int end = strlen(start);
267  const char* plus = strchr(start, '+');
268  if (plus != nullptr && plus - start < end) end = plus - start;
269  STRING lang_code(start);
270  lang_code.truncate_at(end);
271  STRING next(start + end);
272  remains = next;
273  // Check whether lang_code is already in the target vector and add.
274  if (!IsStrInList(lang_code, *target)) {
275  target->push_back(lang_code);
276  }
277  }
278 }
Definition: strngs.h:45
int push_back(T object)
void tesseract::Tesseract::pgeditor_main ( int  width,
int  height,
PAGE_RES *  page_res 
)

pgeditor_main()

Top-level editor operation: set up a new window and a corresponding event handler.

Definition at line 380 of file pgedit.cpp.

380  {
381  current_page_res = page_res;
382  if (current_page_res->block_res_list.empty())
383  return;
384 
385  recog_done = false;
386  stillRunning = true;
387 
388  build_image_window(width, height);
389  word_display_mode.turn_on_bit(DF_EDGE_STEP);
391 #ifndef GRAPHICS_DISABLED
392  pe = new ParamsEditor(this, image_win);
393 #endif
394  PGEventHandler pgEventHandler(this);
395 
396  image_win->AddEventHandler(&pgEventHandler);
397  image_win->AddMessageBox();
398 
399  SVMenuNode* svMenuRoot = build_menu_new();
400 
401  svMenuRoot->BuildMenu(image_win);
402  image_win->SetVisible(true);
403 
404  image_win->AwaitEvent(SVET_DESTROY);
405  image_win->AddEventHandler(nullptr);
406 }
void BuildMenu(ScrollView *sv, bool menu_bar=true)
Definition: svmnode.cpp:120
void SetVisible(bool visible)
Definition: scrollview.cpp:549
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:300
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:926
void AddMessageBox()
Definition: scrollview.cpp:578
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:443
Edge steps.
Definition: werd.h:49
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:351
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
Definition: scrollview.cpp:414
void turn_on_bit(uint8_t bit_num)
Definition: bits16.h:32
BLOCK_RES_LIST block_res_list
Definition: pageres.h:80
Pix* tesseract::Tesseract::pix_binary ( ) const
inline

Definition at line 202 of file tesseractclass.h.

202  {
203  return pix_binary_;
204  }
Pix* tesseract::Tesseract::pix_grey ( ) const
inline

Definition at line 205 of file tesseractclass.h.

205  {
206  return pix_grey_;
207  }
Pix* tesseract::Tesseract::pix_original ( ) const
inline

Definition at line 212 of file tesseractclass.h.

212  {
213  return pix_original_;
214  }
bool tesseract::Tesseract::potential_word_crunch ( WERD_RES *  word,
GARBAGE_LEVEL  garbage_level,
bool  ok_dict_word 
)

Definition at line 541 of file docqual.cpp.

543  {
544  float rating_per_ch;
545  int adjusted_len;
546  const char *str = word->best_choice->unichar_string().string();
547  const char *lengths = word->best_choice->unichar_lengths().string();
548  bool word_crunchable;
549  int poor_indicator_count = 0;
550 
551  word_crunchable = !crunch_leave_accept_strings ||
552  word->reject_map.length() < 3 ||
554  str, lengths) == AC_UNACCEPTABLE &&
555  !ok_dict_word);
556 
557  adjusted_len = word->reject_map.length();
558  if (adjusted_len > 10)
559  adjusted_len = 10;
560  rating_per_ch = word->best_choice->rating() / adjusted_len;
561 
562  if (rating_per_ch > crunch_pot_poor_rate) {
563  if (crunch_debug > 2) {
564  tprintf("Potential poor rating on \"%s\"\n",
565  word->best_choice->unichar_string().string());
566  }
567  poor_indicator_count++;
568  }
569 
570  if (word_crunchable &&
572  if (crunch_debug > 2) {
573  tprintf("Potential poor cert on \"%s\"\n",
574  word->best_choice->unichar_string().string());
575  }
576  poor_indicator_count++;
577  }
578 
579  if (garbage_level != G_OK) {
580  if (crunch_debug > 2) {
581  tprintf("Potential garbage on \"%s\"\n",
582  word->best_choice->unichar_string().string());
583  }
584  poor_indicator_count++;
585  }
586  return poor_indicator_count >= crunch_pot_indicators;
587 }
Unacceptable word.
Definition: control.h:30
int32_t length() const
Definition: rejctmap.h:223
const STRING & unichar_string() const
Definition: ratngs.h:541
REJMAP reject_map
Definition: pageres.h:286
const UNICHARSET * uch_set
Definition: pageres.h:205
float rating() const
Definition: ratngs.h:327
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1759
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
Definition: docqual.h:32
float certainty() const
Definition: ratngs.h:330
WERD_CHOICE * best_choice
Definition: pageres.h:234
const STRING & unichar_lengths() const
Definition: ratngs.h:548
void tesseract::Tesseract::PreenXHeights ( BLOCK_LIST *  block_list)

Any row xheight that is significantly different from the median is set to the median.

Definition at line 181 of file applybox.cpp.

181  {
182  const double median_xheight = MedianXHeight(block_list);
183  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
184  // Strip all fuzzy space markers to simplify the PAGE_RES.
185  BLOCK_IT b_it(block_list);
186  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
187  BLOCK* block = b_it.data();
188  ROW_IT r_it(block->row_list());
189  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
190  ROW* row = r_it.data();
191  const double diff = fabs(row->x_height() - median_xheight);
192  if (diff > max_deviation) {
193  if (applybox_debug) {
194  tprintf("row xheight=%g, but median xheight = %g\n",
195  row->x_height(), median_xheight);
196  }
197  row->set_x_height(static_cast<float>(median_xheight));
198  }
199  }
200  }
201 }
float x_height() const
Definition: ocrrow.h:64
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:34
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void set_x_height(float new_xheight)
Definition: ocrrow.h:67
Definition: ocrrow.h:36
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:117
Definition: ocrblock.h:29
void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 644 of file tesseractclass.cpp.

644  {
646  // Find the max splitter strategy over all langs.
647  auto max_pageseg_strategy =
649  static_cast<int32_t>(pageseg_devanagari_split_strategy));
650  for (int i = 0; i < sub_langs_.size(); ++i) {
651  auto pageseg_strategy =
653  static_cast<int32_t>(sub_langs_[i]->pageseg_devanagari_split_strategy));
654  if (pageseg_strategy > max_pageseg_strategy)
655  max_pageseg_strategy = pageseg_strategy;
656  pixDestroy(&sub_langs_[i]->pix_binary_);
657  sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
658  }
659  // Perform shiro-rekha (top-line) splitting and replace the current image by
660  // the newly split image.
661  splitter_.set_orig_pix(pix_binary());
662  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
663  if (splitter_.Split(true, &pixa_debug_)) {
664  ASSERT_HOST(splitter_.splitted_image());
665  pixDestroy(&pix_binary_);
666  pix_binary_ = pixClone(splitter_.splitted_image());
667  }
668 }
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
void set_pageseg_split_strategy(SplitStrategy strategy)
#define ASSERT_HOST(x)
Definition: errcode.h:88
Pix * pix_binary() const
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:95
void tesseract::Tesseract::PrepareForTessOCR ( BLOCK_LIST *  block_list,
Tesseract *  osd_tess,
OSResults *  osr 
)

Definition at line 675 of file tesseractclass.cpp.

676  {
677  // Find the max splitter strategy over all langs.
678  auto max_ocr_strategy =
680  static_cast<int32_t>(ocr_devanagari_split_strategy));
681  for (int i = 0; i < sub_langs_.size(); ++i) {
682  auto ocr_strategy =
684  static_cast<int32_t>(sub_langs_[i]->ocr_devanagari_split_strategy));
685  if (ocr_strategy > max_ocr_strategy)
686  max_ocr_strategy = ocr_strategy;
687  }
688  // Utilize the segmentation information available.
689  splitter_.set_segmentation_block_list(block_list);
690  splitter_.set_ocr_split_strategy(max_ocr_strategy);
691  // Run the splitter for OCR
692  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
693  // Restore pix_binary to the binarized original pix for future reference.
694  ASSERT_HOST(splitter_.orig_pix());
695  pixDestroy(&pix_binary_);
696  pix_binary_ = pixClone(splitter_.orig_pix());
697  // If the pageseg and ocr strategies are different, refresh the block list
698  // (from the last SegmentImage call) with blobs from the real image to be used
699  // for OCR.
700  if (splitter_.HasDifferentSplitStrategies()) {
701  BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_),
702  pixGetHeight(pix_binary_));
703  Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
704  splitter_.orig_pix();
705  extract_edges(pix_for_ocr, &block);
706  splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
707  }
708  // The splitter isn't needed any more after this, so save memory by clearing.
709  splitter_.Clear();
710 }
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:330
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
void set_segmentation_block_list(BLOCK_LIST *block_list)
#define ASSERT_HOST(x)
Definition: errcode.h:88
void set_ocr_split_strategy(SplitStrategy strategy)
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
Definition: ocrblock.h:29
void tesseract::Tesseract::PrerecAllWordsPar ( const GenericVector< WordData > &  words)

Definition at line 39 of file par_control.cpp.

39  {
40  // Prepare all the blobs.
42  for (int w = 0; w < words.size(); ++w) {
43  if (words[w].word->ratings != nullptr &&
44  words[w].word->ratings->get(0, 0) == nullptr) {
45  for (int s = 0; s < words[w].lang_words.size(); ++s) {
46  Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
47  const WERD_RES& word = *words[w].lang_words[s];
48  for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
49  blobs.push_back(BlobData(b, sub, word));
50  }
51  }
52  }
53  }
54  // Pre-classify all the blobs.
55  if (tessedit_parallelize > 1) {
56 #ifdef _OPENMP
57 #pragma omp parallel for num_threads(10)
58 #endif // _OPENMP
59  for (int b = 0; b < blobs.size(); ++b) {
60  *blobs[b].choices =
61  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, nullptr);
62  }
63  } else {
64  // TODO(AMD) parallelize this.
65  for (int b = 0; b < blobs.size(); ++b) {
66  *blobs[b].choices =
67  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, nullptr);
68  }
69  }
70 }
T & get(int index) const
Definition: callcpp.h:30
int push_back(T object)
int NumBlobs() const
Definition: blobs.h:427
TWERD * chopped_word
Definition: pageres.h:214
int size() const
Definition: genericvector.h:70
bool tesseract::Tesseract::process_cmd_win_event ( int32_t  cmd_event,
char *  new_value 
)

Definition at line 417 of file pgedit.cpp.

420  {
421  char msg[160];
422  bool exit = false;
423 
424  color_mode = CM_RAINBOW;
425 
426  // Run recognition on the full page if needed.
427  switch (cmd_event) {
428  case BLAMER_CMD_EVENT:
432  case SHOW_BOLD_CMD_EVENT:
438  if (!recog_done) {
439  recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
440  recog_done = true;
441  }
442  break;
443  default:
444  break;
445  }
446 
447  char* parameter;
448 
449  switch (cmd_event) {
450  case NULL_CMD_EVENT:
451  break;
452 
454  case DUMP_WERD_CMD_EVENT:
457  case RECOG_WERDS:
458  case RECOG_PSEUDO:
459  case SHOW_BLOB_FEATURES:
460  mode =static_cast<CMD_EVENTS>(cmd_event);
461  break;
463  mode = DEBUG_WERD_CMD_EVENT;
464  parameter = image_win->ShowInputDialog("Config File Name");
465  word_config_ = parameter;
466  delete[] parameter;
467  break;
469  if (new_value[0] == 'T')
470  word_display_mode.turn_on_bit(DF_BOX);
471  else
472  word_display_mode.turn_off_bit(DF_BOX);
473  mode = CHANGE_DISP_CMD_EVENT;
474  break;
475  case BLAMER_CMD_EVENT:
476  if (new_value[0] == 'T')
477  word_display_mode.turn_on_bit(DF_BLAMER);
478  else
479  word_display_mode.turn_off_bit(DF_BLAMER);
481  mode = CHANGE_DISP_CMD_EVENT;
482  break;
484  if (new_value[0] == 'T')
485  word_display_mode.turn_on_bit(DF_TEXT);
486  else
487  word_display_mode.turn_off_bit(DF_TEXT);
488  mode = CHANGE_DISP_CMD_EVENT;
489  break;
490  case POLYGONAL_CMD_EVENT:
491  if (new_value[0] == 'T')
492  word_display_mode.turn_on_bit(DF_POLYGONAL);
493  else
494  word_display_mode.turn_off_bit(DF_POLYGONAL);
495  mode = CHANGE_DISP_CMD_EVENT;
496  break;
497  case BL_NORM_CMD_EVENT:
498  if (new_value[0] == 'T')
499  word_display_mode.turn_on_bit(DF_BN_POLYGONAL);
500  else
501  word_display_mode.turn_off_bit(DF_BN_POLYGONAL);
502  mode = CHANGE_DISP_CMD_EVENT;
503  break;
504  case BITMAP_CMD_EVENT:
505  if (new_value[0] == 'T')
506  word_display_mode.turn_on_bit(DF_EDGE_STEP);
507  else
508  word_display_mode.turn_off_bit(DF_EDGE_STEP);
509  mode = CHANGE_DISP_CMD_EVENT;
510  break;
513  break;
514  case IMAGE_CMD_EVENT:
515  display_image =(new_value[0] == 'T');
517  break;
518  case BLOCKS_CMD_EVENT:
519  display_blocks =(new_value[0] == 'T');
521  break;
522  case BASELINES_CMD_EVENT:
523  display_baselines =(new_value[0] == 'T');
525  break;
527  color_mode = CM_SUBSCRIPT;
529  break;
531  color_mode = CM_SUPERSCRIPT;
533  break;
535  color_mode = CM_ITALIC;
537  break;
538  case SHOW_BOLD_CMD_EVENT:
539  color_mode = CM_BOLD;
541  break;
543  color_mode = CM_UNDERLINE;
545  break;
547  color_mode = CM_FIXEDPITCH;
549  break;
551  color_mode = CM_SERIF;
553  break;
555  color_mode = CM_SMALLCAPS;
557  break;
559  color_mode = CM_DROPCAPS;
561  break;
562  case REFRESH_CMD_EVENT:
564  break;
565  case QUIT_CMD_EVENT:
566  exit = true;
568  break;
569 
570  default:
571  snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)",
572  cmd_event, new_value);
573  image_win->AddMessage(msg);
574  break;
575  }
576  return exit;
577 }
bool word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:742
static void Exit()
Definition: scrollview.cpp:583
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:303
Correct ascii.
Definition: werd.h:47
char * ShowInputDialog(const char *msg)
Definition: scrollview.cpp:733
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:926
Blamer information.
Definition: werd.h:51
Edge steps.
Definition: werd.h:49
void turn_off_bit(uint8_t bit_num)
Definition: bits16.h:37
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:351
CMD_EVENTS
Definition: pgedit.cpp:46
Polyg approx.
Definition: werd.h:48
void turn_on_bit(uint8_t bit_num)
Definition: bits16.h:32
Bounding box.
Definition: werd.h:46
void AddMessage(const char *format,...)
Definition: scrollview.cpp:561
BL normalised polyapx.
Definition: werd.h:50
void tesseract::Tesseract::process_image_event ( const SVEvent event)

process_image_event()

User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.

Definition at line 589 of file pgedit.cpp.

590  {
591  // The following variable should remain static, since it is used by
592  // debug editor, which uses a single Tesseract instance.
593  static ICOORD down;
594  ICOORD up;
595  TBOX selection_box;
596  char msg[80];
597 
598  switch(event.type) {
599 
600  case SVET_SELECTION:
601  if (event.type == SVET_SELECTION) {
602  down.set_x(event.x + event.x_size);
603  down.set_y(event.y + event.y_size);
604  if (mode == SHOW_POINT_CMD_EVENT)
605  show_point(current_page_res, event.x, event.y);
606  }
607 
608  up.set_x(event.x);
609  up.set_y(event.y);
610 
611  selection_box = TBOX(down, up);
612 
613  switch(mode) {
616  current_page_res,
617  selection_box,
619  break;
620  case DUMP_WERD_CMD_EVENT:
621  process_selected_words(current_page_res,
622  selection_box,
624  break;
626  process_selected_words(current_page_res,
627  selection_box,
629  break;
631  debug_word(current_page_res, selection_box);
632  break;
634  break; // ignore up event
635 
636  case RECOG_WERDS:
637  #ifndef DISABLED_LEGACY_ENGINE
638  image_win->AddMessage("Recogging selected words");
639  this->process_selected_words(current_page_res,
640  selection_box,
642  #endif // ndef DISABLED_LEGACY_ENGINE
643  break;
644  case RECOG_PSEUDO:
645  image_win->AddMessage("Recogging selected blobs");
646  recog_pseudo_word(current_page_res, selection_box);
647  break;
648  case SHOW_BLOB_FEATURES:
649  blob_feature_display(current_page_res, selection_box);
650  break;
651 
652  default:
653  sprintf(msg, "Mode %d not yet implemented", mode);
654  image_win->AddMessage(msg);
655  break;
656  }
657  default:
658  break;
659  }
660 }
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:62
Definition: rect.h:34
void set_x(int16_t xin)
rewrite function
Definition: points.h:61
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
Definition: pgedit.cpp:698
int x_size
Definition: scrollview.h:68
integer coordinate
Definition: points.h:31
void set_y(int16_t yin)
rewrite function
Definition: points.h:65
int x
Definition: scrollview.h:66
bool word_dumper(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:902
int y
Definition: scrollview.h:67
bool word_bln_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:710
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
Definition: pagewalk.cpp:30
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:667
int y_size
Definition: scrollview.h:69
void AddMessage(const char *format,...)
Definition: scrollview.cpp:561
SVEventType type
Definition: scrollview.h:64
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:941
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:77
void tesseract::Tesseract::process_selected_words ( PAGE_RES *  page_res,
TBOX &  selection_box,
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_processor 
)

Definition at line 30 of file pagewalk.cpp.

33  {
34  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr;
35  page_res_it.forward()) {
36  WERD* word = page_res_it.word()->word;
37  if (word->bounding_box().overlap(selection_box)) {
38  if (!(this->*word_processor)(&page_res_it))
39  return;
40  }
41  }
42 }
Definition: werd.h:56
bool overlap(const TBOX &box) const
Definition: rect.h:355
WERD_RES * word() const
Definition: pageres.h:755
TBOX bounding_box() const
Definition: werd.cpp:148
bool tesseract::Tesseract::ProcessTargetWord ( const TBOX word_box,
const TBOX target_word_box,
const char *  word_config,
int  pass 
)

Definition at line 120 of file control.cpp.

123  {
124  if (word_config != nullptr) {
125  if (word_box.major_overlap(target_word_box)) {
126  if (backup_config_file_ == nullptr) {
127  backup_config_file_ = kBackUpConfigFile;
128  FILE* config_fp = fopen(backup_config_file_, "wb");
129  if (config_fp == nullptr) {
130  tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
131  } else {
132  ParamUtils::PrintParams(config_fp, params());
133  fclose(config_fp);
134  }
135  ParamUtils::ReadParamsFile(word_config,
137  params());
138  }
139  } else {
140  if (backup_config_file_ != nullptr) {
141  ParamUtils::ReadParamsFile(backup_config_file_,
143  params());
144  backup_config_file_ = nullptr;
145  }
146  }
147  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
148  return false;
149  }
150  return true;
151 }
ParamsVectors * params()
Definition: ccutil.h:65
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:180
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
const char *const kBackUpConfigFile
Definition: control.cpp:48
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:42
void tesseract::Tesseract::quality_based_rejection ( PAGE_RES_IT page_res_it,
bool  good_quality_doc 
)

Definition at line 138 of file docqual.cpp.

139  {
140  if ((tessedit_good_quality_unrej && good_quality_doc))
141  unrej_good_quality_words(page_res_it);
142  doc_and_block_rejection(page_res_it, good_quality_doc);
143  if (unlv_tilde_crunching) {
144  tilde_crunch(page_res_it);
145  tilde_delete(page_res_it);
146  }
147 }
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:589
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:160
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:417
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:232
void tesseract::Tesseract::read_config_file ( const char *  filename,
SetParamConstraint  constraint 
)

Definition at line 49 of file tessedit.cpp.

50  {
51  STRING path = datadir;
52  path += "configs/";
53  path += filename;
54  FILE* fp;
55  if ((fp = fopen(path.string(), "rb")) != nullptr) {
56  fclose(fp);
57  } else {
58  path = datadir;
59  path += "tessconfigs/";
60  path += filename;
61  if ((fp = fopen(path.string(), "rb")) != nullptr) {
62  fclose(fp);
63  } else {
64  path = filename;
65  }
66  }
67  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
68 }
ParamsVectors * params()
Definition: ccutil.h:65
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
STRING datadir
Definition: ccutil.h:67
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:42
bool tesseract::Tesseract::ReassignDiacritics ( int  pass,
PAGE_RES_IT *  pr_it,
bool *  make_next_word_fuzzy 
)

Definition at line 944 of file control.cpp.

945  {
946 #ifdef DISABLED_LEGACY_ENGINE
947  return false;
948 #else
949  *make_next_word_fuzzy = false;
950  WERD* real_word = pr_it->word()->word;
951  if (real_word->rej_cblob_list()->empty() ||
952  real_word->cblob_list()->empty() ||
953  real_word->rej_cblob_list()->length() > noise_maxperword)
954  return false;
955  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
956  // Get the noise outlines into a vector with matching bool map.
957  GenericVector<C_OUTLINE*> outlines;
958  real_word->GetNoiseOutlines(&outlines);
959  GenericVector<bool> word_wanted;
960  GenericVector<bool> overlapped_any_blob;
961  GenericVector<C_BLOB*> target_blobs;
962  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
963  &word_wanted, &overlapped_any_blob,
964  &target_blobs);
965  // Filter the outlines that overlapped any blob and put them into the word
966  // now. This simplifies the remaining task and also makes it more accurate
967  // as it has more completed blobs to work on.
968  GenericVector<bool> wanted;
969  GenericVector<C_BLOB*> wanted_blobs;
970  GenericVector<C_OUTLINE*> wanted_outlines;
971  int num_overlapped = 0;
972  int num_overlapped_used = 0;
973  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
974  if (overlapped_any_blob[i]) {
975  ++num_overlapped;
976  if (word_wanted[i]) ++num_overlapped_used;
977  wanted.push_back(word_wanted[i]);
978  wanted_blobs.push_back(target_blobs[i]);
979  wanted_outlines.push_back(outlines[i]);
980  outlines[i] = nullptr;
981  }
982  }
983  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
984  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
985  &target_blobs);
986  int non_overlapped = 0;
987  int non_overlapped_used = 0;
988  for (int i = 0; i < word_wanted.size(); ++i) {
989  if (word_wanted[i]) ++non_overlapped_used;
990  if (outlines[i] != nullptr) ++non_overlapped_used;
991  }
992  if (debug_noise_removal) {
993  tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
994  num_overlapped_used, num_overlapped, non_overlapped_used,
995  non_overlapped);
996  real_word->bounding_box().print();
997  }
998  // Now we have decided which outlines we want, put them into the real_word.
999  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
1000  make_next_word_fuzzy)) {
1001  pr_it->MakeCurrentWordFuzzy();
1002  }
1003  // TODO(rays) Parts of combos have a deep copy of the real word, and need
1004  // to have their noise outlines moved/assigned in the same way!!
1005  return num_overlapped_used != 0 || non_overlapped_used != 0;
1006 #endif // ndef DISABLED_LEGACY_ENGINE
1007 }
Definition: werd.h:56
void GetNoiseOutlines(GenericVector< C_OUTLINE * > *outlines)
Definition: werd.cpp:508
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1014
void print() const
Definition: rect.h:278
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:90
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB * > &target_blobs, const GenericVector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:526
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:125
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int push_back(T object)
WERD_RES * word() const
Definition: pageres.h:755
TBOX bounding_box() const
Definition: werd.cpp:148
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1069
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1520
int size() const
Definition: genericvector.h:70
WERD * word
Definition: pageres.h:188
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
bool tesseract::Tesseract::recog_all_words ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config,
int  dopasses 
)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. dopasses selects which passes run: 0 runs all recognition passes, 1 runs just pass 1, and 2 runs passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.

Parameters
page_res: page structure
monitor: progress monitor
word_config: word_config file
target_word_box: specifies just to extract a rectangle
dopasses: 0 - all, 1 - just pass 1, 2 - passes 2 and higher

Definition at line 303 of file control.cpp.

307  {
308  PAGE_RES_IT page_res_it(page_res);
309 
311  tessedit_test_adaption.set_value (true);
312  tessedit_minimal_rejection.set_value (true);
313  }
314 
315  if (dopasses==0 || dopasses==1) {
316  page_res_it.restart_page();
317  // ****************** Pass 1 *******************
318 
319  #ifndef DISABLED_LEGACY_ENGINE
320  // If the adaptive classifier is full switch to one we prepared earlier,
321  // ie on the previous page. If the current adaptive classifier is non-empty,
322  // prepare a backup starting at this page, in case it fills up. Do all this
323  // independently for each language.
324  if (AdaptiveClassifierIsFull()) {
326  } else if (!AdaptiveClassifierIsEmpty()) {
328  }
329  // Now check the sub-langs as well.
330  for (int i = 0; i < sub_langs_.size(); ++i) {
331  if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
332  sub_langs_[i]->SwitchAdaptiveClassifier();
333  } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
334  sub_langs_[i]->StartBackupAdaptiveClassifier();
335  }
336  }
337 
338  #endif // ndef DISABLED_LEGACY_ENGINE
339 
340  // Set up all words ready for recognition, so that if parallelism is on
341  // all the input and output classes are ready to run the classifier.
343  SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
344  #ifndef DISABLED_LEGACY_ENGINE
345  if (tessedit_parallelize) {
346  PrerecAllWordsPar(words);
347  }
348  #endif // ndef DISABLED_LEGACY_ENGINE
349 
350  stats_.word_count = words.size();
351 
352  stats_.dict_words = 0;
353  stats_.doc_blob_quality = 0;
354  stats_.doc_outline_errs = 0;
355  stats_.doc_char_quality = 0;
356  stats_.good_char_count = 0;
357  stats_.doc_good_char_quality = 0;
358 
359  most_recently_used_ = this;
360  // Run pass 1 word recognition.
361  if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
362  // Pass 1 post-processing.
363  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
364  page_res_it.forward()) {
365  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
366  fix_rep_char(&page_res_it);
367  continue;
368  }
369 
370  // Count dict words.
371  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
372  ++(stats_.dict_words);
373 
374  // Update misadaption log (we only need to do it on pass 1, since
375  // adaption only happens on this pass).
376  if (page_res_it.word()->blamer_bundle != nullptr &&
377  page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
378  page_res->misadaption_log.push_back(
379  page_res_it.word()->blamer_bundle->misadaption_debug());
380  }
381  }
382  }
383 
384  if (dopasses == 1) return true;
385 
386  #ifndef DISABLED_LEGACY_ENGINE
387 
388  // ****************** Pass 2 *******************
390  AnyTessLang()) {
391  page_res_it.restart_page();
393  SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
394  if (tessedit_parallelize) {
395  PrerecAllWordsPar(words);
396  }
397  most_recently_used_ = this;
398  // Run pass 2 word recognition.
399  if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
400  }
401 
402  // The next passes are only required for Tess-only.
403  if (AnyTessLang() && !AnyLSTMLang()) {
404  // ****************** Pass 3 *******************
405  // Fix fuzzy spaces.
407 
410  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
411 
412  // ****************** Pass 4 *******************
415 
416  // ****************** Pass 5,6 *******************
417  rejection_passes(page_res, monitor, target_word_box, word_config);
418 
419  // ****************** Pass 8 *******************
420  font_recognition_pass(page_res);
421 
422  // ****************** Pass 9 *******************
423  // Check the correctness of the final results.
424  blamer_pass(page_res);
425  script_pos_pass(page_res);
426  }
427 
428  #endif // ndef DISABLED_LEGACY_ENGINE
429 
430  // Write results pass.
432  // This is now redundant, but retained commented so show how to obtain
433  // bounding boxes and style information.
434 
435  #ifndef DISABLED_LEGACY_ENGINE
436  // changed by jetsoft
437  // needed for dll to output memory structure
438  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
439  output_pass(page_res_it, target_word_box);
440  // end jetsoft
441  #endif //ndef DISABLED_LEGACY_ENGINE
442 
443  const auto pageseg_mode = static_cast<PageSegMode>(
444  static_cast<int>(tessedit_pageseg_mode));
445  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
446 
447  // Remove empty words, as these mess up the result iterators.
448  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
449  page_res_it.forward()) {
450  const WERD_RES* word = page_res_it.word();
451  const POLY_BLOCK* pb = page_res_it.block()->block != nullptr
452  ? page_res_it.block()->block->pdblk.poly_block()
453  : nullptr;
454  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
455  (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
456  page_res_it.DeleteCurrentWord();
457  }
458  }
459 
460  if (monitor != nullptr) {
461  monitor->progress = 100;
462  }
463  return true;
464 }
bool AnyTessLang() const
bool IsAllSpaces() const
Definition: ratngs.h:521
GenericVector< STRING > misadaption_log
Definition: pageres.h:91
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:75
int length() const
Definition: ratngs.h:303
bool tessedit_enable_bigram_correction
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:735
bool right_to_left() const
repeated character
Definition: werd.h:38
#define LOC_FUZZY_SPACE
Definition: errcode.h:49
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:629
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:105
bool AdaptiveClassifierIsFull() const
Definition: classify.h:325
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:36
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:213
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2055
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:39
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:326
int push_back(T object)
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:468
#define LOC_WRITE_RESULTS
Definition: errcode.h:53
bool AnyLSTMLang() const
WERD_CHOICE * best_choice
Definition: pageres.h:234
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:613
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:711
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:322
bool IsText() const
Definition: polyblk.h:49
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2112
int size() const
Definition: genericvector.h:70
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1720
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:613
WERD * word
Definition: pageres.h:188
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:25
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:154
bool tesseract::Tesseract::recog_interactive ( PAGE_RES_IT * pr_it)

Recognize a single word in interactive mode.

Parameters
pr_it    the page results iterator

Definition at line 77 of file control.cpp.

77  {
78  int16_t char_qual;
79  int16_t good_char_qual;
80 
81  WordData word_data(*pr_it);
82  SetupWordPassN(2, &word_data);
83  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
84  if (lstm_recognizer_ == nullptr) {
85 #ifndef DISABLED_LEGACY_ENGINE
86  classify_word_and_language(2, pr_it, &word_data);
87 #endif // ndef DISABLED_LEGACY_ENGINE
88  } else {
89  classify_word_and_language(1, pr_it, &word_data);
90  }
91 #ifndef DISABLED_LEGACY_ENGINE
93  WERD_RES* word_res = pr_it->word();
94  word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
95  tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
96  "char_quality: %d; good_char_quality: %d\n",
97  word_res->reject_map.length(),
98  word_blob_quality(word_res, pr_it->row()->row),
99  word_outline_errs(word_res), char_qual, good_char_qual);
100  }
101 #endif // ndef DISABLED_LEGACY_ENGINE
102  return true;
103 }
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:92
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:72
int32_t length() const
Definition: rejctmap.h:223
ROW_RES * row() const
Definition: pageres.h:758
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:60
REJMAP reject_map
Definition: pageres.h:286
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
WERD_RES * word() const
Definition: pageres.h:755
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
ROW * row
Definition: pageres.h:142
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1333
void tesseract::Tesseract::recog_pseudo_word ( PAGE_RES page_res,
TBOX selection_box 
)

Definition at line 62 of file control.cpp.

63  {
64  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
65  if (it != nullptr) {
67  it->DeleteCurrentWord();
68  delete it;
69  }
70 }
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:35
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:77
void DeleteCurrentWord()
Definition: pageres.cpp:1487
void tesseract::Tesseract::recog_training_segmented ( const STRING fname,
PAGE_RES page_res,
volatile ETEXT_DESC monitor,
FILE *  output_file 
)

Definition at line 84 of file recogtraining.cpp.

87  {
88  STRING box_fname = fname;
89  const char* lastdot = strrchr(box_fname.string(), '.');
90  if (lastdot != nullptr)
91  box_fname[lastdot - box_fname.string()] = '\0';
92  box_fname += ".box";
93  // ReadNextBox() will close box_file
94  FILE* box_file = fopen(box_fname.string(), "r");
95  if (box_file == nullptr) {
96  tprintf("Error: Could not open file %s\n", box_fname.string());
97  ASSERT_HOST(box_file);
98  }
99 
100  PAGE_RES_IT page_res_it;
101  page_res_it.page_res = page_res;
102  page_res_it.restart_page();
103  STRING label;
104 
105  // Process all the words on this page.
106  TBOX tbox; // tesseract-identified box
107  TBOX bbox; // box from the box file
108  bool keep_going;
109  int line_number = 0;
110  int examined_words = 0;
111  do {
112  keep_going = read_t(&page_res_it, &tbox);
113  keep_going &=
114  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
115  // Align bottom left points of the TBOXes.
116  while (keep_going &&
117  !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
118  if (bbox.bottom() < tbox.bottom()) {
119  page_res_it.forward();
120  keep_going = read_t(&page_res_it, &tbox);
121  } else {
122  keep_going =
123  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
124  }
125  }
126  while (keep_going &&
127  !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
128  if (bbox.left() > tbox.left()) {
129  page_res_it.forward();
130  keep_going = read_t(&page_res_it, &tbox);
131  } else {
132  keep_going =
133  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
134  }
135  }
136  // OCR the word if top right points of the TBOXes are similar.
137  if (keep_going &&
138  NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
139  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
140  ambigs_classify_and_output(label.string(), &page_res_it, output_file);
141  examined_words++;
142  }
143  page_res_it.forward();
144  } while (keep_going);
145 
146  // Set up scripts on all of the words that did not get sent to
147  // ambigs_classify_and_output. They all should have, but if all the
148  // werd_res's don't get uch_sets, tesseract will crash when you try
149  // to iterate over them. :-(
150  int total_words = 0;
151  for (page_res_it.restart_page(); page_res_it.block() != nullptr;
152  page_res_it.forward()) {
153  if (page_res_it.word()) {
154  if (page_res_it.word()->uch_set == nullptr)
155  page_res_it.word()->SetupFake(unicharset);
156  total_words++;
157  }
158  }
159  if (examined_words < 0.85 * total_words) {
160  tprintf(
161  "TODO(antonova): clean up recog_training_segmented; "
162  " It examined only a small fraction of the ambigs image.\n");
163  }
164  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words,
165  total_words);
166 }
int16_t top() const
Definition: rect.h:58
WERD_RES * restart_page()
Definition: pageres.h:702
Definition: rect.h:34
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:356
Definition: strngs.h:45
UNICHARSET unicharset
Definition: ccutil.h:71
const int16_t kMaxBoxEdgeDiff
BLOCK_RES * block() const
Definition: pageres.h:761
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
const UNICHARSET * uch_set
Definition: pageres.h:205
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
WERD_RES * word() const
Definition: pageres.h:755
PAGE_RES * page_res
Definition: pageres.h:678
WERD_RES * forward()
Definition: pageres.h:735
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
#define ASSERT_HOST(x)
Definition: errcode.h:88
int16_t left() const
Definition: rect.h:72
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:127
void tesseract::Tesseract::recog_word ( WERD_RES word)

Definition at line 40 of file tfacepp.cpp.

40  {
41  if (wordrec_skip_no_truth_words && (word->blamer_bundle == nullptr ||
43  if (classify_debug_level) tprintf("No truth for word - skipping\n");
44  word->tess_failed = true;
45  return;
46  }
49  word->SetupBoxWord();
50  if (word->best_choice->length() != word->box_word->length()) {
51  tprintf("recog_word ASSERT FAIL String:\"%s\"; "
52  "Strlen=%d; #Blobs=%d\n",
53  word->best_choice->debug_string().string(),
54  word->best_choice->length(), word->box_word->length());
55  }
56  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
57  // Check that the ratings matrix size matches the sum of all the
58  // segmentation states.
59  if (!word->StatesAllValid()) {
60  tprintf("Not all words have valid states relative to ratings matrix!!");
61  word->DebugWordChoices(true, nullptr);
62  ASSERT_HOST(word->StatesAllValid());
63  }
65  /* Override the permuter type if a straight dictionary check disagrees. */
66  uint8_t perm_type = word->best_choice->permuter();
67  if ((perm_type != SYSTEM_DAWG_PERM) &&
68  (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
69  uint8_t real_dict_perm_type = dict_word(*word->best_choice);
70  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
71  (real_dict_perm_type == FREQ_DAWG_PERM) ||
72  (real_dict_perm_type == USER_DAWG_PERM)) &&
74  word->best_choice->unichar_lengths().string()) > 0)) {
75  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
76  }
77  }
79  perm_type != word->best_choice->permuter()) {
80  tprintf("Permuter Type Flipped from %d to %d\n",
81  perm_type, word->best_choice->permuter());
82  }
83  }
84  // Factored out from control.cpp
85  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
86  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
87  static_cast<int>(strspn(word->best_choice->unichar_string().string(),
88  " ")) == word->best_choice->length()) {
89  word->tess_failed = true;
90  word->reject_map.initialise(word->box_word->length());
92  } else {
93  word->tess_failed = false;
94  }
95 }
bool tess_failed
Definition: pageres.h:287
void rej_word_tess_failure()
Definition: rejctmap.cpp:352
void SetupBoxWord()
Definition: pageres.cpp:853
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
const STRING & unichar_string() const
Definition: ratngs.h:541
int length() const
Definition: ratngs.h:303
int length() const
Definition: boxword.h:83
void set_permuter(uint8_t perm)
Definition: ratngs.h:375
REJMAP reject_map
Definition: pageres.h:286
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:484
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
void initialise(int16_t length)
Definition: rejctmap.cpp:273
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool empty() const
Definition: genericvector.h:89
tesseract::BoxWord * box_word
Definition: pageres.h:265
bool StatesAllValid()
Definition: pageres.cpp:462
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:496
WERD_CHOICE * best_choice
Definition: pageres.h:234
#define ASSERT_HOST(x)
Definition: errcode.h:88
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:89
TWERD * chopped_word
Definition: pageres.h:214
const STRING & unichar_lengths() const
Definition: ratngs.h:548
const STRING debug_string() const
Definition: ratngs.h:505
bool wordrec_skip_no_truth_words
Definition: wordrec.h:235
WERD_CHOICE * raw_choice
Definition: pageres.h:239
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
uint8_t permuter() const
Definition: ratngs.h:346
BlamerBundle * blamer_bundle
Definition: pageres.h:245
void tesseract::Tesseract::recog_word_recursive ( WERD_RES * word)

Definition at line 104 of file tfacepp.cpp.

104  {
105  int word_length = word->chopped_word->NumBlobs(); // no of blobs
106  if (word_length > MAX_UNDIVIDED_LENGTH) {
107  return split_and_recog_word(word);
108  }
109  cc_recog(word);
110  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
111 
112  // Do sanity checks and minor fixes on best_choice.
113  if (word->best_choice->length() > word_length) {
114  word->best_choice->make_bad(); // should never happen
115  tprintf("recog_word: Discarded long string \"%s\""
116  " (%d characters vs %d blobs)\n",
117  word->best_choice->unichar_string().string(),
118  word->best_choice->length(), word_length);
119  tprintf("Word is at:");
120  word->word->bounding_box().print();
121  }
122  if (word->best_choice->length() < word_length) {
123  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
124  while (word->best_choice->length() < word_length) {
125  word->best_choice->append_unichar_id(space_id, 1, 0.0,
126  word->best_choice->certainty());
127  }
128  }
129 }
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:468
void print() const
Definition: rect.h:278
TWERD * rebuild_word
Definition: pageres.h:259
const STRING & unichar_string() const
Definition: ratngs.h:541
UNICHARSET unicharset
Definition: ccutil.h:71
int length() const
Definition: ratngs.h:303
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:138
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:443
TBOX bounding_box() const
Definition: werd.cpp:148
int NumBlobs() const
Definition: blobs.h:427
float certainty() const
Definition: ratngs.h:330
WERD_CHOICE * best_choice
Definition: pageres.h:234
TWERD * chopped_word
Definition: pageres.h:214
int UNICHAR_ID
Definition: unichar.h:34
void cc_recog(WERD_RES *word)
Definition: tface.cpp:125
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:29
WERD * word
Definition: pageres.h:188
bool tesseract::Tesseract::RecogAllWordsPassN ( int  pass_n,
ETEXT_DESC * monitor,
PAGE_RES_IT * pr_it,
GenericVector< WordData > *  words 
)

Definition at line 213 of file control.cpp.

215  {
216  // TODO(rays) Before this loop can be parallelized (it would yield a massive
217  // speed-up) all remaining member globals need to be converted to local/heap
218  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
219  // added. The results will be significantly different with adaption on, and
220  // deterioration will need investigation.
221  pr_it->restart_page();
222  for (int w = 0; w < words->size(); ++w) {
223  WordData* word = &(*words)[w];
224  if (w > 0) word->prev_word = &(*words)[w - 1];
225  if (monitor != nullptr) {
226  monitor->ocr_alive = true;
227  if (pass_n == 1) {
228  monitor->progress = 70 * w / words->size();
229  if (monitor->progress_callback2 != nullptr) {
230  TBOX box = pr_it->word()->word->bounding_box();
231  (*monitor->progress_callback2)(monitor, box.left(),
232  box.right(), box.top(), box.bottom());
233  }
234  } else {
235  monitor->progress = 70 + 30 * w / words->size();
236  if (monitor->progress_callback2 != nullptr) {
237  (*monitor->progress_callback2)(monitor, 0, 0, 0, 0);
238  }
239  }
240  if (monitor->deadline_exceeded() ||
241  (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this,
242  words->size()))) {
243  // Timeout. Fake out the rest of the words.
244  for (; w < words->size(); ++w) {
245  (*words)[w].word->SetupFake(unicharset);
246  }
247  return false;
248  }
249  }
250  if (word->word->tess_failed) {
251  int s;
252  for (s = 0; s < word->lang_words.size() &&
253  word->lang_words[s]->tess_failed; ++s) {}
254  // If all are failed, skip it. Image words are skipped by this test.
255  if (s > word->lang_words.size()) continue;
256  }
257  // Sync pr_it with the wth WordData.
258  while (pr_it->word() != nullptr && pr_it->word() != word->word)
259  pr_it->forward();
260  ASSERT_HOST(pr_it->word() != nullptr);
261  bool make_next_word_fuzzy = false;
262  if (!AnyLSTMLang() &&
263  ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
264  // Needs to be setup again to see the new outlines in the chopped_word.
265  SetupWordPassN(pass_n, word);
266  }
267 
268  classify_word_and_language(pass_n, pr_it, word);
270  tprintf("Pass%d: %s [%s]\n", pass_n,
271  word->word->best_choice->unichar_string().string(),
272  word->word->best_choice->debug_string().string());
273  }
274  pr_it->forward();
275  if (make_next_word_fuzzy && pr_it->word() != nullptr) {
276  pr_it->MakeCurrentWordFuzzy();
277  }
278  }
279  return true;
280 }
int16_t top() const
Definition: rect.h:58
WERD_RES * restart_page()
Definition: pageres.h:702
Definition: rect.h:34
UNICHARSET unicharset
Definition: ccutil.h:71
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:105
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:110
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:116
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
WERD_RES * word() const
Definition: pageres.h:755
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:944
WERD_RES * forward()
Definition: pageres.h:735
int16_t right() const
Definition: rect.h:79
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
int16_t bottom() const
Definition: rect.h:65
TBOX bounding_box() const
Definition: werd.cpp:148
bool AnyLSTMLang() const
#define ASSERT_HOST(x)
Definition: errcode.h:88
int16_t left() const
Definition: rect.h:72
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1520
int size() const
Definition: genericvector.h:70
bool deadline_exceeded() const
Definition: ocrclass.h:138
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1333
WERD * word
Definition: pageres.h:188
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
Definition: ocrclass.h:115
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:112
void tesseract::Tesseract::recognize_page ( STRING image_name)
void tesseract::Tesseract::reject_edge_blobs ( WERD_RES word)

Definition at line 264 of file reject.cpp.

264  {
265  TBOX word_box = word->word->bounding_box();
266  // Use the box_word as it is already denormed back to image coordinates.
267  int blobcount = word->box_word->length();
268 
269  if (word_box.left() < tessedit_image_border ||
270  word_box.bottom() < tessedit_image_border ||
271  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
272  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
273  ASSERT_HOST(word->reject_map.length() == blobcount);
274  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
275  TBOX blob_box = word->box_word->BlobBox(blobindex);
276  if (blob_box.left() < tessedit_image_border ||
277  blob_box.bottom() < tessedit_image_border ||
278  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
279  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
280  word->reject_map[blobindex].setrej_edge_char();
281  // Close to edge
282  }
283  }
284  }
285 }
int16_t top() const
Definition: rect.h:58
Definition: rect.h:34
int32_t length() const
Definition: rejctmap.h:223
int ImageWidth() const
const TBOX & BlobBox(int index) const
Definition: boxword.h:84
int length() const
Definition: boxword.h:83
REJMAP reject_map
Definition: pageres.h:286
tesseract::BoxWord * box_word
Definition: pageres.h:265
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
TBOX bounding_box() const
Definition: werd.cpp:148
#define ASSERT_HOST(x)
Definition: errcode.h:88
int16_t left() const
Definition: rect.h:72
int ImageHeight() const
WERD * word
Definition: pageres.h:188
void tesseract::Tesseract::reject_I_1_L ( WERD_RES word)

Definition at line 194 of file reject.cpp.

194  {
195  int16_t i;
196  int16_t offset;
197 
198  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
199  offset += word->best_choice->unichar_lengths()[i], i += 1) {
201  contains (word->best_choice->unichar_string()[offset])) {
202  //rej 1Il conflict
203  word->reject_map[i].setrej_1Il_conflict ();
204  }
205  }
206 }
Definition: strngs.h:45
const STRING & unichar_string() const
Definition: ratngs.h:541
REJMAP reject_map
Definition: pageres.h:286
WERD_CHOICE * best_choice
Definition: pageres.h:234
const STRING & unichar_lengths() const
Definition: ratngs.h:548
void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES word)

Definition at line 574 of file reject.cpp.

574  {
575  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
576 
577  if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
580 }
double rej_whole_of_mostly_reject_word_fract
int32_t length() const
Definition: rejctmap.h:223
void rej_word_mostly_rej()
Definition: rejctmap.cpp:406
REJMAP reject_map
Definition: pageres.h:286
int16_t reject_count()
Definition: rejctmap.h:229
void tesseract::Tesseract::rejection_passes ( PAGE_RES * page_res,
ETEXT_DESC * monitor,
const TBOX * target_word_box,
const char *  word_config 
)

Definition at line 613 of file control.cpp.

616  {
617  PAGE_RES_IT page_res_it(page_res);
618  // ****************** Pass 5 *******************
619  // Gather statistics on rejects.
620  int word_index = 0;
621  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
623  WERD_RES* word = page_res_it.word();
624  word_index++;
625  if (monitor != nullptr) {
626  monitor->ocr_alive = true;
627  monitor->progress = 95 + 5 * word_index / stats_.word_count;
628  }
629  if (word->rebuild_word == nullptr) {
630  // Word was not processed by tesseract.
631  page_res_it.forward();
632  continue;
633  }
634  check_debug_pt(word, 70);
635 
636  // changed by jetsoft
637  // specific to its needs to extract one word when need
638  if (target_word_box &&
640  *target_word_box, word_config, 4)) {
641  page_res_it.forward();
642  continue;
643  }
644  // end jetsoft
645 
646  page_res_it.rej_stat_word();
647  const int chars_in_word = word->reject_map.length();
648  const int rejects_in_word = word->reject_map.reject_count();
649 
650  const int blob_quality = word_blob_quality(word, page_res_it.row()->row);
651  stats_.doc_blob_quality += blob_quality;
652  const int outline_errs = word_outline_errs(word);
653  stats_.doc_outline_errs += outline_errs;
654  int16_t all_char_quality;
655  int16_t accepted_all_char_quality;
656  word_char_quality(word, page_res_it.row()->row,
657  &all_char_quality, &accepted_all_char_quality);
658  stats_.doc_char_quality += all_char_quality;
659  const uint8_t permuter_type = word->best_choice->permuter();
660  if ((permuter_type == SYSTEM_DAWG_PERM) ||
661  (permuter_type == FREQ_DAWG_PERM) ||
662  (permuter_type == USER_DAWG_PERM)) {
663  stats_.good_char_count += chars_in_word - rejects_in_word;
664  stats_.doc_good_char_quality += accepted_all_char_quality;
665  }
666  check_debug_pt(word, 80);
668  (blob_quality == 0) && (outline_errs >= chars_in_word))
670  check_debug_pt(word, 90);
671  page_res_it.forward();
672  }
673 
675  tprintf
676  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
677  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
678  page_res->char_count, page_res->rej_count,
679  page_res->rej_count / static_cast<float>(page_res->char_count),
680  stats_.doc_blob_quality,
681  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
682  stats_.doc_outline_errs,
683  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
684  stats_.doc_char_quality,
685  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
686  stats_.doc_good_char_quality,
687  (stats_.good_char_count > 0) ?
688  (stats_.doc_good_char_quality /
689  static_cast<float>(stats_.good_char_count)) : 0.0);
690  }
691  bool good_quality_doc =
692  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
693  quality_rej_pc) &&
694  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
695  quality_blob_pc) &&
696  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
698  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
700 
701  // ****************** Pass 6 *******************
702  // Do whole document or whole block rejection pass
703  if (!tessedit_test_adaption) {
705  quality_based_rejection(page_res_it, good_quality_doc);
706  }
707 }
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:92
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:72
TWERD * rebuild_word
Definition: pageres.h:259
int32_t length() const
Definition: rejctmap.h:223
#define LOC_DOC_BLK_REJ
Definition: errcode.h:52
#define LOC_MM_ADAPT
Definition: errcode.h:51
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:60
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:105
REJMAP reject_map
Definition: pageres.h:286
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:110
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:120
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1863
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int32_t char_count
Definition: pageres.h:78
TBOX bounding_box() const
Definition: werd.cpp:148
int32_t rej_count
Definition: pageres.h:79
void rej_word_bad_quality()
Definition: rejctmap.cpp:415
WERD_CHOICE * best_choice
Definition: pageres.h:234
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:138
int16_t reject_count()
Definition: rejctmap.h:229
WERD * word
Definition: pageres.h:188
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:25
uint8_t permuter() const
Definition: ratngs.h:346
bool tesseract::Tesseract::repeated_nonalphanum_wd ( WERD_RES word,
ROW row 
)

Definition at line 583 of file reject.cpp.

583  {
584  int16_t char_quality;
585  int16_t accepted_char_quality;
586 
587  if (word->best_choice->unichar_lengths().length() <= 1)
588  return false;
589 
591  contains(word->best_choice->unichar_string()[0]))
592  return false;
593 
594  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
595  for (int i = 1; i < word->best_choice->length(); ++i) {
596  if (word->best_choice->unichar_id(i) != uch_id) return false;
597  }
598 
599  word_char_quality(word, row, &char_quality, &accepted_char_quality);
600 
601  if ((word->best_choice->unichar_lengths().length () == char_quality) &&
602  (char_quality == accepted_char_quality))
603  return true;
604  else
605  return false;
606 }
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:92
Definition: strngs.h:45
const STRING & unichar_string() const
Definition: ratngs.h:541
int32_t length() const
Definition: strngs.cpp:189
int length() const
Definition: ratngs.h:303
char * ok_repeated_ch_non_alphanum_wds
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
int UNICHAR_ID
Definition: unichar.h:34
const STRING & unichar_lengths() const
Definition: ratngs.h:548
void tesseract::Tesseract::ReportFailedBox ( int  boxfile_lineno,
TBOX  box,
const char *  box_ch,
const char *  err_msg 
)

Logs a bad box by line in the box file and box coords.

Definition at line 772 of file applybox.cpp.

773  {
774  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
775  boxfile_lineno + 1, box_ch,
776  box.left(), box.bottom(), box.right(), box.top(), err_msg);
777 }
int16_t top() const
Definition: rect.h:58
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
int16_t left() const
Definition: rect.h:72
void tesseract::Tesseract::ReportXhtFixResult ( bool  accept_new_word,
float  new_x_ht,
WERD_RES word,
WERD_RES new_word 
)

Definition at line 1476 of file control.cpp.

1477  {
1478  tprintf("New XHT Match:%s = %s ",
1479  word->best_choice->unichar_string().string(),
1480  word->best_choice->debug_string().string());
1481  word->reject_map.print(debug_fp);
1482  tprintf(" -> %s = %s ",
1483  new_word->best_choice->unichar_string().string(),
1484  new_word->best_choice->debug_string().string());
1485  new_word->reject_map.print(debug_fp);
1486  tprintf(" %s->%s %s %s\n",
1487  word->guessed_x_ht ? "GUESS" : "CERT",
1488  new_word->guessed_x_ht ? "GUESS" : "CERT",
1489  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1490  accept_new_word ? "ACCEPTED" : "");
1491 }
bool guessed_x_ht
Definition: pageres.h:307
FILE * debug_fp
Definition: tessvars.cpp:24
const STRING & unichar_string() const
Definition: ratngs.h:541
void print(FILE *fp)
Definition: rejctmap.cpp:321
REJMAP reject_map
Definition: pageres.h:286
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
WERD_CHOICE * best_choice
Definition: pageres.h:234
const STRING debug_string() const
Definition: ratngs.h:505
void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES page_res)

Resegments the words by running the classifier in an attempt to find the correct segmentation that produces the required string.

Definition at line 510 of file applybox.cpp.

510  {
511  PAGE_RES_IT pr_it(page_res);
512  WERD_RES* word_res;
513  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
514  const WERD* word = word_res->word;
515  if (word->text() == nullptr || word->text()[0] == '\0')
516  continue; // Ignore words that have no text.
517  // Convert the correct text to a vector of UNICHAR_ID
518  GenericVector<UNICHAR_ID> target_text;
519  if (!ConvertStringToUnichars(word->text(), &target_text)) {
520  tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
521  word->text());
522  pr_it.DeleteCurrentWord();
523  continue;
524  }
525  if (!FindSegmentation(target_text, word_res)) {
526  tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
527  word->text());
528  pr_it.DeleteCurrentWord();
529  continue;
530  }
531  }
532 }
Definition: werd.h:56
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
const char * text() const
Definition: werd.h:114
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
Definition: applybox.cpp:565
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
Definition: applybox.cpp:538
WERD * word
Definition: pageres.h:188
bool tesseract::Tesseract::ResegmentCharBox ( PAGE_RES page_res,
const TBOX prev_box,
const TBOX box,
const TBOX next_box,
const char *  correct_text 
)

Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.

Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an appropriate blob for a box.

This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.

Definition at line 333 of file applybox.cpp.

335  {
336  if (applybox_debug > 1) {
337  tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
338  }
339  PAGE_RES_IT page_res_it(page_res);
340  WERD_RES* word_res;
341  for (word_res = page_res_it.word(); word_res != nullptr;
342  word_res = page_res_it.forward()) {
343  if (!word_res->box_word->bounding_box().major_overlap(box))
344  continue;
345  if (applybox_debug > 1) {
346  tprintf("Checking word box:");
347  word_res->box_word->bounding_box().print();
348  }
349  int word_len = word_res->box_word->length();
350  for (int i = 0; i < word_len; ++i) {
351  TBOX char_box = TBOX();
352  int blob_count = 0;
353  for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
354  TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
355  if (!blob_box.major_overlap(box))
356  break;
357  if (word_res->correct_text[i + blob_count].length() > 0)
358  break; // Blob is claimed already.
359  if (next_box != nullptr) {
360  const double current_box_miss_metric = BoxMissMetric(blob_box, box);
361  const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
362  if (applybox_debug > 2) {
363  tprintf("Checking blob:");
364  blob_box.print();
365  tprintf("Current miss metric = %g, next = %g\n",
366  current_box_miss_metric, next_box_miss_metric);
367  }
368  if (current_box_miss_metric > next_box_miss_metric)
369  break; // Blob is a better match for next box.
370  }
371  char_box += blob_box;
372  }
373  if (blob_count > 0) {
374  if (applybox_debug > 1) {
375  tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
376  }
377  if (!char_box.almost_equal(box, 3) &&
378  ((next_box != nullptr && box.x_gap(*next_box) < -3)||
379  (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
380  return false;
381  }
382  // We refine just the box_word, best_state and correct_text here.
383  // The rebuild_word is made in TidyUp.
384  // blob_count blobs are put together to match the box. Merge the
385  // box_word boxes, save the blob_count in the state and the text.
386  word_res->box_word->MergeBoxes(i, i + blob_count);
387  word_res->best_state[i] = blob_count;
388  word_res->correct_text[i] = correct_text;
389  if (applybox_debug > 2) {
390  tprintf("%d Blobs match: blob box:", blob_count);
391  word_res->box_word->BlobBox(i).print();
392  tprintf("Matches box:");
393  box.print();
394  if (next_box != nullptr) {
395  tprintf("With next box:");
396  next_box->print();
397  }
398  }
399  // Eliminated best_state and correct_text entries for the consumed
400  // blobs.
401  for (int j = 1; j < blob_count; ++j) {
402  word_res->best_state.remove(i + 1);
403  word_res->correct_text.remove(i + 1);
404  }
405  // Assume that no box spans multiple source words, so we are done with
406  // this box.
407  if (applybox_debug > 1) {
408  tprintf("Best state = ");
409  for (int j = 0; j < word_res->best_state.size(); ++j) {
410  tprintf("%d ", word_res->best_state[j]);
411  }
412  tprintf("\n");
413  tprintf("Correct text = [[ ");
414  for (int j = 0; j < word_res->correct_text.size(); ++j) {
415  tprintf("%s ", word_res->correct_text[j].string());
416  }
417  tprintf("]]\n");
418  }
419  return true;
420  }
421  }
422  }
423  if (applybox_debug > 0) {
424  tprintf("FAIL!\n");
425  }
426  return false; // Failure.
427 }
void MergeBoxes(int start, int end)
Definition: boxword.cpp:131
Definition: rect.h:34
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
void print() const
Definition: rect.h:278
int length() const
Definition: genericvector.h:84
GenericVector< STRING > correct_text
Definition: pageres.h:274
const TBOX & BlobBox(int index) const
Definition: boxword.h:84
int length() const
Definition: boxword.h:83
int x_gap(const TBOX &box) const
Definition: rect.h:225
void remove(int index)
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
tesseract::BoxWord * box_word
Definition: pageres.h:265
const TBOX & bounding_box() const
Definition: boxword.h:80
GenericVector< int > best_state
Definition: pageres.h:270
int size() const
Definition: genericvector.h:70
WERD * word
Definition: pageres.h:188
bool tesseract::Tesseract::ResegmentWordBox ( BLOCK_LIST *  block_list,
const TBOX &  box,
const TBOX *  next_box,
const char *  correct_text 
)

Consume all source blobs that strongly overlap the given box, putting them into a new word, with the correct_text label. Fights over which box owns which blobs are settled by applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an overlapping blob for a box.

Definition at line 435 of file applybox.cpp.

437  {
438  if (applybox_debug > 1) {
439  tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
440  }
441  WERD* new_word = nullptr;
442  BLOCK_IT b_it(block_list);
443  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
444  BLOCK* block = b_it.data();
445  if (!box.major_overlap(block->pdblk.bounding_box()))
446  continue;
447  ROW_IT r_it(block->row_list());
448  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
449  ROW* row = r_it.data();
450  if (!box.major_overlap(row->bounding_box()))
451  continue;
452  WERD_IT w_it(row->word_list());
453  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
454  WERD* word = w_it.data();
455  if (applybox_debug > 2) {
456  tprintf("Checking word:");
457  word->bounding_box().print();
458  }
459  if (word->text() != nullptr && word->text()[0] != '\0')
460  continue; // Ignore words that are already done.
461  if (!box.major_overlap(word->bounding_box()))
462  continue;
463  C_BLOB_IT blob_it(word->cblob_list());
464  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
465  blob_it.forward()) {
466  C_BLOB* blob = blob_it.data();
467  TBOX blob_box = blob->bounding_box();
468  if (!blob_box.major_overlap(box))
469  continue;
470  if (next_box != nullptr) {
471  const double current_box_miss_metric = BoxMissMetric(blob_box, box);
472  const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
473  if (applybox_debug > 2) {
474  tprintf("Checking blob:");
475  blob_box.print();
476  tprintf("Current miss metric = %g, next = %g\n",
477  current_box_miss_metric, next_box_miss_metric);
478  }
479  if (current_box_miss_metric > next_box_miss_metric)
480  continue; // Blob is a better match for next box.
481  }
482  if (applybox_debug > 2) {
483  tprintf("Blob match: blob:");
484  blob_box.print();
485  tprintf("Matches box:");
486  box.print();
487  if (next_box != nullptr) {
488  tprintf("With next box:");
489  next_box->print();
490  }
491  }
492  if (new_word == nullptr) {
493  // Make a new word with a single blob.
494  new_word = word->shallow_copy();
495  new_word->set_text(correct_text);
496  w_it.add_to_end(new_word);
497  }
498  C_BLOB_IT new_blob_it(new_word->cblob_list());
499  new_blob_it.add_to_end(blob_it.extract());
500  }
501  }
502  }
503  }
504  if (new_word == nullptr && applybox_debug > 0) tprintf("FAIL!\n");
505  return new_word != nullptr;
506 }
Definition: werd.h:56
void set_text(const char *new_text)
Definition: werd.h:115
Definition: rect.h:34
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
void print() const
Definition: rect.h:278
WERD * shallow_copy()
Definition: werd.cpp:334
TBOX bounding_box() const
Definition: stepblob.cpp:253
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:191
TBOX bounding_box() const
Definition: werd.cpp:148
WERD_LIST * word_list()
Definition: ocrrow.h:55
TBOX bounding_box() const
Definition: ocrrow.h:88
const char * text() const
Definition: werd.h:114
Definition: ocrrow.h:36
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:117
Definition: ocrblock.h:29
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 600 of file tesseractclass.cpp.

600  {
602  for (int i = 0; i < sub_langs_.size(); ++i) {
603  sub_langs_[i]->ResetAdaptiveClassifierInternal();
604  }
605 }
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:598
void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 610 of file tesseractclass.cpp.

610  {
612  for (int i = 0; i < sub_langs_.size(); ++i) {
613  sub_langs_[i]->getDict().ResetDocumentDictionary();
614  }
615 }
void ResetDocumentDictionary()
Definition: dict.h:317
Dict & getDict() override
const FCOORD& tesseract::Tesseract::reskew ( ) const
inline

Definition at line 194 of file tesseractclass.h.

194  {
195  return reskew_;
196  }
int tesseract::Tesseract::RetryWithLanguage ( const WordData &  word_data,
WordRecognizer  recognizer,
bool  debug,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  best_words 
)

Definition at line 905 of file control.cpp.

908  {
909  if (debug) {
910  tprintf("Trying word using lang %s, oem %d\n",
911  lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
912  }
913  // Run the recognizer on the word.
914  PointerVector<WERD_RES> new_words;
915  (this->*recognizer)(word_data, in_word, &new_words);
916  if (new_words.empty()) {
917  // Transfer input word to new_words, as the classifier must have put
918  // the result back in the input.
919  new_words.push_back(*in_word);
920  *in_word = nullptr;
921  }
922  if (debug) {
923  for (int i = 0; i < new_words.size(); ++i)
924  new_words[i]->DebugTopChoice("Lang result");
925  }
926  // Initial version is a bit of a hack based on better certainty and rating
927  // or a dictionary vs non-dictionary word.
928  return SelectBestWords(classify_max_rating_ratio,
930  debug, &new_words, best_words);
931 }
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
double classify_max_certainty_margin
Definition: classify.h:444
STRING lang
Definition: ccutil.h:69
double classify_max_rating_ratio
Definition: classify.h:442
bool tesseract::Tesseract::right_to_left ( ) const
inline

Definition at line 275 of file tesseractclass.h.

275  {
276  return right_to_left_;
277  }
bool tesseract::Tesseract::RunOldFixXht ( WERD_RES *  word,
BLOCK *  block,
ROW *  row 
)
int16_t tesseract::Tesseract::safe_dict_word ( const WERD_RES *  werd_res)

Definition at line 608 of file reject.cpp.

608  {
609  const WERD_CHOICE &word = *werd_res->best_choice;
610  int dict_word_type = werd_res->tesseract->dict_word(word);
611  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
612 }
tesseract::Tesseract * tesseract
Definition: pageres.h:281
WERD_CHOICE * best_choice
Definition: pageres.h:234
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:89
Pix* tesseract::Tesseract::scaled_color ( ) const
inline

Definition at line 258 of file tesseractclass.h.

258  {
259  return scaled_color_;
260  }
int tesseract::Tesseract::scaled_factor ( ) const
inline

Definition at line 261 of file tesseractclass.h.

261  {
262  return scaled_factor_;
263  }
void tesseract::Tesseract::script_pos_pass ( PAGE_RES *  page_res)

Definition at line 735 of file control.cpp.

735  {
736  PAGE_RES_IT page_res_it(page_res);
737  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
738  page_res_it.forward()) {
739  WERD_RES* word = page_res_it.word();
740  if (word->word->flag(W_REP_CHAR)) {
741  page_res_it.forward();
742  continue;
743  }
744  const float x_height = page_res_it.block()->block->x_height();
745  float word_x_height = word->x_height;
746  if (word_x_height < word->best_choice->min_x_height() ||
747  word_x_height > word->best_choice->max_x_height()) {
748  word_x_height = (word->best_choice->min_x_height() +
749  word->best_choice->max_x_height()) / 2.0f;
750  }
751  // Test for small caps. Word capheight must be close to block xheight,
752  // and word must contain no lower case letters, and at least one upper case.
753  const double small_cap_xheight = x_height * kXHeightCapRatio;
754  const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
755  if (word->uch_set->script_has_xheight() &&
756  small_cap_xheight - small_cap_delta <= word_x_height &&
757  word_x_height <= small_cap_xheight + small_cap_delta) {
758  // Scan for upper/lower.
759  int num_upper = 0;
760  int num_lower = 0;
761  for (int i = 0; i < word->best_choice->length(); ++i) {
762  if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
763  ++num_upper;
764  else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
765  ++num_lower;
766  }
767  if (num_upper > 0 && num_lower == 0)
768  word->small_caps = true;
769  }
770  word->SetScriptPositions();
771  }
772 }
float min_x_height() const
Definition: ratngs.h:336
int length() const
Definition: ratngs.h:303
repeated character
Definition: werd.h:38
bool script_has_xheight() const
Definition: unicharset.h:904
const UNICHARSET * uch_set
Definition: pageres.h:205
bool small_caps
Definition: pageres.h:298
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
static const double kXHeightCapRatio
Definition: ccstruct.h:37
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
float max_x_height() const
Definition: ratngs.h:339
float x_height
Definition: pageres.h:310
WERD * word
Definition: pageres.h:188
void SetScriptPositions()
Definition: pageres.cpp:862
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
void tesseract::Tesseract::SearchForText ( const GenericVector< BLOB_CHOICE_LIST * > *  choices,
int  choices_pos,
int  choices_length,
const GenericVector< UNICHAR_ID > &  target_text,
int  text_index,
float  rating,
GenericVector< int > *  segmentation,
float *  best_rating,
GenericVector< int > *  best_segmentation 
)

Recursive helper to find a match to the target_text (from text_index position) in the choices (from choices_pos position).

Parameters
choices is an array of GenericVectors, of length choices_length, with each element representing a starting position in the word, and the GenericVector holding classification results for a sequence of consecutive blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
choices_pos
choices_length
target_text
text_index
rating
segmentation
best_rating
best_segmentation

Definition at line 635 of file applybox.cpp.

641  {
643  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
644  // Rating of matching choice or worst choice if no match.
645  float choice_rating = 0.0f;
646  // Find the corresponding best BLOB_CHOICE.
647  BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
648  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
649  choice_it.forward()) {
650  const BLOB_CHOICE* choice = choice_it.data();
651  choice_rating = choice->rating();
652  UNICHAR_ID class_id = choice->unichar_id();
653  if (class_id == target_text[text_index]) {
654  break;
655  }
656  // Search ambigs table.
657  if (class_id < table.size() && table[class_id] != nullptr) {
658  AmbigSpec_IT spec_it(table[class_id]);
659  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
660  spec_it.forward()) {
661  const AmbigSpec *ambig_spec = spec_it.data();
662  // We'll only do 1-1.
663  if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
664  ambig_spec->correct_ngram_id == target_text[text_index])
665  break;
666  }
667  if (!spec_it.cycled_list())
668  break; // Found an ambig.
669  }
670  }
671  if (choice_it.cycled_list())
672  continue; // No match.
673  segmentation->push_back(length);
674  if (choices_pos + length == choices_length &&
675  text_index + 1 == target_text.size()) {
676  // This is a complete match. If the rating is good record a new best.
677  if (applybox_debug > 2) {
678  tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
679  rating + choice_rating, *best_rating, segmentation->size(),
680  best_segmentation->size());
681  }
682  if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
683  *best_segmentation = *segmentation;
684  *best_rating = rating + choice_rating;
685  }
686  } else if (choices_pos + length < choices_length &&
687  text_index + 1 < target_text.size()) {
688  if (applybox_debug > 3) {
689  tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
690  target_text[text_index],
691  unicharset.id_to_unichar(target_text[text_index]),
692  choice_it.data()->unichar_id() == target_text[text_index]
693  ? "Match" : "Ambig",
694  choices_pos, length);
695  }
696  SearchForText(choices, choices_pos + length, choices_length, target_text,
697  text_index + 1, rating + choice_rating, segmentation,
698  best_rating, best_segmentation);
699  if (applybox_debug > 3) {
700  tprintf("End recursion for %d=%s\n", target_text[text_index],
701  unicharset.id_to_unichar(target_text[text_index]));
702  }
703  }
704  segmentation->truncate(segmentation->size() - 1);
705  }
706 }
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
float rating() const
Definition: ratngs.h:80
UNICHARSET unicharset
Definition: ccutil.h:71
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:141
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:103
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:635
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void truncate(int size)
bool empty() const
Definition: genericvector.h:89
int push_back(T object)
Dict & getDict() override
int UNICHAR_ID
Definition: unichar.h:34
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:152
int size() const
Definition: genericvector.h:70
void tesseract::Tesseract::SearchWords ( PointerVector< WERD_RES > *  words)

Definition at line 250 of file linerec.cpp.

250  {
251  // Run the segmentation search on the network outputs and make a BoxWord
252  // for each of the output words.
253  // If we drop a word as junk, then there is always a space in front of the
254  // next.
255  const Dict* stopper_dict = lstm_recognizer_->GetDict();
256  if (stopper_dict == nullptr) stopper_dict = &getDict();
257  bool any_nonspace_delimited = false;
258  for (int w = 0; w < words->size(); ++w) {
259  WERD_RES* word = (*words)[w];
260  if (word->best_choice != nullptr &&
262  any_nonspace_delimited = true;
263  break;
264  }
265  }
266  for (int w = 0; w < words->size(); ++w) {
267  WERD_RES* word = (*words)[w];
268  if (word->best_choice == nullptr) {
269  // It is a dud.
270  word->SetupFake(lstm_recognizer_->GetUnicharset());
271  } else {
272  // Set the best state.
273  for (int i = 0; i < word->best_choice->length(); ++i) {
274  int length = word->best_choice->state(i);
275  word->best_state.push_back(length);
276  }
277  word->reject_map.initialise(word->best_choice->length());
278  word->tess_failed = false;
279  word->tess_accepted = true;
280  word->tess_would_adapt = false;
281  word->done = true;
282  word->tesseract = this;
283  float word_certainty = std::min(word->space_certainty,
284  word->best_choice->certainty());
285  word_certainty *= kCertaintyScale;
286  if (getDict().stopper_debug_level >= 1) {
287  tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
288  word->best_choice->certainty(), word->space_certainty,
289  std::min(word->space_certainty, word->best_choice->certainty()) *
291  word_certainty);
292  word->best_choice->print();
293  }
294  word->best_choice->set_certainty(word_certainty);
295 
296  word->tess_accepted = stopper_dict->AcceptableResult(word);
297  }
298  }
299 }
const float kCertaintyScale
Definition: linerec.cpp:36
bool tess_failed
Definition: pageres.h:287
bool done
Definition: pageres.h:297
const Dict * GetDict() const
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:356
const UNICHARSET & GetUnicharset() const
float space_certainty
Definition: pageres.h:315
bool ContainsAnyNonSpaceDelimited() const
Definition: ratngs.h:514
int length() const
Definition: ratngs.h:303
bool tess_accepted
Definition: pageres.h:295
tesseract::Tesseract * tesseract
Definition: pageres.h:281
bool tess_would_adapt
Definition: pageres.h:296
REJMAP reject_map
Definition: pageres.h:286
int stopper_debug_level
Definition: dict.h:628
void print() const
Definition: ratngs.h:580
void initialise(int16_t length)
Definition: rejctmap.cpp:273
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int push_back(T object)
Dict & getDict() override
float certainty() const
Definition: ratngs.h:330
WERD_CHOICE * best_choice
Definition: pageres.h:234
GenericVector< int > best_state
Definition: pageres.h:270
int size() const
Definition: genericvector.h:70
void set_certainty(float new_val)
Definition: ratngs.h:372
int state(int index) const
Definition: ratngs.h:319
int tesseract::Tesseract::SegmentPage ( const STRING *  input_file,
BLOCK_LIST *  blocks,
Tesseract *  osd_tess,
OSResults *  osr 
)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be nullptr. On return the blocks list owns all the constructed page layout.

Definition at line 99 of file pagesegmain.cpp.

100  {
101  ASSERT_HOST(pix_binary_ != nullptr);
102  int width = pixGetWidth(pix_binary_);
103  int height = pixGetHeight(pix_binary_);
104  // Get page segmentation mode.
105  auto pageseg_mode = static_cast<PageSegMode>(
106  static_cast<int>(tessedit_pageseg_mode));
107  // If a UNLV zone file can be found, use that instead of segmentation.
108  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
109  input_file != nullptr && input_file->length() > 0) {
110  STRING name = *input_file;
111  const char* lastdot = strrchr(name.string(), '.');
112  if (lastdot != nullptr)
113  name[lastdot - name.string()] = '\0';
114  read_unlv_file(name, width, height, blocks);
115  }
116  if (blocks->empty()) {
117  // No UNLV file present. Work according to the PageSegMode.
118  // First make a single block covering the whole image.
119  BLOCK_IT block_it(blocks);
120  auto* block = new BLOCK("", true, 0, 0, 0, 0, width, height);
121  block->set_right_to_left(right_to_left());
122  block_it.add_to_end(block);
123  } else {
124  // UNLV file present. Use PSM_SINGLE_BLOCK.
125  pageseg_mode = PSM_SINGLE_BLOCK;
126  }
127  // The diacritic_blobs holds noise blobs that may be diacritics. They
128  // are separated out on areas of the image that seem noisy and short-circuit
129  // the layout process, going straight from the initial partition creation
130  // right through to after word segmentation, where they are added to the
131  // rej_cblobs list of the most appropriate word. From there classification
132  // will determine whether they are used.
133  BLOBNBOX_LIST diacritic_blobs;
134  int auto_page_seg_ret_val = 0;
135  TO_BLOCK_LIST to_blocks;
136  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
137  PSM_SPARSE(pageseg_mode)) {
138  auto_page_seg_ret_val = AutoPageSeg(
139  pageseg_mode, blocks, &to_blocks,
140  enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
141  if (pageseg_mode == PSM_OSD_ONLY)
142  return auto_page_seg_ret_val;
143  // To create blobs from the image region bounds uncomment this line:
144  // to_blocks.clear(); // Uncomment to go back to the old mode.
145  } else {
146  deskew_ = FCOORD(1.0f, 0.0f);
147  reskew_ = FCOORD(1.0f, 0.0f);
148  if (pageseg_mode == PSM_CIRCLE_WORD) {
149  Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
150  if (pixcleaned != nullptr) {
151  pixDestroy(&pix_binary_);
152  pix_binary_ = pixcleaned;
153  }
154  }
155  }
156 
157  if (auto_page_seg_ret_val < 0) {
158  return -1;
159  }
160 
161  if (blocks->empty()) {
163  tprintf("Empty page\n");
164  return 0; // AutoPageSeg found an empty page.
165  }
166  bool splitting =
168  bool cjk_mode = textord_use_cjk_fp_model;
169 
170  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
171  pix_thresholds_, pix_grey_, splitting || cjk_mode,
172  &diacritic_blobs, blocks, &to_blocks);
173  return auto_page_seg_ret_val;
174 }
Treat the image as a single word in a circle.
Definition: publictypes.h:175
Definition: strngs.h:45
Definition: points.h:188
Orientation and script detection only.
Definition: publictypes.h:164
int32_t length() const
Definition: strngs.cpp:189
bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:203
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
bool PSM_SPARSE(int pageseg_mode)
Definition: publictypes.h:200
bool right_to_left() const
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:197
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int textord_debug_tabfind
Definition: alignedblob.cpp:27
#define ASSERT_HOST(x)
Definition: errcode.h:88
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Pix *binary_pix, Pix *thresholds_pix, Pix *grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: textord.cpp:230
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:172
bool read_unlv_file(STRING name, int32_t xsize, int32_t ysize, BLOCK_LIST *blocks)
Definition: blread.cpp:32
Definition: ocrblock.h:29
bool tesseract::Tesseract::SelectGoodDiacriticOutlines ( int  pass,
float  certainty_threshold,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
const GenericVector< C_OUTLINE * > &  outlines,
int  num_outlines,
GenericVector< bool > *  ok_outlines 
)

Definition at line 1147 of file control.cpp.

1150  {
1151 #ifndef DISABLED_LEGACY_ENGINE
1152  STRING best_str;
1153  float target_cert = certainty_threshold;
1154  if (blob != nullptr) {
1155  float target_c2;
1156  target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1157  if (debug_noise_removal) {
1158  tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
1159  target_cert, target_c2);
1160  blob->bounding_box().print();
1161  }
1162  target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1163  }
1164  GenericVector<bool> test_outlines = *ok_outlines;
1165  // Start with all the outlines in.
1166  STRING all_str;
1167  GenericVector<bool> best_outlines = *ok_outlines;
1168  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1169  pr_it, blob, &all_str);
1170  if (debug_noise_removal) {
1171  TBOX ol_box;
1172  for (int i = 0; i < test_outlines.size(); ++i) {
1173  if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1174  }
1175  tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1176  all_str.string(), best_cert, best_cert - target_cert);
1177  ol_box.print();
1178  }
1179  // Iteratively zero out the bit that improves the certainty the most, until
1180  // we get past the threshold, have zero bits, or fail to improve.
1181  int best_index = 0; // To zero out.
1182  while (num_outlines > 1 && best_index >= 0 &&
1183  (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1184  // Find the best bit to zero out.
1185  best_index = -1;
1186  for (int i = 0; i < outlines.size(); ++i) {
1187  if (test_outlines[i]) {
1188  test_outlines[i] = false;
1189  STRING str;
1190  float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1191  pr_it, blob, &str);
1192  if (debug_noise_removal) {
1193  TBOX ol_box;
1194  for (int j = 0; j < outlines.size(); ++j) {
1195  if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1196  tprintf("%d", test_outlines[j]);
1197  }
1198  tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
1199  cert, cert - target_cert);
1200  ol_box.print();
1201  }
1202  if (cert > best_cert) {
1203  best_cert = cert;
1204  best_index = i;
1205  best_outlines = test_outlines;
1206  }
1207  test_outlines[i] = true;
1208  }
1209  }
1210  if (best_index >= 0) {
1211  test_outlines[best_index] = false;
1212  --num_outlines;
1213  }
1214  }
1215  if (best_cert >= target_cert) {
1216  // Save the best combination.
1217  *ok_outlines = best_outlines;
1218  if (debug_noise_removal) {
1219  tprintf("%s noise combination ", blob ? "Adding" : "New");
1220  for (int i = 0; i < best_outlines.size(); ++i) {
1221  tprintf("%d", best_outlines[i]);
1222  }
1223  tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1224  target_cert);
1225  }
1226  return true;
1227  }
1228 #endif // ndef DISABLED_LEGACY_ENGINE
1229  return false;
1230 }
Definition: rect.h:34
void print() const
Definition: rect.h:278
Definition: strngs.h:45
TBOX bounding_box() const
Definition: stepblob.cpp:253
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1282
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1234
int size() const
Definition: genericvector.h:70
void tesseract::Tesseract::set_done ( WERD_RES *  word,
int16_t  pass 
)
void tesseract::Tesseract::set_pix_grey ( Pix *  grey_pix)
inline

Definition at line 208 of file tesseractclass.h.

208  {
209  pixDestroy(&pix_grey_);
210  pix_grey_ = grey_pix;
211  }
void tesseract::Tesseract::set_pix_original ( Pix *  original_pix)
inline

Definition at line 216 of file tesseractclass.h.

216  {
217  pixDestroy(&pix_original_);
218  pix_original_ = original_pix;
219  // Clone to sublangs as well.
220  for (int i = 0; i < sub_langs_.size(); ++i) {
221  sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
222  : nullptr);
223  }
224  }
void tesseract::Tesseract::set_pix_thresholds ( Pix *  thresholds)
inline

Definition at line 242 of file tesseractclass.h.

242  {
243  pixDestroy(&pix_thresholds_);
244  pix_thresholds_ = thresholds;
245  }
void tesseract::Tesseract::set_source_resolution ( int  ppi)
inline

Definition at line 249 of file tesseractclass.h.

249  {
250  source_resolution_ = ppi;
251  }
void tesseract::Tesseract::set_unlv_suspects ( WERD_RES word)

Definition at line 273 of file output.cpp.

273  {
274  int len = word_res->reject_map.length();
275  const WERD_CHOICE &word = *(word_res->best_choice);
276  const UNICHARSET &uchset = *word.unicharset();
277  int i;
278  float rating_per_ch;
279 
280  if (suspect_level == 0) {
281  for (i = 0; i < len; i++) {
282  if (word_res->reject_map[i].rejected())
283  word_res->reject_map[i].setrej_minimal_rej_accept();
284  }
285  return;
286  }
287 
288  if (suspect_level >= 3)
289  return; //Use defaults
290 
291  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
292 
293  if (safe_dict_word(word_res) &&
294  (count_alphas(word) > suspect_short_words)) {
295  /* Unreject alphas in dictionary words */
296  for (i = 0; i < len; ++i) {
297  if (word_res->reject_map[i].rejected() &&
298  uchset.get_isalpha(word.unichar_id(i)))
299  word_res->reject_map[i].setrej_minimal_rej_accept();
300  }
301  }
302 
303  rating_per_ch = word.rating() / word_res->reject_map.length();
304 
305  if (rating_per_ch >= suspect_rating_per_ch)
306  return; // Don't touch bad ratings
307 
308  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
309  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
310  for (i = 0; i < len; ++i) {
311  if (word_res->reject_map[i].rejected() &&
312  (!uchset.eq(word.unichar_id(i), " ")))
313  word_res->reject_map[i].setrej_minimal_rej_accept();
314  }
315  }
316 
317  for (i = 0; i < len; i++) {
318  if (word_res->reject_map[i].rejected()) {
319  if (word_res->reject_map[i].flag(R_DOC_REJ))
320  word_res->reject_map[i].setrej_minimal_rej_accept();
321  if (word_res->reject_map[i].flag(R_BLOCK_REJ))
322  word_res->reject_map[i].setrej_minimal_rej_accept();
323  if (word_res->reject_map[i].flag(R_ROW_REJ))
324  word_res->reject_map[i].setrej_minimal_rej_accept();
325  }
326  }
327 
328  if (suspect_level == 2)
329  return;
330 
331  if (!suspect_constrain_1Il ||
332  (word_res->reject_map.length() <= suspect_short_words)) {
333  for (i = 0; i < len; i++) {
334  if (word_res->reject_map[i].rejected()) {
335  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
336  word_res->reject_map[i].flag(R_POSTNN_1IL)))
337  word_res->reject_map[i].setrej_minimal_rej_accept();
338 
339  if (!suspect_constrain_1Il &&
340  word_res->reject_map[i].flag(R_MM_REJECT))
341  word_res->reject_map[i].setrej_minimal_rej_accept();
342  }
343  }
344  }
345 
346  if (acceptable_word_string(*word_res->uch_set,
347  word.unichar_string().string(),
348  word.unichar_lengths().string()) !=
349  AC_UNACCEPTABLE ||
350  acceptable_number_string(word.unichar_string().string(),
351  word.unichar_lengths().string())) {
352  if (word_res->reject_map.length() > suspect_short_words) {
353  for (i = 0; i < len; i++) {
354  if (word_res->reject_map[i].rejected() &&
355  (!word_res->reject_map[i].perm_rejected() ||
356  word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
357  word_res->reject_map[i].flag (R_POSTNN_1IL) ||
358  word_res->reject_map[i].flag (R_MM_REJECT))) {
359  word_res->reject_map[i].setrej_minimal_rej_accept();
360  }
361  }
362  }
363  }
364 }
Unacceptable word.
Definition: control.h:30
int16_t count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:366
const STRING & unichar_string() const
Definition: ratngs.h:541
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608
bool acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:387
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
float rating() const
Definition: ratngs.h:327
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1759
const char * string() const
Definition: strngs.cpp:194
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
const STRING & unichar_lengths() const
Definition: ratngs.h:548
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
void tesseract::Tesseract::set_word_fonts ( WERD_RES word)

set_word_fonts

Get the fonts for the word.

Definition at line 1976 of file control.cpp.

1976  {
1977  // Don't try to set the word fonts for an lstm word, as the configs
1978  // will be meaningless.
1979  if (word->chopped_word == nullptr) return;
1980  ASSERT_HOST(word->best_choice != nullptr);
1981 
1982 #ifndef DISABLED_LEGACY_ENGINE
1983  const int fontinfo_size = get_fontinfo_table().size();
1984  if (fontinfo_size == 0) return;
1985  GenericVector<int> font_total_score;
1986  font_total_score.init_to_size(fontinfo_size, 0);
1987 
1988  word->italic = 0;
1989  word->bold = 0;
1990  // Compute the font scores for the word
1991  if (tessedit_debug_fonts) {
1992  tprintf("Examining fonts in %s\n",
1993  word->best_choice->debug_string().string());
1994  }
1995  for (int b = 0; b < word->best_choice->length(); ++b) {
1996  const BLOB_CHOICE* choice = word->GetBlobChoice(b);
1997  if (choice == nullptr) continue;
1998  const GenericVector<ScoredFont>& fonts = choice->fonts();
1999  for (int f = 0; f < fonts.size(); ++f) {
2000  const int fontinfo_id = fonts[f].fontinfo_id;
2001  if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
2002  font_total_score[fontinfo_id] += fonts[f].score;
2003  }
2004  }
2005  }
2006  // Find the top and 2nd choice for the word.
2007  int score1 = 0, score2 = 0;
2008  int16_t font_id1 = -1, font_id2 = -1;
2009  for (int f = 0; f < fontinfo_size; ++f) {
2010  if (tessedit_debug_fonts && font_total_score[f] > 0) {
2011  tprintf("Font %s, total score = %d\n",
2012  fontinfo_table_.get(f).name, font_total_score[f]);
2013  }
2014  if (font_total_score[f] > score1) {
2015  score2 = score1;
2016  font_id2 = font_id1;
2017  score1 = font_total_score[f];
2018  font_id1 = f;
2019  } else if (font_total_score[f] > score2) {
2020  score2 = font_total_score[f];
2021  font_id2 = f;
2022  }
2023  }
2024  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : nullptr;
2025  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : nullptr;
2026  // Each score has a limit of UINT16_MAX, so divide by that to get the number
2027  // of "votes" for that font, ie number of perfect scores.
2028  word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
2029  word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
2030  if (score1 > 0) {
2031  const FontInfo fi = fontinfo_table_.get(font_id1);
2032  if (tessedit_debug_fonts) {
2033  if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
2034  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
2035  fi.name, word->fontinfo_id_count,
2036  fontinfo_table_.get(font_id2).name,
2037  word->fontinfo_id2_count);
2038  } else {
2039  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
2040  fi.name, word->fontinfo_id_count);
2041  }
2042  }
2043  word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count;
2044  word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count;
2045  }
2046 #endif // ndef DISABLED_LEGACY_ENGINE
2047 }
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:528
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:754
bool is_italic() const
Definition: fontinfo.h:111
int length() const
Definition: ratngs.h:303
int8_t bold
Definition: pageres.h:301
void init_to_size(int size, const T &t)
const FontInfo * fontinfo2
Definition: pageres.h:304
int8_t fontinfo_id2_count
Definition: pageres.h:306
const FontInfo * fontinfo
Definition: pageres.h:303
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:92
int8_t fontinfo_id_count
Definition: pageres.h:305
WERD_CHOICE * best_choice
Definition: pageres.h:234
#define ASSERT_HOST(x)
Definition: errcode.h:88
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
TWERD * chopped_word
Definition: pageres.h:214
const STRING debug_string() const
Definition: ratngs.h:505
int size() const
Definition: genericvector.h:70
bool is_bold() const
Definition: fontinfo.h:112
int8_t italic
Definition: pageres.h:300
void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 617 of file tesseractclass.cpp.

617  {
618  // Set the white and blacklists (if any)
619  unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
620  tessedit_char_whitelist.string(),
621  tessedit_char_unblacklist.string());
622  if (lstm_recognizer_) {
623  UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (lstm_recognizer_->GetUnicharset());
624  lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
625  tessedit_char_whitelist.string(),
626  tessedit_char_unblacklist.string());
627  }
628  // Black and white lists should apply to all loaded classifiers.
629  for (int i = 0; i < sub_langs_.size(); ++i) {
630  sub_langs_[i]->unicharset.set_black_and_whitelist(
631  tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
632  tessedit_char_unblacklist.string());
633  if (sub_langs_[i]->lstm_recognizer_) {
634  UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (sub_langs_[i]->lstm_recognizer_->GetUnicharset());
635  lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
636  tessedit_char_whitelist.string(),
637  tessedit_char_unblacklist.string());
638  }
639  }
640 }
const UNICHARSET & GetUnicharset() const
UNICHARSET unicharset
Definition: ccutil.h:71
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
void tesseract::Tesseract::SetEquationDetect ( EquationDetect detector)

Definition at line 594 of file tesseractclass.cpp.

594  {
595  equ_detect_ = detector;
596  equ_detect_->SetLangTesseract(this);
597 }
void SetLangTesseract(Tesseract *lang_tesseract)
void tesseract::Tesseract::SetScaledColor ( int  factor,
Pix *  color 
)
inline

Definition at line 264 of file tesseractclass.h.

264  {
265  scaled_factor_ = factor;
266  scaled_color_ = color;
267  }
void tesseract::Tesseract::SetupAllWordsPassN ( int  pass_n,
const TBOX target_word_box,
const char *  word_config,
PAGE_RES page_res,
GenericVector< WordData > *  words 
)

If tesseract is to be run, sets the words up ready for it.

Definition at line 154 of file control.cpp.

158  {
159  // Prepare all the words.
160  PAGE_RES_IT page_res_it(page_res);
161  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
162  page_res_it.forward()) {
163  if (target_word_box == nullptr ||
164  ProcessTargetWord(page_res_it.word()->word->bounding_box(),
165  *target_word_box, word_config, 1)) {
166  words->push_back(WordData(page_res_it));
167  }
168  }
169  // Setup all the words for recognition with polygonal approximation.
170  for (int w = 0; w < words->size(); ++w) {
171  SetupWordPassN(pass_n, &(*words)[w]);
172  if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
173  }
174 }
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:120
int push_back(T object)
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
int size() const
Definition: genericvector.h:70
PAGE_RES * tesseract::Tesseract::SetupApplyBoxes ( const GenericVector< TBOX > &  boxes,
BLOCK_LIST *  block_list 
)

Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.

Definition at line 207 of file applybox.cpp.

208  {
209  PreenXHeights(block_list);
210  // Strip all fuzzy space markers to simplify the PAGE_RES.
211  BLOCK_IT b_it(block_list);
212  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
213  BLOCK* block = b_it.data();
214  ROW_IT r_it(block->row_list());
215  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
216  ROW* row = r_it.data();
217  WERD_IT w_it(row->word_list());
218  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
219  WERD* word = w_it.data();
220  if (word->cblob_list()->empty()) {
221  delete w_it.extract();
222  } else {
223  word->set_flag(W_FUZZY_SP, false);
224  word->set_flag(W_FUZZY_NON, false);
225  }
226  }
227  }
228  }
229  auto* page_res = new PAGE_RES(false, block_list, nullptr);
230  PAGE_RES_IT pr_it(page_res);
231  WERD_RES* word_res;
232  while ((word_res = pr_it.word()) != nullptr) {
233  MaximallyChopWord(boxes, pr_it.block()->block,
234  pr_it.row()->row, word_res);
235  pr_it.forward();
236  }
237  return page_res;
238 }
Definition: werd.h:56
fuzzy nonspace
Definition: werd.h:40
fuzzy space
Definition: werd.h:39
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:243
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:181
WERD_LIST * word_list()
Definition: ocrrow.h:55
Definition: ocrrow.h:36
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:117
WERD * word
Definition: pageres.h:188
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
Definition: ocrblock.h:29
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
Tesseract osd_tess,
OSResults osr,
TO_BLOCK_LIST *  to_blocks,
Pix **  photo_mask_pix,
Pix **  music_mask_pix 
)

Sets up auto page segmentation, determines the orientation, and corrects it. This is a somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a nullptr pointer that will be filled on return with the Leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder may be nullptr (the listing below returns nullptr for PSM_OSD_ONLY, and never creates a finder when the block's line size is below 2); otherwise it must be deleted after use.

Definition at line 270 of file pagesegmain.cpp.

273  {
274  int vertical_x = 0;
275  int vertical_y = 1;
276  TabVector_LIST v_lines;
277  TabVector_LIST h_lines;
278  ICOORD bleft(0, 0);
279 
280  ASSERT_HOST(pix_binary_ != nullptr);
281  if (tessedit_dump_pageseg_images) {
282  pixa_debug_.AddPix(pix_binary_, "PageSegInput");
283  }
284  // Leptonica is used to find the rule/separator lines in the input.
285  LineFinder::FindAndRemoveLines(source_resolution_,
286  textord_tabfind_show_vlines, pix_binary_,
287  &vertical_x, &vertical_y, music_mask_pix,
288  &v_lines, &h_lines);
289  if (tessedit_dump_pageseg_images) {
290  pixa_debug_.AddPix(pix_binary_, "NoLines");
291  }
292  // Leptonica is used to find a mask of the photo regions in the input.
293  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
294  if (tessedit_dump_pageseg_images) {
295  pixa_debug_.AddPix(pix_binary_, "NoImages");
296  }
297  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
298 
299  // The rest of the algorithm uses the usual connected components.
300  textord_.find_components(pix_binary_, blocks, to_blocks);
301 
302  TO_BLOCK_IT to_block_it(to_blocks);
303  // There must be exactly one input block.
304  // TODO(rays) handle new textline finding with a UNLV zone file.
305  ASSERT_HOST(to_blocks->singleton());
306  TO_BLOCK* to_block = to_block_it.data();
307  TBOX blkbox = to_block->block->pdblk.bounding_box();
308  ColumnFinder* finder = nullptr;
309  int estimated_resolution = source_resolution_;
310  if (source_resolution_ == kMinCredibleResolution) {
311  // Try to estimate resolution from typical body text size.
312  int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
313  if (res > estimated_resolution && res < kMaxCredibleResolution) {
314  estimated_resolution = res;
315  tprintf("Estimating resolution as %d\n", estimated_resolution);
316  }
317  }
318 
319  if (to_block->line_size >= 2) {
320  finder = new ColumnFinder(static_cast<int>(to_block->line_size),
321  blkbox.botleft(), blkbox.topright(),
322  estimated_resolution, textord_use_cjk_fp_model,
323  textord_tabfind_aligned_gap_fraction, &v_lines,
324  &h_lines, vertical_x, vertical_y);
325 
326  finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
327 
328 #ifndef DISABLED_LEGACY_ENGINE
329 
330  if (equ_detect_) {
331  equ_detect_->LabelSpecialText(to_block);
332  }
333 
334  BLOBNBOX_CLIST osd_blobs;
335  // osd_orientation is the number of 90 degree rotations to make the
336  // characters upright. (See osdetect.h for precise definition.)
337  // We want the text lines horizontal, (vertical text indicates vertical
338  // textlines) which may conflict (eg vertically written CJK).
339  int osd_orientation = 0;
340  bool vertical_text = textord_tabfind_force_vertical_text ||
341  pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
342  if (!vertical_text && textord_tabfind_vertical_text &&
343  PSM_ORIENTATION_ENABLED(pageseg_mode)) {
344  vertical_text =
345  finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
346  to_block, &osd_blobs);
347  }
348  if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
349  GenericVector<int> osd_scripts;
350  if (osd_tess != this) {
351  // We are running osd as part of layout analysis, so constrain the
352  // scripts to those allowed by *this.
353  AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
354  for (int s = 0; s < sub_langs_.size(); ++s) {
355  AddAllScriptsConverted(sub_langs_[s]->unicharset,
356  osd_tess->unicharset, &osd_scripts);
357  }
358  }
359  os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
360  if (pageseg_mode == PSM_OSD_ONLY) {
361  delete finder;
362  return nullptr;
363  }
364  osd_orientation = osr->best_result.orientation_id;
365  double osd_score = osr->orientations[osd_orientation];
366  double osd_margin = min_orientation_margin * 2;
367  for (int i = 0; i < 4; ++i) {
368  if (i != osd_orientation &&
369  osd_score - osr->orientations[i] < osd_margin) {
370  osd_margin = osd_score - osr->orientations[i];
371  }
372  }
373  int best_script_id = osr->best_result.script_id;
374  const char* best_script_str =
375  osd_tess->unicharset.get_script_from_script_id(best_script_id);
376  bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
377  best_script_id == osd_tess->unicharset.hiragana_sid() ||
378  best_script_id == osd_tess->unicharset.katakana_sid() ||
379  strcmp("Japanese", best_script_str) == 0 ||
380  strcmp("Korean", best_script_str) == 0 ||
381  strcmp("Hangul", best_script_str) == 0;
382  if (cjk) {
383  finder->set_cjk_script(true);
384  }
385  if (osd_margin < min_orientation_margin) {
386  // The margin is weak.
387  if (!cjk && !vertical_text && osd_orientation == 2) {
388  // upside down latin text is improbable with such a weak margin.
389  tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
390  "Don't rotate.\n", osd_margin);
391  osd_orientation = 0;
392  } else {
393  tprintf(
394  "OSD: Weak margin (%.2f) for %d blob text block, "
395  "but using orientation anyway: %d\n",
396  osd_margin, osd_blobs.length(), osd_orientation);
397  }
398  }
399  }
400  osd_blobs.shallow_clear();
401  finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
402 
403 #endif // ndef DISABLED_LEGACY_ENGINE
404  }
405 
406  return finder;
407 }
int IntCastRounded(double x)
Definition: helpers.h:175
static void FindAndRemoveLines(int resolution, bool debug, Pix *pix, int *vertical_x, int *vertical_y, Pix **pix_music_mask, TabVector_LIST *v_lines, TabVector_LIST *h_lines)
Definition: linefind.cpp:243
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:219
Definition: rect.h:34
double textord_tabfind_aligned_gap_fraction
Orientation and script detection only.
Definition: publictypes.h:164
double textord_tabfind_vertical_text_ratio
constexpr int kResolutionEstimationFactor
Definition: publictypes.h:45
UNICHARSET unicharset
Definition: ccutil.h:71
void AddPix(const Pix *pix, const char *caption)
Definition: debugpixa.h:26
integer coordinate
Definition: points.h:31
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
float orientations[4]
Definition: osdetect.h:76
const ICOORD & botleft() const
Definition: rect.h:92
OSBestResult best_result
Definition: osdetect.h:81
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:197
constexpr int kMaxCredibleResolution
Definition: publictypes.h:40
const ICOORD & topright() const
Definition: rect.h:104
bool PSM_ORIENTATION_ENABLED(int pageseg_mode)
Definition: publictypes.h:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int LabelSpecialText(TO_BLOCK *to_block) override
static Pix * FindImages(Pix *pix, DebugPixa *pixa_debug)
Definition: imagefind.cpp:62
#define ASSERT_HOST(x)
Definition: errcode.h:88
constexpr int kMinCredibleResolution
Definition: publictypes.h:38
int orientation_id
Definition: osdetect.h:43
int script_id
Definition: osdetect.h:44
bool textord_tabfind_force_vertical_text
int os_detect_blobs(const GenericVector< int > *allowed_scripts, BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:278
void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 429 of file tessedit.cpp.

429  {
430  // Note that we can get away with bitwise copying FontInfo in
431  // all_fonts, as it is a temporary structure and we avoid setting the
432  // delete callback.
433  UnicityTable<FontInfo> all_fonts;
434  all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
435 
436  // Create the universal ID table.
437  CollectFonts(get_fontinfo_table(), &all_fonts);
438  for (int i = 0; i < sub_langs_.size(); ++i) {
439  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
440  }
441  // Assign ids from the table to each font table.
442  AssignIds(all_fonts, &get_fontinfo_table());
443  for (int i = 0; i < sub_langs_.size(); ++i) {
444  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
445  }
446  font_table_size_ = all_fonts.size();
447 }
void set_compare_callback(TessResultCallback2< bool, T const &, T const & > *cb)
int size() const
Return the size used.
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:119
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
void tesseract::Tesseract::SetupWordPassN ( int  pass_n,
WordData word 
)

Definition at line 177 of file control.cpp.

177  {
178  if (pass_n == 1 || !word->word->done) {
179  if (pass_n == 1) {
180  word->word->SetupForRecognition(unicharset, this, BestPix(),
181  tessedit_ocr_engine_mode, nullptr,
182  classify_bln_numeric_mode,
183  textord_use_cjk_fp_model,
184  poly_allow_detailed_fx,
185  word->row, word->block);
186  } else if (pass_n == 2) {
187  // TODO(rays) Should we do this on pass1 too?
188  word->word->caps_height = 0.0;
189  if (word->word->x_height == 0.0f)
190  word->word->x_height = word->row->x_height();
191  }
192  word->lang_words.truncate(0);
193  for (int s = 0; s <= sub_langs_.size(); ++s) {
194  // The sub_langs_.size() entry is for the master language.
195  Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
196  auto* word_res = new WERD_RES;
197  word_res->InitForRetryRecognition(*word->word);
198  word->lang_words.push_back(word_res);
199  // LSTM doesn't get setup for pass2.
200  if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
201  word_res->SetupForRecognition(
202  lang_t->unicharset, lang_t, BestPix(),
203  lang_t->tessedit_ocr_engine_mode, nullptr,
204  lang_t->classify_bln_numeric_mode,
205  lang_t->textord_use_cjk_fp_model,
206  lang_t->poly_allow_detailed_fx, word->row, word->block);
207  }
208  }
209  }
210 }
UNICHARSET unicharset
Definition: ccutil.h:71
Pix * BestPix() const
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:281
bool classify_bln_numeric_mode
Definition: classify.h:540
void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST *  blocks)
int tesseract::Tesseract::source_resolution ( ) const
inline

Definition at line 246 of file tesseractclass.h.

246  {
247  return source_resolution_;
248  }
void tesseract::Tesseract::split_and_recog_word ( WERD_RES word)

Definition at line 138 of file tfacepp.cpp.

138  {
139  // Find the biggest blob gap in the chopped_word.
140  int bestgap = -INT32_MAX;
141  int split_index = 0;
142  for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
143  TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
144  TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
145  int gap = blob_box.left() - prev_box.right();
146  if (gap > bestgap) {
147  bestgap = gap;
148  split_index = b;
149  }
150  }
151  ASSERT_HOST(split_index > 0);
152 
153  WERD_RES *word2 = nullptr;
154  BlamerBundle *orig_bb = nullptr;
155  split_word(word, split_index, &word2, &orig_bb);
156 
157  // Recognize the first part of the word.
158  recog_word_recursive(word);
159  // Recognize the second part of the word.
160  recog_word_recursive(word2);
161 
162  join_words(word, word2, orig_bb);
163 }
Definition: rect.h:34
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:176
int16_t right() const
Definition: rect.h:79
int NumBlobs() const
Definition: blobs.h:427
#define ASSERT_HOST(x)
Definition: errcode.h:88
int16_t left() const
Definition: rect.h:72
TWERD * chopped_word
Definition: pageres.h:214
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:234
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
void tesseract::Tesseract::split_word ( WERD_RES word,
int  split_pt,
WERD_RES **  right_piece,
BlamerBundle **  orig_blamer_bundle 
) const

Definition at line 176 of file tfacepp.cpp.

179  {
180  ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
181 
182  // Save a copy of the blamer bundle so we can try to reconstruct it below.
183  BlamerBundle *orig_bb =
184  word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
185 
186  auto *word2 = new WERD_RES(*word);
187 
188  // blow away the copied chopped_word, as we want to work with
189  // the blobs from the input chopped_word so seam_arrays can be merged.
190  TWERD *chopped = word->chopped_word;
191  auto *chopped2 = new TWERD;
192  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
193  for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
194  chopped2->blobs.push_back(chopped->blobs[i]);
195  }
196  chopped->blobs.truncate(split_pt);
197  word->chopped_word = nullptr;
198  delete word2->chopped_word;
199  word2->chopped_word = nullptr;
200 
201  const UNICHARSET &unicharset = *word->uch_set;
202  word->ClearResults();
203  word2->ClearResults();
204  word->chopped_word = chopped;
205  word2->chopped_word = chopped2;
206  word->SetupBasicsFromChoppedWord(unicharset);
207  word2->SetupBasicsFromChoppedWord(unicharset);
208 
209  // Try to adjust the blamer bundle.
210  if (orig_bb != nullptr) {
211  // TODO(rays) Looks like a leak to me.
212  // orig_bb should take, rather than copy.
213  word->blamer_bundle = new BlamerBundle();
214  word2->blamer_bundle = new BlamerBundle();
215  orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
216  word2->chopped_word->blobs[0]->bounding_box().left(),
217  wordrec_debug_blamer,
218  word->blamer_bundle, word2->blamer_bundle);
219  }
220 
221  *right_piece = word2;
222  *orig_blamer_bundle = orig_bb;
223 }
Definition: blobs.h:397
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
TBOX bounding_box() const
Definition: blobs.cpp:472
UNICHARSET unicharset
Definition: ccutil.h:71
const UNICHARSET * uch_set
Definition: pageres.h:205
void reserve(int size)
void truncate(int size)
int16_t right() const
Definition: rect.h:79
int NumBlobs() const
Definition: blobs.h:427
void ClearResults()
Definition: pageres.cpp:1151
#define ASSERT_HOST(x)
Definition: errcode.h:88
TWERD * chopped_word
Definition: pageres.h:214
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:347
bool wordrec_debug_blamer
Definition: wordrec.h:236
T & back() const
BlamerBundle * blamer_bundle
Definition: pageres.h:245
bool tesseract::Tesseract::SubAndSuperscriptFix ( WERD_RES word)

Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.

This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.

Returns
Whether we modified the given word.

Definition at line 102 of file superscript.cpp.

102  {
103  if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
104  !word->best_choice) {
105  return false;
106  }
107  int num_leading, num_trailing;
108  ScriptPos sp_leading, sp_trailing;
109  float leading_certainty, trailing_certainty;
110  float avg_certainty, unlikely_threshold;
111 
112  // Calculate the number of whole suspicious characters at the edges.
113  GetSubAndSuperscriptCandidates(
114  word, &num_leading, &sp_leading, &leading_certainty,
115  &num_trailing, &sp_trailing, &trailing_certainty,
116  &avg_certainty, &unlikely_threshold);
117 
118  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
119  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
120 
121  int num_blobs = word->best_choice->length();
122 
123  // Calculate the remainder (partial characters) at the edges.
124  // This accounts for us having classified the best version of
125  // a word as [speaker?'] when it was instead [speaker.^{21}]
126  // (that is we accidentally thought the 2 was attached to the period).
127  int num_remainder_leading = 0, num_remainder_trailing = 0;
128  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
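129  int super_y_bottom =
130  kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
131  int sub_y_top =
132  kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;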
133  int last_word_char = num_blobs - 1 - num_trailing;
134  float last_char_certainty = word->best_choice->certainty(last_word_char);
135  if (word->best_choice->unichar_id(last_word_char) != 0 &&
136  last_char_certainty <= unlikely_threshold) {
137  ScriptPos rpos;
138  YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
139  nullptr, nullptr, &rpos, &num_remainder_trailing);
140  if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
141  if (num_remainder_trailing > 0 &&
142  last_char_certainty < trailing_certainty) {
143  trailing_certainty = last_char_certainty;
144  }
145  }
146  bool another_blob_available = (num_remainder_trailing == 0) ||
147  num_leading + num_trailing + 1 < num_blobs;
148  int first_char_certainty = word->best_choice->certainty(num_leading);
149  if (another_blob_available &&
150  word->best_choice->unichar_id(num_leading) != 0 &&
151  first_char_certainty <= unlikely_threshold) {
152  ScriptPos lpos;
153  YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
154  &lpos, &num_remainder_leading, nullptr, nullptr);
155  if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
156  if (num_remainder_leading > 0 &&
157  first_char_certainty < leading_certainty) {
158  leading_certainty = first_char_certainty;
159  }
160  }
161  }
162 
163  // If nothing to do, bail now.
164  if (num_leading + num_trailing +
165  num_remainder_leading + num_remainder_trailing == 0) {
166  return false;
167  }
168 
169  if (superscript_debug >= 1) {
170  tprintf("Candidate for superscript detection: %s (",
171  word->best_choice->unichar_string().string());
172  if (num_leading || num_remainder_leading) {
173  tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
174  leading_pos);
175  }
176  if (num_trailing || num_remainder_trailing) {
177  tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
178  trailing_pos);
179  }
180  tprintf(")\n");
181  }
182  if (superscript_debug >= 3) {
183  word->best_choice->print();
184  }
185  if (superscript_debug >= 2) {
186  tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
187  avg_certainty, unlikely_threshold);
188  if (num_leading)
189  tprintf("Orig. leading (min): %.2f ", leading_certainty);
190  if (num_trailing)
191  tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
192  tprintf("\n");
193  }
194 
195  // We've now calculated the number of rebuilt blobs we want to carve off.
196  // However, split_word() works from TBLOBs in chopped_word, so we need to
197  // convert to those.
198  int num_chopped_leading =
199  LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
200  int num_chopped_trailing =
201  TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
202 
203  int retry_leading = 0;
204  int retry_trailing = 0;
205  bool is_good = false;
206  WERD_RES *revised = TrySuperscriptSplits(
207  num_chopped_leading, leading_certainty, sp_leading,
208  num_chopped_trailing, trailing_certainty, sp_trailing,
209  word, &is_good, &retry_leading, &retry_trailing);
210  if (is_good) {
211  word->ConsumeWordResults(revised);
212  } else if (retry_leading || retry_trailing) {
213  int retry_chopped_leading =
214  LeadingUnicharsToChopped(revised, retry_leading);
215  int retry_chopped_trailing =
216  TrailingUnicharsToChopped(revised, retry_trailing);
217  WERD_RES *revised2 = TrySuperscriptSplits(
218  retry_chopped_leading, leading_certainty, sp_leading,
219  retry_chopped_trailing, trailing_certainty, sp_trailing,
220  revised, &is_good, &retry_leading, &retry_trailing);
221  if (is_good) {
222  word->ConsumeWordResults(revised2);
223  }
224  delete revised2;
225  }
226  delete revised;
227  return is_good;
228 }
bool tess_failed
Definition: pageres.h:287
const int kBlnXHeight
Definition: normalis.h:24
const STRING & unichar_string() const
Definition: ratngs.h:541
int length() const
Definition: ratngs.h:303
repeated character
Definition: werd.h:38
void print() const
Definition: ratngs.h:580
const int kBlnBaselineOffset
Definition: normalis.h:25
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:769
float certainty() const
Definition: ratngs.h:330
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
WERD * word
Definition: pageres.h:188
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
bool tesseract::Tesseract::terrible_word_crunch ( WERD_RES * word,
GARBAGE_LEVEL  garbage_level 
)

Definition at line 503 of file docqual.cpp.

504  {
505  float rating_per_ch;
506  int adjusted_len;
507  int crunch_mode = 0;
508 
509  if ((word->best_choice->unichar_string().length() == 0) ||
510  (strspn(word->best_choice->unichar_string().string(), " ") ==
512  crunch_mode = 1;
513  else {
514  adjusted_len = word->reject_map.length ();
515  if (adjusted_len > crunch_rating_max)
516  adjusted_len = crunch_rating_max;
517  rating_per_ch = word->best_choice->rating () / adjusted_len;
518 
519  if (rating_per_ch > crunch_terrible_rating)
520  crunch_mode = 2;
521  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
522  crunch_mode = 3;
523  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
524  (garbage_level != G_OK))
525  crunch_mode = 4;
526  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
527  (garbage_level != G_OK))
528  crunch_mode = 5;
529  }
530  if (crunch_mode > 0) {
531  if (crunch_debug > 2) {
532  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
533  crunch_mode, word->best_choice->unichar_string().string());
534  }
535  return true;
536  }
537  else
538  return false;
539 }
uint32_t unsigned_size() const
Definition: strngs.h:72
int32_t length() const
Definition: rejctmap.h:223
const STRING & unichar_string() const
Definition: ratngs.h:541
int32_t length() const
Definition: strngs.cpp:189
REJMAP reject_map
Definition: pageres.h:286
float rating() const
Definition: ratngs.h:327
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
Definition: docqual.h:32
float certainty() const
Definition: ratngs.h:330
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool tesseract::Tesseract::tess_acceptable_word ( WERD_RES * word)

Definition at line 62 of file tessbox.cpp.

62  {
63  return getDict().AcceptableResult(word);
64 }
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:100
Dict & getDict() override
void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE * word_choice)

Definition at line 72 of file tessbox.cpp.

72  {
73  getDict().add_document_word(*word_choice);
74 }
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:653
Dict & getDict() override
void tesseract::Tesseract::tess_segment_pass_n ( int  pass_n,
WERD_RES * word 
)

Definition at line 32 of file tessbox.cpp.

32  {
33  int saved_enable_assoc = 0;
34  int saved_chop_enable = 0;
35 
36  if (word->word->flag(W_DONT_CHOP)) {
37  saved_enable_assoc = wordrec_enable_assoc;
38  saved_chop_enable = chop_enable;
39  wordrec_enable_assoc.set_value(0);
40  chop_enable.set_value(0);
41  }
42  if (pass_n == 1)
43  set_pass1();
44  else
45  set_pass2();
46  recog_word(word);
47  if (word->best_choice == nullptr)
48  word->SetupFake(*word->uch_set);
49  if (word->word->flag(W_DONT_CHOP)) {
50  wordrec_enable_assoc.set_value(saved_enable_assoc);
51  chop_enable.set_value(saved_chop_enable);
52  }
53 }
bool wordrec_enable_assoc
Definition: wordrec.h:199
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:40
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:356
void set_pass1()
Definition: tface.cpp:101
fixed pitch chopped
Definition: werd.h:37
const UNICHARSET * uch_set
Definition: pageres.h:205
void set_pass2()
Definition: tface.cpp:113
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
WERD * word
Definition: pageres.h:188
bool tesseract::Tesseract::TestNewNormalization ( int  original_misfits,
float  baseline_shift,
float  new_x_ht,
WERD_RES * word,
BLOCK * block,
ROW * row 
)

Definition at line 1533 of file control.cpp.

1535  {
1536  bool accept_new_x_ht = false;
1537  WERD_RES new_x_ht_word(word->word);
1538  if (word->blamer_bundle != nullptr) {
1539  new_x_ht_word.blamer_bundle = new BlamerBundle();
1540  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1541  }
1542  new_x_ht_word.x_height = new_x_ht;
1543  new_x_ht_word.baseline_shift = baseline_shift;
1544  new_x_ht_word.caps_height = 0.0;
1545  new_x_ht_word.SetupForRecognition(
1546  unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1548  poly_allow_detailed_fx, row, block);
1549  match_word_pass_n(2, &new_x_ht_word, row, block);
1550  if (!new_x_ht_word.tess_failed) {
1551  int new_misfits = CountMisfitTops(&new_x_ht_word);
1552  if (debug_x_ht_level >= 1) {
1553  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1554  original_misfits, word->x_height,
1555  new_misfits, new_x_ht);
1556  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1557  word->best_choice->rating(), word->best_choice->certainty(),
1558  new_x_ht_word.best_choice->rating(),
1559  new_x_ht_word.best_choice->certainty());
1560  }
1561  // The misfits must improve and either the rating or certainty.
1562  accept_new_x_ht = new_misfits < original_misfits &&
1563  (new_x_ht_word.best_choice->certainty() >
1564  word->best_choice->certainty() ||
1565  new_x_ht_word.best_choice->rating() <
1566  word->best_choice->rating());
1567  if (debug_x_ht_level >= 1) {
1568  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1569  }
1570  }
1571  if (accept_new_x_ht) {
1572  word->ConsumeWordResults(&new_x_ht_word);
1573  return true;
1574  }
1575  return false;
1576 }
UNICHARSET unicharset
Definition: ccutil.h:71
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1644
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1476
Pix * BestPix() const
float rating() const
Definition: ratngs.h:327
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:769
float certainty() const
Definition: ratngs.h:330
WERD_CHOICE * best_choice
Definition: pageres.h:234
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:70
float x_height
Definition: pageres.h:310
WERD * word
Definition: pageres.h:188
bool classify_bln_numeric_mode
Definition: classify.h:540
BlamerBundle * blamer_bundle
Definition: pageres.h:245
const Textord& tesseract::Tesseract::textord ( ) const
inline

Definition at line 268 of file tesseractclass.h.

268  {
269  return textord_;
270  }
void tesseract::Tesseract::TidyUp ( PAGE_RES page_res)
  • Counts up the labelled words and the blobs within.
  • Deletes all unused or emptied words, counting the unused ones.
  • Resets W_BOL and W_EOL flags correctly.
  • Builds the rebuild_word and rebuilds the box_word and the best_choice.

Definition at line 712 of file applybox.cpp.

712  {
713  int ok_blob_count = 0;
714  int bad_blob_count = 0;
715  int ok_word_count = 0;
716  int unlabelled_words = 0;
717  PAGE_RES_IT pr_it(page_res);
718  WERD_RES* word_res;
719  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
720  int ok_in_word = 0;
721  int blob_count = word_res->correct_text.size();
722  auto* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
723  word_choice->set_permuter(TOP_CHOICE_PERM);
724  for (int c = 0; c < blob_count; ++c) {
725  if (word_res->correct_text[c].length() > 0) {
726  ++ok_in_word;
727  }
728  // Since we only need a fake word_res->best_choice, the actual
729  // unichar_ids do not matter. Which is fortunate, since TidyUp()
730  // can be called while training Tesseract, at the stage where
731  // unicharset is not meaningful yet.
732  word_choice->append_unichar_id_space_allocated(
733  INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
734  }
735  if (ok_in_word > 0) {
736  ok_blob_count += ok_in_word;
737  bad_blob_count += word_res->correct_text.size() - ok_in_word;
738  word_res->LogNewRawChoice(word_choice);
739  word_res->LogNewCookedChoice(1, false, word_choice);
740  } else {
741  ++unlabelled_words;
742  if (applybox_debug > 0) {
743  tprintf("APPLY_BOXES: Unlabelled word at :");
744  word_res->word->bounding_box().print();
745  }
746  pr_it.DeleteCurrentWord();
747  delete word_choice;
748  }
749  }
750  pr_it.restart_page();
751  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
752  // Denormalize back to a BoxWord.
753  word_res->RebuildBestState();
754  word_res->SetupBoxWord();
755  word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
756  word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
757  }
758  if (applybox_debug > 0) {
759  tprintf(" Found %d good blobs.\n", ok_blob_count);
760  if (bad_blob_count > 0) {
761  tprintf(" Leaving %d unlabelled blobs in %d words.\n",
762  bad_blob_count, ok_word_count);
763  }
764  if (unlabelled_words > 0)
765  tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
766  }
767 }
void print() const
Definition: rect.h:278
int length() const
Definition: genericvector.h:84
void SetupBoxWord()
Definition: pageres.cpp:853
GenericVector< STRING > correct_text
Definition: pageres.h:274
start of line
Definition: werd.h:32
end of line
Definition: werd.h:33
const UNICHARSET * uch_set
Definition: pageres.h:205
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:608
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
TBOX bounding_box() const
Definition: werd.cpp:148
void RebuildBestState()
Definition: pageres.cpp:812
GenericVector< int > best_state
Definition: pageres.h:270
int size() const
Definition: genericvector.h:70
WERD * word
Definition: pageres.h:188
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:624
void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT page_res_it)

Definition at line 417 of file docqual.cpp.

417  {
418  WERD_RES *word;
419  GARBAGE_LEVEL garbage_level;
420  PAGE_RES_IT copy_it;
421  bool prev_potential_marked = false;
422  bool found_terrible_word = false;
423  bool ok_dict_word;
424 
425  page_res_it.restart_page();
426  while (page_res_it.word() != nullptr) {
427  POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
428  if (pb != nullptr && !pb->IsText()) {
429  page_res_it.forward();
430  continue;
431  }
432  word = page_res_it.word();
433 
435  convert_bad_unlv_chs(word);
436 
438  word->merge_tess_fails();
439 
440  if (word->reject_map.accept_count () != 0) {
441  found_terrible_word = false;
442  //Forget earlier potential crunches
443  prev_potential_marked = false;
444  }
445  else {
446  ok_dict_word = safe_dict_word(word);
447  garbage_level = garbage_word(word, ok_dict_word);
448 
449  if ((garbage_level != G_NEVER_CRUNCH) &&
450  (terrible_word_crunch (word, garbage_level))) {
451  if (crunch_debug > 0) {
452  tprintf ("T CRUNCHING: \"%s\"\n",
453  word->best_choice->unichar_string().string());
454  }
456  if (prev_potential_marked) {
457  while (copy_it.word () != word) {
458  if (crunch_debug > 0) {
459  tprintf ("P1 CRUNCHING: \"%s\"\n",
460  copy_it.word()->best_choice->unichar_string().string());
461  }
462  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
463  copy_it.forward ();
464  }
465  prev_potential_marked = false;
466  }
467  found_terrible_word = true;
468  }
469  else if ((garbage_level != G_NEVER_CRUNCH) &&
470  (potential_word_crunch (word,
471  garbage_level, ok_dict_word))) {
472  if (found_terrible_word) {
473  if (crunch_debug > 0) {
474  tprintf ("P2 CRUNCHING: \"%s\"\n",
475  word->best_choice->unichar_string().string());
476  }
478  }
479  else if (!prev_potential_marked) {
480  copy_it = page_res_it;
481  prev_potential_marked = true;
482  if (crunch_debug > 1) {
483  tprintf ("P3 CRUNCHING: \"%s\"\n",
484  word->best_choice->unichar_string().string());
485  }
486  }
487  }
488  else {
489  found_terrible_word = false;
490  //Forget earlier potential crunches
491  prev_potential_marked = false;
492  if (crunch_debug > 2) {
493  tprintf ("NO CRUNCH: \"%s\"\n",
494  word->best_choice->unichar_string().string());
495  }
496  }
497  }
498  page_res_it.forward ();
499  }
500 }
BLOCK * block
Definition: pageres.h:116
WERD_RES * restart_page()
Definition: pageres.h:702
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:679
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
const STRING & unichar_string() const
Definition: ratngs.h:541
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:503
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608
BLOCK_RES * block() const
Definition: pageres.h:761
REJMAP reject_map
Definition: pageres.h:286
bool crunch_early_convert_bad_unlv_chs
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:541
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:659
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:309
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
WERD_RES * word() const
Definition: pageres.h:755
WERD_RES * forward()
Definition: pageres.h:735
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:191
int16_t accept_count()
Definition: rejctmap.cpp:279
GARBAGE_LEVEL
Definition: docqual.h:29
void merge_tess_fails()
Definition: pageres.cpp:1071
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool IsText() const
Definition: polyblk.h:49
void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT page_res_it)

Definition at line 589 of file docqual.cpp.

589  {
590  WERD_RES *word;
591  PAGE_RES_IT copy_it;
592  bool deleting_from_bol = false;
593  bool marked_delete_point = false;
594  int16_t debug_delete_mode;
595  CRUNCH_MODE delete_mode;
596  int16_t x_debug_delete_mode;
597  CRUNCH_MODE x_delete_mode;
598 
599  page_res_it.restart_page();
600  while (page_res_it.word() != nullptr) {
601  word = page_res_it.word();
602 
603  delete_mode = word_deletable (word, debug_delete_mode);
604  if (delete_mode != CR_NONE) {
605  if (word->word->flag (W_BOL) || deleting_from_bol) {
606  if (crunch_debug > 0) {
607  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
608  debug_delete_mode,
609  word->best_choice->unichar_string().string());
610  }
611  word->unlv_crunch_mode = delete_mode;
612  deleting_from_bol = true;
613  } else if (word->word->flag(W_EOL)) {
614  if (marked_delete_point) {
615  while (copy_it.word() != word) {
616  x_delete_mode = word_deletable (copy_it.word (),
617  x_debug_delete_mode);
618  if (crunch_debug > 0) {
619  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
620  x_debug_delete_mode,
621  copy_it.word()->best_choice->unichar_string().string());
622  }
623  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
624  copy_it.forward ();
625  }
626  }
627  if (crunch_debug > 0) {
628  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
629  debug_delete_mode,
630  word->best_choice->unichar_string().string());
631  }
632  word->unlv_crunch_mode = delete_mode;
633  deleting_from_bol = false;
634  marked_delete_point = false;
635  }
636  else {
637  if (!marked_delete_point) {
638  copy_it = page_res_it;
639  marked_delete_point = true;
640  }
641  }
642  }
643  else {
644  deleting_from_bol = false;
645  //Forget earlier potential crunches
646  marked_delete_point = false;
647  }
648  /*
649  The following step has been left till now as the tess fails are used to
650  determine if the word is deletable.
651  */
653  word->merge_tess_fails();
654  page_res_it.forward ();
655  }
656 }
WERD_RES * restart_page()
Definition: pageres.h:702
const STRING & unichar_string() const
Definition: ratngs.h:541
start of line
Definition: werd.h:32
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:897
end of line
Definition: werd.h:33
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:309
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
WERD_RES * word() const
Definition: pageres.h:755
WERD_RES * forward()
Definition: pageres.h:735
void merge_tess_fails()
Definition: pageres.cpp:1071
WERD_CHOICE * best_choice
Definition: pageres.h:234
CRUNCH_MODE
Definition: pageres.h:158
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
WERD * word
Definition: pageres.h:188
bool tesseract::Tesseract::TrainedXheightFix ( WERD_RES * word,
BLOCK * block,
ROW * row 
)

Definition at line 1499 of file control.cpp.

1499  {
1500  int original_misfits = CountMisfitTops(word);
1501  if (original_misfits == 0)
1502  return false;
1503  float baseline_shift = 0.0f;
1504  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1505  if (baseline_shift != 0.0f) {
1506  // Try the shift on its own first.
1507  if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1508  word, block, row))
1509  return false;
1510  original_misfits = CountMisfitTops(word);
1511  if (original_misfits > 0) {
1512  float new_baseline_shift;
1513  // Now recompute the new x_height.
1514  new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1515  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1516  // No test of return value here, as we are definitely making a change
1517  // to the word by shifting the baseline.
1518  TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1519  word, block, row);
1520  }
1521  }
1522  return true;
1523  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1524  return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1525  word, block, row);
1526  } else {
1527  return false;
1528  }
1529 }
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:102
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1533
const double kMinRefitXHeightFraction
Definition: control.cpp:51
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:70
float x_height
Definition: pageres.h:310
void tesseract::Tesseract::TrainFromBoxes ( const GenericVector< TBOX > &  boxes,
const GenericVector< STRING > &  texts,
BLOCK_LIST *  block_list,
DocumentData * training_data 
)

Definition at line 74 of file linerec.cpp.

77  {
78  int box_count = boxes.size();
79  // Process all the text lines in this page, as defined by the boxes.
80  int end_box = 0;
81  // Don't let \t, which marks newlines in the box file, get into the line
82  // content, as that makes the line unusable in training.
83  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
84  for (int start_box = end_box; start_box < box_count; start_box = end_box) {
85  // Find the textline of boxes starting at start and their bounding box.
86  TBOX line_box = boxes[start_box];
87  STRING line_str = texts[start_box];
88  for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
89  ++end_box) {
90  line_box += boxes[end_box];
91  line_str += texts[end_box];
92  }
93  // Find the most overlapping block.
94  BLOCK* best_block = nullptr;
95  int best_overlap = 0;
96  BLOCK_IT b_it(block_list);
97  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
98  BLOCK* block = b_it.data();
99  if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText())
100  continue; // Not a text block.
101  TBOX block_box = block->pdblk.bounding_box();
102  block_box.rotate(block->re_rotation());
103  if (block_box.major_overlap(line_box)) {
104  TBOX overlap_box = line_box.intersection(block_box);
105  if (overlap_box.area() > best_overlap) {
106  best_overlap = overlap_box.area();
107  best_block = block;
108  }
109  }
110  }
111  ImageData* imagedata = nullptr;
112  if (best_block == nullptr) {
113  tprintf("No block overlapping textline: %s\n", line_str.string());
114  } else {
115  imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
116  *best_block);
117  }
118  if (imagedata != nullptr)
119  training_data->AddPageToDocument(imagedata);
120  // Don't let \t, which marks newlines in the box file, get into the line
121  // content, as that makes the line unusable in training.
122  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
123  }
124 }
int32_t area() const
Definition: rect.h:122
Definition: rect.h:34
bool major_overlap(const TBOX &box) const
Definition: rect.h:368
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:129
FCOORD re_rotation() const
Definition: ocrblock.h:135
Definition: strngs.h:45
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:60
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:191
void rotate(const FCOORD &vec)
Definition: rect.h:197
bool IsText() const
Definition: polyblk.h:49
int size() const
Definition: genericvector.h:70
Definition: ocrblock.h:29
void tesseract::Tesseract::TrainLineRecognizer ( const STRING & input_imagename,
const STRING & output_basename,
BLOCK_LIST *  block_list 
)

Definition at line 43 of file linerec.cpp.

45  {
46  STRING lstmf_name = output_basename + ".lstmf";
47  DocumentData images(lstmf_name);
48  if (applybox_page > 0) {
49  // Load existing document for the previous pages.
50  if (!images.LoadDocument(lstmf_name.string(), 0, 0, nullptr)) {
51  tprintf("Failed to read training data from %s!\n", lstmf_name.string());
52  return;
53  }
54  }
55  GenericVector<TBOX> boxes;
57  // Get the boxes for this page, if there are any.
58  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr,
59  nullptr) ||
60  boxes.empty()) {
61  tprintf("Failed to read boxes from %s\n", input_imagename.string());
62  return;
63  }
64  TrainFromBoxes(boxes, texts, block_list, &images);
65  images.Shuffle();
66  if (!images.SaveDocument(lstmf_name.string(), nullptr)) {
67  tprintf("Failed to write training data to %s!\n", lstmf_name.string());
68  }
69 }
Definition: strngs.h:45
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool empty() const
Definition: genericvector.h:89
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:74
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:53
WERD_RES * tesseract::Tesseract::TrySuperscriptSplits ( int  num_chopped_leading,
float  leading_certainty,
ScriptPos  leading_pos,
int  num_chopped_trailing,
float  trailing_certainty,
ScriptPos  trailing_pos,
WERD_RES * word,
bool *  is_good,
int *  retry_rebuild_leading,
int *  retry_rebuild_trailing 
)

Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.

Parameters
[in] num_chopped_leading: how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript)
[in] leading_certainty: the (minimum) certainty of the characters in the original leading section.
[in] leading_pos: "super" or "sub" (for debugging)
[in] num_chopped_trailing: how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript)
[in] trailing_certainty: the (minimum) certainty of the characters in the original trailing section.
[in] trailing_pos: "super" or "sub" (for debugging)
[in] word: the word to try to chop up.
[out] is_good: do we believe our result?
[out] retry_rebuild_leading, retry_rebuild_trailing: If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars.
Returns
A word which is the result of re-recognizing as asked.

Definition at line 383 of file superscript.cpp.

389  {
390  int num_chopped = word->chopped_word->NumBlobs();
391 
392  *retry_rebuild_leading = *retry_rebuild_trailing = 0;
393 
394  // Chop apart the word into up to three pieces.
395 
396  BlamerBundle *bb0 = nullptr;
397  BlamerBundle *bb1 = nullptr;
398  WERD_RES *prefix = nullptr;
399  WERD_RES *core = nullptr;
400  WERD_RES *suffix = nullptr;
401  if (num_chopped_leading > 0) {
402  prefix = new WERD_RES(*word);
403  split_word(prefix, num_chopped_leading, &core, &bb0);
404  } else {
405  core = new WERD_RES(*word);
406  }
407 
408  if (num_chopped_trailing > 0) {
409  int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
410  split_word(core, split_pt, &suffix, &bb1);
411  }
412 
413  // Recognize the pieces in turn.
414  int saved_cp_multiplier = classify_class_pruner_multiplier;
415  int saved_im_multiplier = classify_integer_matcher_multiplier;
416  if (prefix) {
417  // Turn off Tesseract's y-position penalties for the leading superscript.
420 
421  // Adjust our expectations about the baseline for this prefix.
422  if (superscript_debug >= 3) {
423  tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
424  }
425  recog_word_recursive(prefix);
426  if (superscript_debug >= 2) {
427  tprintf(" The leading bits look like %s %s\n",
428  ScriptPosToString(leading_pos),
429  prefix->best_choice->unichar_string().string());
430  }
431 
432  // Restore the normal y-position penalties.
433  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
434  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
435  }
436 
437  if (superscript_debug >= 3) {
438  tprintf(" recognizing middle %d chopped blobs\n",
439  num_chopped - num_chopped_leading - num_chopped_trailing);
440  }
441 
442  if (suffix) {
443  // Turn off Tesseract's y-position penalties for the trailing superscript.
446 
447  if (superscript_debug >= 3) {
448  tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
449  }
450  recog_word_recursive(suffix);
451  if (superscript_debug >= 2) {
452  tprintf(" The trailing bits look like %s %s\n",
453  ScriptPosToString(trailing_pos),
454  suffix->best_choice->unichar_string().string());
455  }
456 
457  // Restore the normal y-position penalties.
458  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
459  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
460  }
461 
462  // Evaluate whether we think the results are believably better
463  // than what we already had.
464  bool good_prefix = !prefix || BelievableSuperscript(
465  superscript_debug >= 1, *prefix,
466  superscript_bettered_certainty * leading_certainty,
467  retry_rebuild_leading, nullptr);
468  bool good_suffix = !suffix || BelievableSuperscript(
469  superscript_debug >= 1, *suffix,
470  superscript_bettered_certainty * trailing_certainty,
471  nullptr, retry_rebuild_trailing);
472 
473  *is_good = good_prefix && good_suffix;
474  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
475  // None of it is any good. Quit now.
476  delete core;
477  delete prefix;
478  delete suffix;
479  delete bb1;
480  return nullptr;
481  }
482  recog_word_recursive(core);
483 
484  // Now paste the results together into core.
485  if (suffix) {
486  suffix->SetAllScriptPositions(trailing_pos);
487  join_words(core, suffix, bb1);
488  }
489  if (prefix) {
490  prefix->SetAllScriptPositions(leading_pos);
491  join_words(prefix, core, bb0);
492  core = prefix;
493  prefix = nullptr;
494  }
495 
496  if (superscript_debug >= 1) {
497  tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
498  core->best_choice->unichar_string().string());
499  }
500  return core;
501 }
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
const STRING & unichar_string() const
Definition: ratngs.h:541
int classify_integer_matcher_multiplier
Definition: classify.h:509
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:176
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:869
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
double superscript_bettered_certainty
int NumBlobs() const
Definition: blobs.h:427
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:200
WERD_CHOICE * best_choice
Definition: pageres.h:234
TWERD * chopped_word
Definition: pageres.h:214
int classify_class_pruner_multiplier
Definition: classify.h:505
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:234
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
void tesseract::Tesseract::unrej_good_chs ( WERD_RES word,
ROW row 
)

Definition at line 115 of file docqual.cpp.

115  {
116  if (word->bln_boxes == nullptr ||
117  word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
118  return;
119 
120  DocQualCallbacks cb(word);
122  *word->rebuild_word,
124 }
TWERD * rebuild_word
Definition: pageres.h:259
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:44
tesseract::BoxWord * bln_boxes
Definition: pageres.h:197
bool empty() const
Definition: genericvector.h:89
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT page_res_it)

Definition at line 160 of file docqual.cpp.

161  {
162  WERD_RES *word;
163  ROW_RES *current_row;
164  BLOCK_RES *current_block;
165  int i;
166 
167  page_res_it.restart_page ();
168  while (page_res_it.word () != nullptr) {
169  check_debug_pt (page_res_it.word (), 100);
170  if (bland_unrej) {
171  word = page_res_it.word ();
172  for (i = 0; i < word->reject_map.length (); i++) {
173  if (word->reject_map[i].accept_if_good_quality ())
174  word->reject_map[i].setrej_quality_accept ();
175  }
176  page_res_it.forward ();
177  }
178  else if ((page_res_it.row ()->char_count > 0) &&
179  ((page_res_it.row ()->rej_count /
180  static_cast<float>(page_res_it.row ()->char_count)) <=
182  word = page_res_it.word ();
186  word->best_choice->unichar_string().string(),
188  != AC_UNACCEPTABLE)) {
189  unrej_good_chs(word, page_res_it.row ()->row);
190  }
191  page_res_it.forward ();
192  }
193  else {
194  /* Skip to end of dodgy row */
195  current_row = page_res_it.row ();
196  while ((page_res_it.word () != nullptr) &&
197  (page_res_it.row () == current_row))
198  page_res_it.forward ();
199  }
200  check_debug_pt (page_res_it.word (), 110);
201  }
202  page_res_it.restart_page ();
203  page_res_it.page_res->char_count = 0;
204  page_res_it.page_res->rej_count = 0;
205  current_block = nullptr;
206  current_row = nullptr;
207  while (page_res_it.word () != nullptr) {
208  if (current_block != page_res_it.block ()) {
209  current_block = page_res_it.block ();
210  current_block->char_count = 0;
211  current_block->rej_count = 0;
212  }
213  if (current_row != page_res_it.row ()) {
214  current_row = page_res_it.row ();
215  current_row->char_count = 0;
216  current_row->rej_count = 0;
217  current_row->whole_word_rej_count = 0;
218  }
219  page_res_it.rej_stat_word ();
220  page_res_it.forward ();
221  }
222 }
WERD_RES * restart_page()
Definition: pageres.h:702
Unacceptable word.
Definition: control.h:30
int32_t length() const
Definition: rejctmap.h:223
int32_t char_count
Definition: pageres.h:143
const STRING & unichar_string() const
Definition: ratngs.h:541
ROW_RES * row() const
Definition: pageres.h:758
int32_t whole_word_rej_count
Definition: pageres.h:145
BLOCK_RES * block() const
Definition: pageres.h:761
REJMAP reject_map
Definition: pageres.h:286
const UNICHARSET * uch_set
Definition: pageres.h:205
int32_t rej_count
Definition: pageres.h:118
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:115
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1863
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1759
const char * string() const
Definition: strngs.cpp:194
int32_t char_count
Definition: pageres.h:78
WERD_RES * word() const
Definition: pageres.h:755
PAGE_RES * page_res
Definition: pageres.h:678
WERD_RES * forward()
Definition: pageres.h:735
bool quality_recoverable_rejects()
Definition: rejctmap.cpp:300
int32_t rej_count
Definition: pageres.h:79
WERD_CHOICE * best_choice
Definition: pageres.h:234
int32_t char_count
Definition: pageres.h:117
int32_t rej_count
Definition: pageres.h:144
const STRING & unichar_lengths() const
Definition: ratngs.h:548
ROW * row
Definition: pageres.h:142
void rej_stat_word()
Definition: pageres.cpp:1714
bool tesseract::Tesseract::word_adaptable ( WERD_RES * word,
uint16_t  mode 
)

Definition at line 34 of file adaptions.cpp.

36  {
38  tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
40  word->best_choice->rating(), word->best_choice->certainty());
41  }
42 
43  bool status = false;
44  BITS16 flags(mode);
45 
46  enum MODES
47  {
48  ADAPTABLE_WERD,
49  ACCEPTABLE_WERD,
50  CHECK_DAWGS,
51  CHECK_SPACES,
52  CHECK_ONE_ELL_CONFLICT,
53  CHECK_AMBIG_WERD
54  };
55 
56  /*
57  0: NO adaption
58  */
59  if (mode == 0) {
60  if (tessedit_adaption_debug) tprintf("adaption disabled\n");
61  return false;
62  }
63 
64  if (flags.bit (ADAPTABLE_WERD)) {
65  status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
66  if (tessedit_adaption_debug && !status) {
67  tprintf("tess_would_adapt bit is false\n");
68  }
69  }
70 
71  if (flags.bit (ACCEPTABLE_WERD)) {
72  status |= word->tess_accepted;
73  if (tessedit_adaption_debug && !status) {
74  tprintf("tess_accepted bit is false\n");
75  }
76  }
77 
78  if (!status) { // If not set then
79  return false; // ignore other checks
80  }
81 
82  if (flags.bit (CHECK_DAWGS) &&
83  (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
84  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
85  (word->best_choice->permuter () != USER_DAWG_PERM) &&
86  (word->best_choice->permuter () != NUMBER_PERM)) {
87  if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
88  return false;
89  }
90 
91  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, false)) {
92  if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
93  return false;
94  }
95 
96  if (flags.bit (CHECK_SPACES) &&
97  (strchr(word->best_choice->unichar_string().string(), ' ') != nullptr)) {
98  if (tessedit_adaption_debug) tprintf("word contains spaces\n");
99  return false;
100  }
101 
102  if (flags.bit (CHECK_AMBIG_WERD) &&
104  if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
105  return false;
106  }
107 
109  tprintf("returning status %d\n", status);
110  }
111  return status;
112 }
const STRING & unichar_string() const
Definition: ratngs.h:541
bool tess_accepted
Definition: pageres.h:295
bool tess_would_adapt
Definition: pageres.h:296
float rating() const
Definition: ratngs.h:327
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool dangerous_ambig_found() const
Definition: ratngs.h:363
Definition: bits16.h:25
float certainty() const
Definition: ratngs.h:330
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:293
uint8_t permuter() const
Definition: ratngs.h:346
bool tesseract::Tesseract::word_blank_and_set_display ( PAGE_RES_IT * pr_its)

Definition at line 698 of file pgedit.cpp.

698  {
699  pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
701  return word_set_display(pr_it);
702 }
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:926
bool tesseract::Tesseract::word_bln_display ( PAGE_RES_IT * pr_it)

word_bln_display()

Normalize word and display in word window

Definition at line 710 of file pgedit.cpp.

710  {
711  WERD_RES* word_res = pr_it->word();
712  if (word_res->chopped_word == nullptr) {
713  // Setup word normalization parameters.
714  word_res->SetupForRecognition(unicharset, this, BestPix(),
715  tessedit_ocr_engine_mode, nullptr,
719  pr_it->row()->row, pr_it->block()->block);
720  }
721  bln_word_window_handle()->Clear();
722  display_bln_lines(bln_word_window_handle(), ScrollView::CYAN,
723  1.0, 0.0f, -1000.0f, 1000.0f);
724  C_BLOB_IT it(word_res->word->cblob_list());
726  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
727  it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN,
728  bln_word_window_handle());
729  color = WERD::NextColor(color);
730  }
731  bln_word_window_handle()->Update();
732  return true;
733 }
BLOCK * block
Definition: pageres.h:116
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:306
ROW_RES * row() const
Definition: pageres.h:758
UNICHARSET unicharset
Definition: ccutil.h:71
BLOCK_RES * block() const
Definition: pageres.h:761
static ScrollView::Color NextColor(ScrollView::Color colour)
Definition: werd.cpp:292
static void Update()
Definition: scrollview.cpp:709
Pix * BestPix() const
void Clear()
Definition: scrollview.cpp:589
WERD_RES * word() const
Definition: pageres.h:755
TWERD * chopped_word
Definition: pageres.h:214
ROW * row
Definition: pageres.h:142
WERD * word
Definition: pageres.h:188
DENORM denorm
Definition: pageres.h:203
bool classify_bln_numeric_mode
Definition: classify.h:540
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
int16_t tesseract::Tesseract::word_blob_quality ( WERD_RES * word,
ROW * row
)

Definition at line 60 of file docqual.cpp.

60  {
61  if (word->bln_boxes == nullptr ||
62  word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
63  return 0;
64 
65  DocQualCallbacks cb(word);
67  *word->rebuild_word,
69  return cb.match_count;
70 }
TWERD * rebuild_word
Definition: pageres.h:259
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
tesseract::BoxWord * bln_boxes
Definition: pageres.h:197
bool empty() const
Definition: genericvector.h:89
void CountMatchingBlobs(int index)
Definition: docqual.cpp:34
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
void tesseract::Tesseract::word_char_quality ( WERD_RES * word,
ROW * row,
int16_t *  match_count,
int16_t *  accepted_match_count 
)

Definition at line 92 of file docqual.cpp.

95  {
96  if (word->bln_boxes == nullptr || word->rebuild_word == nullptr ||
97  word->rebuild_word->blobs.empty()) {
98  *match_count = 0;
99  *accepted_match_count = 0;
100  return;
101  }
102 
103  DocQualCallbacks cb(word);
105  *word->rebuild_word,
107  *match_count = cb.match_count;
108  *accepted_match_count = cb.accepted_match_count;
109 }
TWERD * rebuild_word
Definition: pageres.h:259
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
tesseract::BoxWord * bln_boxes
Definition: pageres.h:197
bool empty() const
Definition: genericvector.h:89
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:38
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
bool tesseract::Tesseract::word_contains_non_1_digit ( const char *  word,
const char *  word_lengths 
)

Definition at line 510 of file reject.cpp.

511  {
512  int16_t i;
513  int16_t offset;
514 
515  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
516  if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
517  (word_lengths[i] != 1 || word[offset] != '1'))
518  return true;
519  }
520  return false;
521 }
UNICHARSET unicharset
Definition: ccutil.h:71
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
CRUNCH_MODE tesseract::Tesseract::word_deletable ( WERD_RES * word,
int16_t &  delete_mode 
)

Definition at line 897 of file docqual.cpp.

897  {
898  int word_len = word->reject_map.length ();
899  float rating_per_ch;
900  TBOX box; //BB of word
901 
902  if (word->unlv_crunch_mode == CR_NONE) {
903  delete_mode = 0;
904  return CR_NONE;
905  }
906 
907  if (word_len == 0) {
908  delete_mode = 1;
909  return CR_DELETE;
910  }
911 
912  if (word->rebuild_word != nullptr) {
913  // Cube leaves rebuild_word nullptr.
914  box = word->rebuild_word->bounding_box();
915  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
916  delete_mode = 4;
917  return CR_DELETE;
918  }
919 
920  if (noise_outlines(word->rebuild_word)) {
921  delete_mode = 5;
922  return CR_DELETE;
923  }
924  }
925 
926  if ((failure_count (word) * 1.5) > word_len) {
927  delete_mode = 2;
928  return CR_LOOSE_SPACE;
929  }
930 
931  if (word->best_choice->certainty () < crunch_del_cert) {
932  delete_mode = 7;
933  return CR_LOOSE_SPACE;
934  }
935 
936  rating_per_ch = word->best_choice->rating () / word_len;
937 
938  if (rating_per_ch > crunch_del_rating) {
939  delete_mode = 8;
940  return CR_LOOSE_SPACE;
941  }
942 
944  delete_mode = 9;
945  return CR_LOOSE_SPACE;
946  }
947 
948  if (box.bottom () >
950  delete_mode = 10;
951  return CR_LOOSE_SPACE;
952  }
953 
954  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
955  delete_mode = 11;
956  return CR_LOOSE_SPACE;
957  }
958 
959  if (box.width () < crunch_del_min_width * kBlnXHeight) {
960  delete_mode = 3;
961  return CR_LOOSE_SPACE;
962  }
963 
964  delete_mode = 0;
965  return CR_NONE;
966 }
const int kBlnXHeight
Definition: normalis.h:24
int16_t top() const
Definition: rect.h:58
Definition: rect.h:34
TWERD * rebuild_word
Definition: pageres.h:259
int32_t length() const
Definition: rejctmap.h:223
REJMAP reject_map
Definition: pageres.h:286
int16_t height() const
Definition: rect.h:108
const int kBlnBaselineOffset
Definition: normalis.h:25
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:309
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:968
float rating() const
Definition: ratngs.h:327
TBOX bounding_box() const
Definition: blobs.cpp:865
int16_t width() const
Definition: rect.h:115
int16_t bottom() const
Definition: rect.h:65
float certainty() const
Definition: ratngs.h:330
WERD_CHOICE * best_choice
Definition: pageres.h:234
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:980
bool tesseract::Tesseract::word_display ( PAGE_RES_IT * pr_it)

word_display() Word Processor

Display a word according to its display modes

Definition at line 742 of file pgedit.cpp.

742  {
743  WERD_RES* word_res = pr_it->word();
744  WERD* word = word_res->word;
745  TBOX word_bb; // word bounding box
746  int word_height; // ht of word BB
747  bool displayed_something = false;
748  float shift; // from bot left
749 
750  if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {
751  BoxWord* box_word = word_res->box_word;
752  WERD_CHOICE* best_choice = word_res->best_choice;
753  int length = box_word->length();
754  if (word_res->fontinfo == nullptr) return false;
755  const FontInfo& font_info = *word_res->fontinfo;
756  for (int i = 0; i < length; ++i) {
758  switch (color_mode) {
759  case CM_SUBSCRIPT:
760  if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
761  color = ScrollView::RED;
762  break;
763  case CM_SUPERSCRIPT:
764  if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
765  color = ScrollView::RED;
766  break;
767  case CM_ITALIC:
768  if (font_info.is_italic())
769  color = ScrollView::RED;
770  break;
771  case CM_BOLD:
772  if (font_info.is_bold())
773  color = ScrollView::RED;
774  break;
775  case CM_FIXEDPITCH:
776  if (font_info.is_fixed_pitch())
777  color = ScrollView::RED;
778  break;
779  case CM_SERIF:
780  if (font_info.is_serif())
781  color = ScrollView::RED;
782  break;
783  case CM_SMALLCAPS:
784  if (word_res->small_caps)
785  color = ScrollView::RED;
786  break;
787  case CM_DROPCAPS:
788  if (best_choice->BlobPosition(i) == SP_DROPCAP)
789  color = ScrollView::RED;
790  break;
791  // TODO(rays) underline is currently completely unsupported.
792  case CM_UNDERLINE:
793  default:
794  break;
795  }
796  image_win->Pen(color);
797  TBOX box = box_word->BlobBox(i);
798  image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
799  }
800  return true;
801  }
802  /*
803  Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
804  etc. are to keep the compiler happy.
805  */
806  // display bounding box
807  if (word->display_flag(DF_BOX)) {
808  word->bounding_box().plot(image_win,
809  static_cast<ScrollView::Color>((int32_t)
811  static_cast<ScrollView::Color>((int32_t)
812  editor_image_word_bb_color));
813 
814  auto c = static_cast<ScrollView::Color>((int32_t) editor_image_blob_bb_color);
815  image_win->Pen(c);
816  // cblob iterator
817  C_BLOB_IT c_it(word->cblob_list());
818  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
819  c_it.data()->bounding_box().plot(image_win);
820  displayed_something = true;
821  }
822 
823  // display edge steps
824  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
825  word->plot(image_win); // rainbow colors
826  displayed_something = true;
827  }
828 
829  // display poly approx
830  if (word->display_flag(DF_POLYGONAL)) {
831  // need to convert
833  tword->plot(image_win);
834  delete tword;
835  displayed_something = true;
836  }
837 
838  // Display correct text and blamer information.
839  STRING text;
840  STRING blame;
841  if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
842  text = word->text();
843  }
844  if (word->display_flag(DF_BLAMER) &&
845  !(word_res->blamer_bundle != nullptr &&
847  text = "";
848  const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
849  if (blamer_bundle == nullptr) {
850  text += "NULL";
851  } else {
852  text = blamer_bundle->TruthString();
853  }
854  text += " -> ";
855  STRING best_choice_str;
856  if (word_res->best_choice == nullptr) {
857  best_choice_str = "NULL";
858  } else {
859  word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
860  }
861  text += best_choice_str;
862  IncorrectResultReason reason = (blamer_bundle == nullptr) ?
863  IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
864  ASSERT_HOST(reason < IRR_NUM_REASONS);
865  blame += " [";
866  blame += BlamerBundle::IncorrectReasonName(reason);
867  blame += "]";
868  }
869  if (text.length() > 0) {
870  word_bb = word->bounding_box();
871  image_win->Pen(ScrollView::RED);
872  word_height = word_bb.height();
873  int text_height = 0.50 * word_height;
874  if (text_height > 20) text_height = 20;
875  image_win->TextAttributes("Arial", text_height, false, false, false);
876  shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
877  image_win->Text(word_bb.left() + shift,
878  word_bb.bottom() + 0.25 * word_height, text.string());
879  if (blame.length() > 0) {
880  image_win->Text(word_bb.left() + shift,
881  word_bb.bottom() + 0.25 * word_height - text_height,
882  blame.string());
883  }
884 
885  displayed_something = true;
886  }
887 
888  if (!displayed_something) // display BBox anyway
889  word->bounding_box().plot(image_win,
890  static_cast<ScrollView::Color>((int32_t) editor_image_word_bb_color),
891  static_cast<ScrollView::Color>((int32_t)
892  editor_image_word_bb_color));
893  return true;
894 }
Definition: werd.h:56
STRING TruthString() const
Definition: blamer.h:112
int16_t top() const
Definition: rect.h:58
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:652
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:780
Definition: rect.h:34
Definition: blobs.h:397
Definition: strngs.h:45
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:600
IncorrectResultReason
Definition: blamer.h:49
bool is_italic() const
Definition: fontinfo.h:111
int editor_image_word_bb_color
Definition: pgedit.cpp:125
int32_t length() const
Definition: strngs.cpp:189
Correct ascii.
Definition: werd.h:47
int length() const
Definition: ratngs.h:303
void plot(ScrollView *window)
Definition: blobs.cpp:901
Blamer information.
Definition: werd.h:51
int16_t height() const
Definition: rect.h:108
Edge steps.
Definition: werd.h:49
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:635
void plot(ScrollView *fd) const
Definition: rect.h:286
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
Polyg approx.
Definition: werd.h:48
bool small_caps
Definition: pageres.h:298
void Pen(Color color)
Definition: scrollview.cpp:719
const FontInfo * fontinfo
Definition: pageres.h:303
const char * string() const
Definition: strngs.cpp:194
tesseract::BoxWord * box_word
Definition: pageres.h:265
WERD_RES * word() const
Definition: pageres.h:755
int16_t width() const
Definition: rect.h:115
bool display_flag(uint8_t flag) const
Definition: werd.h:120
bool is_fixed_pitch() const
Definition: fontinfo.h:113
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
TBOX bounding_box() const
Definition: werd.cpp:148
void plot(ScrollView *window, ScrollView::Color colour)
Definition: werd.cpp:283
WERD_CHOICE * best_choice
Definition: pageres.h:234
#define ASSERT_HOST(x)
Definition: errcode.h:88
int16_t left() const
Definition: rect.h:72
int editor_image_blob_bb_color
Definition: pgedit.cpp:127
Bounding box.
Definition: werd.h:46
bool is_serif() const
Definition: fontinfo.h:114
const char * text() const
Definition: werd.h:114
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:322
bool is_bold() const
Definition: fontinfo.h:112
WERD * word
Definition: pageres.h:188
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:61
BlamerBundle * blamer_bundle
Definition: pageres.h:245
C_BLOB_LIST * cblob_list()
Definition: werd.h:95
bool tesseract::Tesseract::word_dumper ( PAGE_RES_IT * pr_it)

word_dumper()

Dump members to the debug window

Definition at line 902 of file pgedit.cpp.

902  {
903  if (pr_it->block()->block != nullptr) {
904  tprintf("\nBlock data...\n");
905  pr_it->block()->block->print(nullptr, false);
906  }
907  tprintf("\nRow data...\n");
908  pr_it->row()->row->print(nullptr);
909  tprintf("\nWord data...\n");
910  WERD_RES* word_res = pr_it->word();
911  word_res->word->print();
912  if (word_res->blamer_bundle != nullptr && wordrec_debug_blamer &&
914  tprintf("Current blamer debug: %s\n",
915  word_res->blamer_bundle->debug().string());
916  }
917  return true;
918 }
void print(FILE *fp, bool dump)
dump whole table
Definition: ocrblock.cpp:191
BLOCK * block
Definition: pageres.h:116
ROW_RES * row() const
Definition: pageres.h:758
BLOCK_RES * block() const
Definition: pageres.h:761
void print()
Definition: werd.cpp:253
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
const STRING & debug() const
Definition: blamer.h:128
WERD_RES * word() const
Definition: pageres.h:755
bool wordrec_debug_blamer
Definition: wordrec.h:236
ROW * row
Definition: pageres.h:142
WERD * word
Definition: pageres.h:188
void print(FILE *fp)
Definition: ocrrow.cpp:166
BlamerBundle * blamer_bundle
Definition: pageres.h:245
int16_t tesseract::Tesseract::word_outline_errs ( WERD_RES * word)

Definition at line 72 of file docqual.cpp.

72  {
73  int16_t i = 0;
74  int16_t err_count = 0;
75 
76  if (word->rebuild_word != nullptr) {
77  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
78  TBLOB* blob = word->rebuild_word->blobs[b];
79  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
80  blob->NumOutlines());
81  i++;
82  }
83  }
84  return err_count;
85 }
TWERD * rebuild_word
Definition: pageres.h:259
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
const STRING & unichar_string() const
Definition: ratngs.h:541
Definition: blobs.h:263
int NumBlobs() const
Definition: blobs.h:427
WERD_CHOICE * best_choice
Definition: pageres.h:234
int NumOutlines() const
Definition: blobs.cpp:458
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:126
bool tesseract::Tesseract::word_set_display ( PAGE_RES_IT * pr_it)

word_set_display() Word processor

Display word according to current display mode settings

Definition at line 926 of file pgedit.cpp.

926  {
927  WERD* word = pr_it->word()->word;
928  word->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX));
929  word->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT));
930  word->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL));
931  word->set_display_flag(DF_EDGE_STEP, word_display_mode.bit(DF_EDGE_STEP));
933  word_display_mode.bit(DF_BN_POLYGONAL));
934  word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER));
935  return word_display(pr_it);
936 }
Definition: werd.h:56
bool word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:742
Correct ascii.
Definition: werd.h:47
void set_display_flag(uint8_t flag, bool value)
Definition: werd.h:121
bool bit(uint8_t bit_num) const
Definition: bits16.h:51
Blamer information.
Definition: werd.h:51
Edge steps.
Definition: werd.h:49
Polyg approx.
Definition: werd.h:48
WERD_RES * word() const
Definition: pageres.h:755
Bounding box.
Definition: werd.h:46
WERD * word
Definition: pageres.h:188
BL normalised polyapx.
Definition: werd.h:50
int16_t tesseract::Tesseract::worst_noise_blob ( WERD_RES * word_res,
float *  worst_noise_score 
)

Definition at line 707 of file fixspace.cpp.

708  {
709  float noise_score[512];
710  int i;
711  int min_noise_blob; // 1st contender
712  int max_noise_blob; // last contender
713  int non_noise_count;
714  int worst_noise_blob; // Worst blob
715  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
716  float non_noise_limit = kBlnXHeight * 0.8;
717 
718  if (word_res->rebuild_word == nullptr)
719  return -1; // Can't handle cube words.
720 
721  // Normalised.
722  int blob_count = word_res->box_word->length();
723  ASSERT_HOST(blob_count <= 512);
724  if (blob_count < 5)
725  return -1; // too short to split
726 
727  /* Get the noise scores for all blobs */
728 
729  #ifndef SECURE_NAMES
730  if (debug_fix_space_level > 5)
731  tprintf("FP fixspace Noise metrics for \"%s\": ",
732  word_res->best_choice->unichar_string().string());
733  #endif
734 
735  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
736  TBLOB* blob = word_res->rebuild_word->blobs[i];
737  if (word_res->reject_map[i].accepted())
738  noise_score[i] = non_noise_limit;
739  else
740  noise_score[i] = blob_noise_score(blob);
741 
742  if (debug_fix_space_level > 5)
743  tprintf("%1.1f ", noise_score[i]);
744  }
745  if (debug_fix_space_level > 5)
746  tprintf("\n");
747 
748  /* Now find the worst one which is far enough away from the end of the word */
749 
750  non_noise_count = 0;
751  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
752  if (noise_score[i] >= non_noise_limit) {
753  non_noise_count++;
754  }
755  }
756  if (non_noise_count < fixsp_non_noise_limit)
757  return -1;
758 
759  min_noise_blob = i;
760 
761  non_noise_count = 0;
762  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
763  i--) {
764  if (noise_score[i] >= non_noise_limit) {
765  non_noise_count++;
766  }
767  }
768  if (non_noise_count < fixsp_non_noise_limit)
769  return -1;
770 
771  max_noise_blob = i;
772 
773  if (min_noise_blob > max_noise_blob)
774  return -1;
775 
776  *worst_noise_score = small_limit;
777  worst_noise_blob = -1;
778  for (i = min_noise_blob; i <= max_noise_blob; i++) {
779  if (noise_score[i] < *worst_noise_score) {
780  worst_noise_blob = i;
781  *worst_noise_score = noise_score[i];
782  }
783  }
784  return worst_noise_blob;
785 }
const int kBlnXHeight
Definition: normalis.h:24
TWERD * rebuild_word
Definition: pageres.h:259
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
const STRING & unichar_string() const
Definition: ratngs.h:541
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:787
Definition: blobs.h:263
int length() const
Definition: boxword.h:83
REJMAP reject_map
Definition: pageres.h:286
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
tesseract::BoxWord * box_word
Definition: pageres.h:265
int NumBlobs() const
Definition: blobs.h:427
WERD_CHOICE * best_choice
Definition: pageres.h:234
#define ASSERT_HOST(x)
Definition: errcode.h:88
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:707
void tesseract::Tesseract::write_results ( PAGE_RES_IT page_res_it,
char  newline_type,
bool  force_eol 
)

Definition at line 98 of file output.cpp.

100  { // override tilde crunch?
101  WERD_RES *word = page_res_it.word();
102  const UNICHARSET &uchset = *word->uch_set;
103  int i;
104  bool need_reject = false;
105  UNICHAR_ID space = uchset.unichar_to_id(" ");
106 
107  if ((word->unlv_crunch_mode != CR_NONE ||
108  word->best_choice->length() == 0) &&
110  if ((word->unlv_crunch_mode != CR_DELETE) &&
111  (!stats_.tilde_crunch_written ||
112  ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
113  (word->word->space () > 0) &&
114  !word->word->flag (W_FUZZY_NON) &&
115  !word->word->flag (W_FUZZY_SP)))) {
116  if (!word->word->flag (W_BOL) &&
117  (word->word->space () > 0) &&
118  !word->word->flag (W_FUZZY_NON) &&
119  !word->word->flag (W_FUZZY_SP)) {
120  stats_.last_char_was_tilde = false;
121  }
122  need_reject = true;
123  }
124  if ((need_reject && !stats_.last_char_was_tilde) ||
125  (force_eol && stats_.write_results_empty_block)) {
126  /* Write a reject char - mark as rejected unless zero_rejection mode */
127  stats_.last_char_was_tilde = true;
128  stats_.tilde_crunch_written = true;
129  stats_.last_char_was_newline = false;
130  stats_.write_results_empty_block = false;
131  }
132 
133  if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
134  stats_.tilde_crunch_written = false;
135  stats_.last_char_was_newline = true;
136  stats_.last_char_was_tilde = false;
137  }
138 
139  if (force_eol)
140  stats_.write_results_empty_block = true;
141  return;
142  }
143 
144  /* NORMAL PROCESSING of non tilde crunched words */
145 
146  stats_.tilde_crunch_written = false;
147  if (newline_type)
148  stats_.last_char_was_newline = true;
149  else
150  stats_.last_char_was_newline = false;
151  stats_.write_results_empty_block = force_eol; // about to write a real word
152 
153  if (unlv_tilde_crunching &&
154  stats_.last_char_was_tilde &&
155  (word->word->space() == 0) &&
157  (word->best_choice->unichar_id(0) == space)) {
158  /* Prevent adjacent tilde across words - we know that adjacent tildes within
159  words have been removed */
160  word->MergeAdjacentBlobs(0);
161  }
162  if (newline_type ||
164  stats_.last_char_was_tilde = false;
165  else {
166  if (word->reject_map.length () > 0) {
167  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
168  stats_.last_char_was_tilde = true;
169  else
170  stats_.last_char_was_tilde = false;
171  }
172  else if (word->word->space () > 0)
173  stats_.last_char_was_tilde = false;
174  /* else it is unchanged as there are no output chars */
175  }
176 
177  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
178 
179  set_unlv_suspects(word);
180  check_debug_pt (word, 120);
182  tprintf ("Dict word: \"%s\": %d\n",
183  word->best_choice->debug_string().string(),
184  dict_word(*(word->best_choice)));
185  }
186  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
188  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
189  for (i = 0; i < word->best_choice->length(); ++i) {
190  if (word->reject_map[i].rejected())
191  word->reject_map[i].setrej_minimal_rej_accept();
192  }
193  }
195  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
196  for (i = 0; i < word->best_choice->length(); ++i) {
197  if ((word->best_choice->unichar_id(i) != space) &&
198  word->reject_map[i].rejected())
199  word->reject_map[i].setrej_minimal_rej_accept();
200  }
201  }
202  }
203 }
fuzzy nonspace
Definition: werd.h:40
int32_t length() const
Definition: rejctmap.h:223
start of line
Definition: werd.h:32
int length() const
Definition: ratngs.h:303
repeated character
Definition: werd.h:38
fuzzy space
Definition: werd.h:39
end of line
Definition: werd.h:33
REJMAP reject_map
Definition: pageres.h:286
const UNICHARSET * uch_set
Definition: pageres.h:205
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:309
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1863
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
WERD_RES * word() const
Definition: pageres.h:755
uint8_t space()
Definition: werd.h:99
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:273
WERD_CHOICE * best_choice
Definition: pageres.h:234
#define ASSERT_HOST(x)
Definition: errcode.h:88
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:89
int UNICHAR_ID
Definition: unichar.h:34
const STRING debug_string() const
Definition: ratngs.h:505
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:978
WERD * word
Definition: pageres.h:188

Member Data Documentation

int tesseract::Tesseract::applybox_debug = 1

"Debug level"

Definition at line 822 of file tesseractclass.h.

char* tesseract::Tesseract::applybox_exposure_pattern = ".exp"

"Exposure value follows this pattern in the image filename. The names of the image files are expected to be in the form [lang].[fontname].exp[num].tif"

Definition at line 827 of file tesseractclass.h.

bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false

"Learn both character fragments (as is done in the special low exposure mode) as well as unfragmented characters."

Definition at line 831 of file tesseractclass.h.

bool tesseract::Tesseract::applybox_learn_ngrams_mode = false

"Each bounding box is assumed to contain ngrams. Only learn the ngrams whose outlines overlap horizontally."

Definition at line 834 of file tesseractclass.h.

int tesseract::Tesseract::applybox_page = 0

"Page number to apply boxes from"

Definition at line 823 of file tesseractclass.h.

int tesseract::Tesseract::bidi_debug = 0

"Debug level for BiDi"

Definition at line 821 of file tesseractclass.h.

bool tesseract::Tesseract::bland_unrej = false

"unrej potential with no checks"

Definition at line 934 of file tesseractclass.h.

char* tesseract::Tesseract::chs_leading_punct = "('`\""

"Leading punctuation"

Definition at line 874 of file tesseractclass.h.

char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!"

"1st Trailing punctuation"

Definition at line 875 of file tesseractclass.h.

char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\""

"2nd Trailing punctuation"

Definition at line 876 of file tesseractclass.h.

char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]"

"Il1 conflict set"

Definition at line 1050 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_accept_ok = true

"Use acceptability in okstring"

Definition at line 960 of file tesseractclass.h.

int tesseract::Tesseract::crunch_debug = 0

"As it says"

Definition at line 969 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_cert = -10.0

"POTENTIAL crunch cert lt this"

Definition at line 950 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_high_word = 1.5

"Del if word gt xht x this above bl"

Definition at line 954 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_low_word = 0.5

"Del if word gt xht x this below bl"

Definition at line 955 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_max_ht = 3.0

"Del if word ht gt xht x this"

Definition at line 952 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_min_ht = 0.7

"Del if word ht lt xht x this"

Definition at line 951 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_min_width = 3.0

"Del if word width lt xht x this"

Definition at line 953 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_rating = 60

"POTENTIAL crunch rating lt this"

Definition at line 949 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false

"Take out ~^ early?"

Definition at line 941 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_early_merge_tess_fails = true

"Before word crunch?"

Definition at line 940 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_include_numerals = false

"Fiddle alpha figures"

Definition at line 963 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_leave_accept_strings = false

"Don't pot crunch sensible strings"

Definition at line 962 of file tesseractclass.h.

int tesseract::Tesseract::crunch_leave_lc_strings = 4

"Don't crunch words with long lower case strings"

Definition at line 965 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_leave_ok_strings = true

"Don't touch sensible strings"

Definition at line 959 of file tesseractclass.h.

int tesseract::Tesseract::crunch_leave_uc_strings = 4

"Don't crunch words with long upper case strings"

Definition at line 967 of file tesseractclass.h.

int tesseract::Tesseract::crunch_long_repetitions = 3

"Crunch words with long repetitions"

Definition at line 968 of file tesseractclass.h.

double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0

"crunch garbage cert lt this"

Definition at line 944 of file tesseractclass.h.

double tesseract::Tesseract::crunch_poor_garbage_rate = 60

"crunch garbage rating lt this"

Definition at line 945 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_pot_garbage = true

"POTENTIAL crunch garbage"

Definition at line 948 of file tesseractclass.h.

int tesseract::Tesseract::crunch_pot_indicators = 1

"How many potential indicators needed"

Definition at line 958 of file tesseractclass.h.

double tesseract::Tesseract::crunch_pot_poor_cert = -8.0

"POTENTIAL crunch cert lt this"

Definition at line 947 of file tesseractclass.h.

double tesseract::Tesseract::crunch_pot_poor_rate = 40

"POTENTIAL crunch rating lt this"

Definition at line 946 of file tesseractclass.h.

int tesseract::Tesseract::crunch_rating_max = 10

"For adj length in rating per ch"

Definition at line 957 of file tesseractclass.h.

double tesseract::Tesseract::crunch_small_outlines_size = 0.6

"Small if lt xht x this"

Definition at line 956 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_terrible_garbage = true

"As it says"

Definition at line 943 of file tesseractclass.h.

double tesseract::Tesseract::crunch_terrible_rating = 80.0

"crunch rating lt this"

Definition at line 942 of file tesseractclass.h.

bool tesseract::Tesseract::debug_acceptable_wds = false

"Dump word pass/fail chk"

Definition at line 873 of file tesseractclass.h.

int tesseract::Tesseract::debug_fix_space_level = 0

"Contextual fixspace debug"

Definition at line 974 of file tesseractclass.h.

int tesseract::Tesseract::debug_noise_removal = 0

"Debug reassignment of small outlines"

Definition at line 857 of file tesseractclass.h.

int tesseract::Tesseract::debug_x_ht_level = 0

"Reestimate debug"

Definition at line 872 of file tesseractclass.h.

bool tesseract::Tesseract::docqual_excuse_outline_errs = false

"Allow outline errs in unrejection?"

Definition at line 903 of file tesseractclass.h.

bool tesseract::Tesseract::enable_noise_removal = true

"Remove and conditionally reassign small outlines when they confuse layout analysis, determining diacritics vs noise"

Definition at line 856 of file tesseractclass.h.

char* tesseract::Tesseract::file_type = ".tif"

"Filename extension"

Definition at line 1057 of file tesseractclass.h.

int tesseract::Tesseract::fixsp_done_mode = 1

"What constitutes done for spacing"

Definition at line 973 of file tesseractclass.h.

int tesseract::Tesseract::fixsp_non_noise_limit = 1

"How many non-noise blbs either side?"

Definition at line 970 of file tesseractclass.h.

double tesseract::Tesseract::fixsp_small_outlines_size = 0.28

"Small if lt xht x this"

Definition at line 971 of file tesseractclass.h.

bool tesseract::Tesseract::hocr_char_boxes = false

"Add coordinates for each character to hocr output"

Definition at line 939 of file tesseractclass.h.

bool tesseract::Tesseract::hocr_font_info = false

"Add font info to hocr output"

Definition at line 937 of file tesseractclass.h.

bool tesseract::Tesseract::interactive_display_mode = false

"Run interactively?"

Definition at line 1056 of file tesseractclass.h.

int tesseract::Tesseract::jpg_quality = 85

"Set JPEG quality level"

Definition at line 1016 of file tesseractclass.h.

int tesseract::Tesseract::lstm_choice_mode = 0

"Allows alternative symbol choices to be included in the hOCR output. Valid input values are 0, 1, 2 and 3. 0 is the default value. With 1 the alternative symbol choices per timestep are included. With 2 the alternative symbol choices are accumulated per character."

Definition at line 1094 of file tesseractclass.h.

bool tesseract::Tesseract::lstm_use_matrix = 1

"Use ratings matrix/beam search with lstm"

Definition at line 899 of file tesseractclass.h.

int tesseract::Tesseract::min_characters_to_try = 50

"Specify minimum characters to try during OSD"

Definition at line 1019 of file tesseractclass.h.

double tesseract::Tesseract::min_orientation_margin = 7.0

"Min acceptable orientation margin"

Definition at line 1066 of file tesseractclass.h.

int tesseract::Tesseract::min_sane_x_ht_pixels = 8

"Reject any x-ht lt or eq than this"

Definition at line 1051 of file tesseractclass.h.

int tesseract::Tesseract::multilang_debug_level = 0

"Print multilang debug info."

Definition at line 894 of file tesseractclass.h.

double tesseract::Tesseract::noise_cert_basechar = -8.0

"Hingepoint for base char certainty"

Definition at line 860 of file tesseractclass.h.

double tesseract::Tesseract::noise_cert_disjoint = -2.5

"Hingepoint for disjoint certainty"

Definition at line 863 of file tesseractclass.h.

double tesseract::Tesseract::noise_cert_factor = 0.375

"Scaling on certainty diff from Hingepoint"

Definition at line 869 of file tesseractclass.h.

double tesseract::Tesseract::noise_cert_punc = -2.5

"Threshold for new punc char certainty"

Definition at line 866 of file tesseractclass.h.

int tesseract::Tesseract::noise_maxperblob = 8

"Max diacritics to apply to a blob"

Definition at line 870 of file tesseractclass.h.

int tesseract::Tesseract::noise_maxperword = 16

"Max diacritics to apply to a word"

Definition at line 871 of file tesseractclass.h.

char* tesseract::Tesseract::numeric_punctuation = ".,"

"Punct. chs expected WITHIN numbers"

Definition at line 975 of file tesseractclass.h.

int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari documents while performing ocr."

Definition at line 816 of file tesseractclass.h.

char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075"

"Allow NN to unrej"

Definition at line 1049 of file tesseractclass.h.

char* tesseract::Tesseract::outlines_2 = "ij!?%\":;"

"Non standard number of outlines"

Definition at line 901 of file tesseractclass.h.

char* tesseract::Tesseract::outlines_odd = "%| "

"Non standard number of outlines"

Definition at line 900 of file tesseractclass.h.

char* tesseract::Tesseract::page_separator = "\f"

"Page separator (default is form feed control character)"

Definition at line 1087 of file tesseractclass.h.

int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari documents while performing page-segmentation."

Definition at line 812 of file tesseractclass.h.

int tesseract::Tesseract::paragraph_debug_level = 0

"Print paragraph debug info."

Definition at line 895 of file tesseractclass.h.

bool tesseract::Tesseract::paragraph_text_based = true

"Run paragraph detection on the post-text-recognition (more accurate)"

Definition at line 898 of file tesseractclass.h.

bool tesseract::Tesseract::poly_allow_detailed_fx = false

"Allow feature extractors to see the original outline"

Definition at line 1070 of file tesseractclass.h.

bool tesseract::Tesseract::preserve_interword_spaces = false

"Preserve multiple interword spaces"

Definition at line 1085 of file tesseractclass.h.

double tesseract::Tesseract::quality_blob_pc = 0.0

"good_quality_doc gte good blobs limit"

Definition at line 878 of file tesseractclass.h.

double tesseract::Tesseract::quality_char_pc = 0.95

"good_quality_doc gte good char limit"

Definition at line 881 of file tesseractclass.h.

int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2

"alphas in a good word"

Definition at line 882 of file tesseractclass.h.

double tesseract::Tesseract::quality_outline_pc = 1.0

"good_quality_doc lte outline error limit"

Definition at line 880 of file tesseractclass.h.

double tesseract::Tesseract::quality_rej_pc = 0.08

"good_quality_doc lte rejection limit"

Definition at line 877 of file tesseractclass.h.

double tesseract::Tesseract::quality_rowrej_pc = 1.1

"good_quality_doc gte good char limit"

Definition at line 935 of file tesseractclass.h.

bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true

"Don't double check"

Definition at line 1041 of file tesseractclass.h.

bool tesseract::Tesseract::rej_1Il_use_dict_word = false

"Use dictword test"

Definition at line 1040 of file tesseractclass.h.

bool tesseract::Tesseract::rej_alphas_in_number_perm = false

"Extend permuter check"

Definition at line 1046 of file tesseractclass.h.

bool tesseract::Tesseract::rej_trust_doc_dawg = false

"Use DOC dawg in 1Il conflict detector"

Definition at line 1039 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_good_perm = true

"Individual rejection control"

Definition at line 1044 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_sensible_wd = false

"Extend permuter check"

Definition at line 1045 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_tess_accepted = true

"Individual rejection control"

Definition at line 1042 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_tess_blanks = true

"Individual rejection control"

Definition at line 1043 of file tesseractclass.h.

double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85

"if >this fract"

Definition at line 1047 of file tesseractclass.h.

double tesseract::Tesseract::subscript_max_y_top = 0.5

"Maximum top of a character measured as a multiple of x-height above the baseline for us to reconsider whether it's a subscript."

Definition at line 996 of file tesseractclass.h.

double tesseract::Tesseract::superscript_bettered_certainty = 0.97

"What reduction in badness do we think sufficient to choose a superscript over what we'd thought. For example, a value of 0.6 means we want to reduce badness of certainty by 40%"

Definition at line 988 of file tesseractclass.h.

int tesseract::Tesseract::superscript_debug = 0

"Debug level for sub & superscript fixer"

Definition at line 979 of file tesseractclass.h.

double tesseract::Tesseract::superscript_min_y_bottom = 0.3

"Minimum bottom of a character measured as a multiple of x-height above the baseline for us to reconsider whether it's a superscript."

Definition at line 1000 of file tesseractclass.h.

double tesseract::Tesseract::superscript_scaledown_ratio = 0.4

"A superscript scaled down more than this is unbelievably small. For example, 0.3 means we expect the font size to be no smaller than 30% of the text line font size."

Definition at line 992 of file tesseractclass.h.

double tesseract::Tesseract::superscript_worse_certainty = 2.0

"How many times worse certainty does a superscript position glyph need to be for us to try classifying it as a char with a different baseline?"

Definition at line 983 of file tesseractclass.h.

double tesseract::Tesseract::suspect_accept_rating = -999.9

"Accept good rating limit"

Definition at line 1026 of file tesseractclass.h.

bool tesseract::Tesseract::suspect_constrain_1Il = false

"UNLV keep 1Il chars rejected"

Definition at line 1024 of file tesseractclass.h.

int tesseract::Tesseract::suspect_level = 99

"Suspect marker level"

Definition at line 1021 of file tesseractclass.h.

double tesseract::Tesseract::suspect_rating_per_ch = 999.9

"Don't touch bad rating limit"

Definition at line 1025 of file tesseractclass.h.

int tesseract::Tesseract::suspect_short_words = 2

"Don't Suspect dict wds longer than this"

Definition at line 1023 of file tesseractclass.h.

int tesseract::Tesseract::suspect_space_level = 100

"Min suspect level for rejecting spaces"

Definition at line 1022 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_adaption_debug = false

"Generate and print debug information for adaption"

Definition at line 820 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_ambigs_training = false

"Perform training for ambiguities"

Definition at line 808 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_bigram_debug = 0

"Amount of debug output for bigram correction."

Definition at line 853 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_char_blacklist = ""

"Blacklist of chars not to recognize"

Definition at line 803 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_char_unblacklist = ""

"List of chars to override tessedit_char_blacklist"

Definition at line 806 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_char_whitelist = ""

"Whitelist of chars to recognize"

Definition at line 804 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_consistent_reps = true

"Force all rep chars the same"

Definition at line 1033 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_alto = false

"Write .xml ALTO output file"

Definition at line 1007 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_boxfile = false

"Output text with boxes"

Definition at line 1052 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_hocr = false

"Write .html hOCR output file"

Definition at line 1006 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_lstmbox = false

"Write .box file for LSTM training"

Definition at line 1009 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_pdf = false

"Write .pdf output file"

Definition at line 1013 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_tsv = false

"Write .tsv output file"

Definition at line 1010 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_txt = false

"Write .txt output file"

Definition at line 1005 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_wordstrbox = false

"Write WordStr format .box output file"

Definition at line 1012 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_block_rejection = false

"Block and Row stats"

Definition at line 846 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_doc_rejection = false

"Page stats"

Definition at line 931 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_fonts = false

"Output font info per char"

Definition at line 845 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_quality_metrics = false

"Output data to debug file"

Definition at line 933 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_display_outwords = false

"Draw output words"

Definition at line 835 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 921 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 923 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dump_choices = false

"Dump char choices"

Definition at line 836 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dump_pageseg_images = false

"Dump intermediate images made during page segmentation"

Definition at line 794 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_enable_bigram_correction = true

"Enable correction based on the word bigram dictionary."

Definition at line 848 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_enable_dict_correction = false

"Enable single word correction based on the dictionary."

Definition at line 850 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_enable_doc_dict = true

"Add words to the document dictionary"

Definition at line 844 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true

"Try to improve fuzzy spaces"

Definition at line 838 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_fix_hyphens = true

"Crunch double hyphens?"

Definition at line 841 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_flip_0O = true

"Contextual 0O O0 flips"

Definition at line 1036 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1

"rej good doc wd if more than this fraction rejected"

Definition at line 929 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_good_quality_unrej = true

"Reduce rejection on good docs"

Definition at line 905 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_image_border = 2

"Rej blbs near image edge limit"

Definition at line 1048 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_init_config_only = false

"Only initialize with the config file. Useful if the instance is not going to be used for OCR but say only for layout analysis."

Definition at line 1073 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_load_sublangs = ""

"List of languages to load with this one"

Definition at line 1060 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5

"Aspect ratio dot/hyphen test"

Definition at line 1037 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false

"Generate more boxes from boxed chars"

Definition at line 790 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_matcher_log = false

"Log matcher activity"

Definition at line 888 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false

"Do minimal rejection on pass 1 output"

Definition at line 886 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_minimal_rejection = false

"Only reject tess failures"

Definition at line 1027 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT

"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults to loading and running the most accurate available."

Definition at line 801 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_override_permuter = true

"According to dict_word"

Definition at line 1058 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_page_number = -1

"-1 -> All pages, else specific page to process"

Definition at line 1054 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK

"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block, 5=line, 6=word, 7=char (Values from PageSegMode enum in publictypes.h)"

Definition at line 798 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_parallelize = 0

"Run in parallel where possible"

Definition at line 1083 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_prefer_joined_punct = false

"Reward punctuation joins"

Definition at line 972 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true

"Only rej partially rejected words in block rejection"

Definition at line 917 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2

"Only preserve wds longer than this"

Definition at line 925 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true

"Only rej partially rejected words in row rejection"

Definition at line 919 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_redo_xheight = true

"Check/Correct x-height"

Definition at line 842 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true

"Reject all bad quality wds"

Definition at line 930 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_block_percent = 45.00

"%rej allowed before rej whole block"

Definition at line 910 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00

"%rej allowed before rej whole doc"

Definition at line 908 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_reject_mode = 0

"Rejection algorithm"

Definition at line 1034 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_row_percent = 40.00

"%rej allowed before rej whole row"

Definition at line 912 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_rejection_debug = false

"Rejection debug"

Definition at line 1035 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_resegment_from_boxes = false

"Take segmentation and labeling from box file"

Definition at line 784 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false

"Conversion of word/line box file to char box file"

Definition at line 786 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_row_rej_good_docs = true

"Apply row rejection to good docs"

Definition at line 927 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27

"Adaptation decision algorithm for tess"

Definition at line 884 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_test_adaption = false

"Test adaption criteria"

Definition at line 887 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_test_adaption_mode = 3

"Adaptation decision algorithm for tess"

Definition at line 890 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_timing_debug = false

"Print timing stats"

Definition at line 837 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_train_from_boxes = false

"Generate training data from boxed chars"

Definition at line 788 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_train_line_recognizer = false

"Break input into lines and remap boxes if present"

Definition at line 792 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_unrej_any_wd = false

"Don't bother with word plausibility"

Definition at line 840 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8

"Aspect ratio dot/hyphen test"

Definition at line 1038 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_use_primary_params_model = false

"In multilingual mode use params model of the primary language"

Definition at line 1062 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_use_reject_spaces = true

"Reject spaces?"

Definition at line 906 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00

"Number of row rejects in whole word rejects which prevents whole row rejection"

Definition at line 915 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_word_for_word = false

"Make output have exactly one word per WERD"

Definition at line 1030 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_block_separators = false

"Write block separators in output"

Definition at line 1002 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_images = false

"Capture the image from the IPE"

Definition at line 1055 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_write_params_to_file = ""

"Write all parameters to the given file."

Definition at line 818 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_rep_codes = false

"Write repetition char code"

Definition at line 1003 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_unlv = false

"Write .unlv output file"

Definition at line 1004 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false

"Don't reject ANYTHING AT ALL"

Definition at line 1032 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_zero_rejection = false

"Don't reject ANYTHING"

Definition at line 1028 of file tesseractclass.h.

bool tesseract::Tesseract::test_pt = false

"Test for point"

Definition at line 891 of file tesseractclass.h.

double tesseract::Tesseract::test_pt_x = 99999.99

"xcoord"

Definition at line 892 of file tesseractclass.h.

double tesseract::Tesseract::test_pt_y = 99999.99

"ycoord"

Definition at line 893 of file tesseractclass.h.

bool tesseract::Tesseract::textonly_pdf = false

"Create PDF with only one invisible text layer"

Definition at line 1015 of file tesseractclass.h.

bool tesseract::Tesseract::textord_equation_detect = false

"Turn on equation detector"

Definition at line 1074 of file tesseractclass.h.

double tesseract::Tesseract::textord_tabfind_aligned_gap_fraction = 0.75

"Fraction of height used as a minimum gap for aligned blobs."

Definition at line 1082 of file tesseractclass.h.

bool tesseract::Tesseract::textord_tabfind_force_vertical_text = false

"Force using vertical text page mode"

Definition at line 1077 of file tesseractclass.h.

bool tesseract::Tesseract::textord_tabfind_show_vlines = false

"Debug line finding"

Definition at line 1067 of file tesseractclass.h.

bool tesseract::Tesseract::textord_tabfind_vertical_text = true

"Enable vertical detection"

Definition at line 1075 of file tesseractclass.h.

double tesseract::Tesseract::textord_tabfind_vertical_text_ratio = 0.5

"Fraction of textlines deemed vertical to use vertical page mode"

Definition at line 1080 of file tesseractclass.h.

bool tesseract::Tesseract::textord_use_cjk_fp_model = false

"Use CJK fixed pitch model"

Definition at line 1068 of file tesseractclass.h.

bool tesseract::Tesseract::unlv_tilde_crunching = false

"Mark v.bad words for tilde crunch"

Definition at line 936 of file tesseractclass.h.

char* tesseract::Tesseract::unrecognised_char = "|"

"Output char for unidentified blobs"

Definition at line 1020 of file tesseractclass.h.

int tesseract::Tesseract::user_defined_dpi = 0

"Specify DPI for input image"

Definition at line 1017 of file tesseractclass.h.

int tesseract::Tesseract::x_ht_acceptance_tolerance = 8

"Max allowed deviation of blob top outside of font data"

Definition at line 977 of file tesseractclass.h.

int tesseract::Tesseract::x_ht_min_change = 8

"Min change in xht before actually trying it"

Definition at line 978 of file tesseractclass.h.


The documentation for this class was generated from the following files: