tesseract  4.1.0
tesseract::Wordrec Class Reference

#include <wordrec.h>

Inheritance diagram for tesseract::Wordrec:
tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Tesseract

Public Member Functions

 Wordrec ()
 
 ~Wordrec () override=default
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (SPLIT *split)
 
PRIORITY grade_sharpness (SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, int16_t num_blobs)
 
void get_fragment_lists (int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
program_editup

Initialize all the things in the program that need to be initialized. init_permute determines whether to initialize the permute functions and Dawg models.

void program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
cc_recog

Recognize a word.

void cc_recog (WERD_RES *word)
 
program_editdown

This function holds any necessary post processing for the Wise Owl program.

void program_editdown (int32_t elasped_time)
 
set_pass1

Get ready to do some pass 1 stuff.

void set_pass1 ()
 
set_pass2

Get ready to do some pass 2 stuff.

void set_pass2 ()
 
end_recog

Cleanup and exit the recog program.

int end_recog ()
 
call_matcher

Called from Tess with a blob in tess form. The blob may need rotating to the correct orientation for classification.

BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
dict_word()

Test the dictionaries, returning NO_PERM (0) if not found, or one of the PermuterType values if found, according to the dictionary.

int dict_word (const WERD_CHOICE &word)
 
classify_blob

Classify this blob if it is not already recorded in the match table. Attempt to recognize this blob as a character. The recognition rating for this blob will be stored as a part of the blob. This value will also be returned to the caller.

Parameters
blob: Current blob
string: The string to display in ScrollView
color: The colour to use when displayed with ScrollView
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
point_priority

Assign a priority to an edge point that might be used as part of a split. The argument should be of type EDGEPT.

PRIORITY point_priority (EDGEPT *point)
 
add_point_to_list

Add an edge point to a POINT_GROUP containing a list of other points.

void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
angle_change

Return the change in angle (degrees) of the line segments between points one and two, and two and three.

int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
pick_close_point

Choose the edge point that is closest to the critical point. This point may not be exactly vertical from the critical point.

EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
prioritize_points

Find a list of edge points from the outer outline of this blob. For each of these points assign a priority. Sort these points using a heap structure so that they can be visited in order.

void prioritize_points (TESSLINE *outline, PointHeap *points)
 
new_min_point

Found a new minimum point; try to decide whether to save it or not. Return the new value for the local minimum. If a point is saved then the local minimum is reset to nullptr.

void new_min_point (EDGEPT *local_min, PointHeap *points)
 
new_max_point

Found a new maximum point; try to decide whether to save it or not. Return the new value for the local maximum. If a point is saved then the local maximum is reset to nullptr.

void new_max_point (EDGEPT *local_max, PointHeap *points)
 
vertical_projection_point

For one point on the outline, find the corresponding point on the other side of the outline that is a likely projection for a split point. This is done by iterating through the edge points until the X value of the point being looked at is greater than the X value of the split point. Ensure that the point being returned is not right next to the split point. Return the edge point in *best_point as a result, and any points that were newly created are also saved on the new_points list.

void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
attempt_blob_chop

Try to split this blob after this one. Check to make sure that it was successful.

SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
improve_one_blob

Finds the best place to chop, based on the worst blob, fixpt, or next to a fragment, according to the input. Returns the SEAM corresponding to the chop point, if any is found, and the index in the ratings_matrix of the chopped blob. Note that blob_choices is just a copy of the pointers in the leading diagonal of the ratings MATRIX. Although the blob is chopped, the returned SEAM is yet to be inserted into word->seam_array and the resulting blobs are unclassified, so this function can be used by ApplyBox as well as during recognition.

SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
chop_one_blob

Start with the current one-blob word and its classification. Find the worst blob and try to divide it up to improve the ratings. Used for testing chopper.

SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
 
chop_word_main

Classify the blobs in this word and permute the results. Find the worst blob in the word and chop it up. Continue this process until a good answer has been found or all the blobs have been chopped up enough. The results are returned in the WERD_RES.

void chop_word_main (WERD_RES *word)
 
improve_by_chopping

Repeatedly chops the worst blob, classifying the new blobs, fixing up all the data, and incrementally running the segmentation search until a good word is found, or no more chops can be found.

void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
 ~Classify () override
 
virtual Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
 ~CCStruct () override
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()=default
 
 ~CUtil () override
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool merge_fragments_in_matrix = true
 
bool wordrec_no_block = false
 
bool wordrec_enable_assoc = true
 
bool force_word_assoc = false
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = false
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = false
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
std::unique_ptr< LanguageModel > language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = false
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = true
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = false
 
bool matcher_debug_separate_windows = false
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = false
 
bool use_ambigs_for_adaption = false
 

Protected Member Functions

bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 192 of file wordrec.h.

Constructor & Destructor Documentation

tesseract::Wordrec::Wordrec ( )

Definition at line 47 of file wordrec.cpp.

47 Wordrec::Wordrec() :
48  // control parameters
49  BOOL_MEMBER(merge_fragments_in_matrix, true,
50  "Merge the fragments in the ratings matrix and delete them"
51  " after merging", params()),
52  BOOL_MEMBER(wordrec_no_block, false, "Don't output block information",
53  params()),
54  BOOL_MEMBER(wordrec_enable_assoc, true, "Associator Enable",
55  params()),
56  BOOL_MEMBER(force_word_assoc, false,
57  "force associator to run regardless of what enable_assoc is."
58  " This is used for CJK where component grouping is necessary.",
59  CCUtil::params()),
60  double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state",
61  params()),
62  BOOL_MEMBER(fragments_guide_chopper, false,
63  "Use information from fragments to guide chopping process",
64  params()),
65  INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped",
66  params()),
67  double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit",
68  params()),
69  INT_MEMBER(chop_debug, 0, "Chop debug",
70  params()),
71  BOOL_MEMBER(chop_enable, 1, "Chop enable",
72  params()),
73  BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep",
74  params()),
75  INT_MEMBER(chop_split_length, 10000, "Split Length",
76  params()),
77  INT_MEMBER(chop_same_distance, 2, "Same distance",
78  params()),
79  INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
80  params()),
81  INT_MEMBER(chop_seam_pile_size, 150, "Max number of seams in seam_pile",
82  params()),
83  BOOL_MEMBER(chop_new_seam_pile, 1, "Use new seam_pile", params()),
84  INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend",
85  params()),
86  INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area",
87  params()),
88  double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment",
89  params()),
90  double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment",
91  params()),
92  double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
93  params()),
94  INT_MEMBER(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs "
95  "above which we don't care that a chop is not near the center.",
96  params()),
97  double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
98  params()),
99  double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
100  params()),
101  double_MEMBER(chop_ok_split, 100.0, "OK split limit",
102  params()),
103  double_MEMBER(chop_good_split, 50.0, "Good split limit",
104  params()),
105  INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight",
106  params()),
107  INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug",
108  params()),
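109  BOOL_MEMBER(assume_fixed_pitch_char_segment, false,
110  "include fixed-pitch heuristics in char segmentation",
111  params()),
112  INT_MEMBER(wordrec_debug_level, 0,
113  "Debug level for wordrec", params()),
114  INT_MEMBER(wordrec_max_join_chunks, 4,
115  "Max number of broken pieces to associate", params()),
116  BOOL_MEMBER(wordrec_skip_no_truth_words, false,
117  "Only run OCR for words that had truth recorded in BlamerBundle",
118  params()),
119  BOOL_MEMBER(wordrec_debug_blamer, false,
120  "Print blamer debug messages", params()),
121  BOOL_MEMBER(wordrec_run_blamer, false,
122  "Try to set the blame for errors", params()),
123  INT_MEMBER(segsearch_debug_level, 0,
124  "SegSearch debug level", params()),
125  INT_MEMBER(segsearch_max_pain_points, 2000,
126  "Maximum number of pain points stored in the queue",
127  params()),
128  INT_MEMBER(segsearch_max_futile_classifications, 10,
129  "Maximum number of pain point classifications per chunk that"
130  " did not result in finding a better word choice.",
131  params()),
132  double_MEMBER(segsearch_max_char_wh_ratio, 2.0,
133  "Maximum character width-to-height ratio", params()),
134  BOOL_MEMBER(save_alt_choices, true,
135  "Save alternative paths found during chopping"
136  " and segmentation search",
137  params()),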
138  pass2_ok_split(0.0f) {
139  prev_word_best_choice_ = nullptr;
140  language_model_.reset(new LanguageModel(&get_fontinfo_table(),
141  &(getDict())));
142  fill_lattice_ = nullptr;
143 }
ParamsVectors * params()
Definition: ccutil.h:65
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
bool merge_fragments_in_matrix
Definition: wordrec.h:197
bool wordrec_enable_assoc
Definition: wordrec.h:199
bool wordrec_no_block
Definition: wordrec.h:198
int segsearch_max_pain_points
Definition: wordrec.h:240
int segsearch_max_futile_classifications
Definition: wordrec.h:242
int chop_min_outline_area
Definition: wordrec.h:217
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:485
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:324
bool wordrec_run_blamer
Definition: wordrec.h:237
int chop_seam_pile_size
Definition: wordrec.h:214
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
virtual Dict & getDict()
Definition: classify.h:107
double chop_overlap_knob
Definition: wordrec.h:219
double chop_split_dist_knob
Definition: wordrec.h:218
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:230
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:318
double chop_sharpness_knob
Definition: wordrec.h:223
int chop_min_outline_points
Definition: wordrec.h:213
bool fragments_guide_chopper
Definition: wordrec.h:205
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:315
bool chop_new_seam_pile
Definition: wordrec.h:215
double tessedit_certainty_threshold
Definition: wordrec.h:207
double chop_good_split
Definition: wordrec.h:226
double chop_center_knob
Definition: wordrec.h:220
int chop_centered_maxwidth
Definition: wordrec.h:222
int wordrec_max_join_chunks
Definition: wordrec.h:233
int wordrec_debug_level
Definition: wordrec.h:231
int segment_adjust_debug
Definition: wordrec.h:228
double chop_ok_split
Definition: wordrec.h:225
bool save_alt_choices
Definition: wordrec.h:247
bool chop_vertical_creep
Definition: wordrec.h:210
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
int repair_unchopped_blobs
Definition: wordrec.h:206
bool wordrec_debug_blamer
Definition: wordrec.h:236
bool wordrec_skip_no_truth_words
Definition: wordrec.h:235
int segsearch_debug_level
Definition: wordrec.h:238
double wordrec_worst_state
Definition: wordrec.h:203
int chop_same_distance
Definition: wordrec.h:212
double chop_width_change_knob
Definition: wordrec.h:224
bool force_word_assoc
Definition: wordrec.h:202
PRIORITY pass2_ok_split
Definition: wordrec.h:477
tesseract::Wordrec::~Wordrec ( )
override default

Member Function Documentation

void tesseract::Wordrec::add_point_to_list ( PointHeap * point_heap,
EDGEPT * point
)

Definition at line 57 of file chop.cpp.

57  {
58  if (point_heap->size() < MAX_NUM_POINTS - 2) {
59  PointPair pair(point_priority(point), point);
60  point_heap->Push(&pair);
61  }
62 
63 #ifndef GRAPHICS_DISABLED
64  if (chop_debug > 2)
65  mark_outline(point);
66 #endif
67 }
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:47
void Push(Pair *entry)
Definition: genericheap.h:95
#define MAX_NUM_POINTS
Definition: chop.h:33
void mark_outline(EDGEPT *edgept)
Definition: plotedges.cpp:87
void tesseract::Wordrec::add_seam_to_queue ( float  new_priority,
SEAM * new_seam,
SeamQueue * seams
)

Definition at line 66 of file findseam.cpp.

67  {
68  if (new_seam == nullptr) return;
69  if (chop_debug) {
70  tprintf("Pushing new seam with priority %g :", new_priority);
71  new_seam->Print("seam: ");
72  }
73  if (seams->size() >= MAX_NUM_SEAMS) {
74  SeamPair old_pair(0, nullptr);
75  if (seams->PopWorst(&old_pair) && old_pair.key() <= new_priority) {
76  if (chop_debug) {
77  tprintf("Old seam staying with priority %g\n", old_pair.key());
78  }
79  delete new_seam;
80  seams->Push(&old_pair);
81  return;
82  } else if (chop_debug) {
83  tprintf("New seam with priority %g beats old worst seam with %g\n",
84  new_priority, old_pair.key());
85  }
86  }
87  SeamPair new_pair(new_priority, new_seam);
88  seams->Push(&new_pair);
89 }
void Print(const char *label) const
Definition: seam.cpp:154
void Push(Pair *entry)
Definition: genericheap.h:95
#define MAX_NUM_SEAMS
Definition: findseam.cpp:49
bool PopWorst(Pair *entry)
Definition: genericheap.h:140
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int tesseract::Wordrec::angle_change ( EDGEPT * point1,
EDGEPT * point2,
EDGEPT * point3
)

Definition at line 81 of file chop.cpp.

81  {
82  VECTOR vector1;
83  VECTOR vector2;
84 
85  int angle;
86 
87  /* Compute angle */
88  vector1.x = point2->pos.x - point1->pos.x;
89  vector1.y = point2->pos.y - point1->pos.y;
90  vector2.x = point3->pos.x - point2->pos.x;
91  vector2.y = point3->pos.y - point2->pos.y;
92  /* Use cross product */
93  float length = std::sqrt(static_cast<float>(LENGTH(vector1)) * LENGTH(vector2));
94  if (static_cast<int>(length) == 0)
95  return (0);
96  angle = static_cast<int>(floor(asin(CROSS (vector1, vector2) /
97  length) / M_PI * 180.0 + 0.5));
98 
99  /* Use dot product */
100  if (SCALAR (vector1, vector2) < 0)
101  angle = 180 - angle;
102  /* Adjust angle */
103  if (angle > 180)
104  angle -= 360;
105  if (angle <= -180)
106  angle += 360;
107  return (angle);
108 }
int16_t x
Definition: blobs.h:73
#define LENGTH(a)
Definition: vecfuncs.h:65
TPOINT pos
Definition: blobs.h:165
int16_t y
Definition: blobs.h:74
Definition: blobs.h:52
#define SCALAR(a, b)
Definition: vecfuncs.h:56
#define CROSS(a, b)
Definition: vecfuncs.h:47
SEAM * tesseract::Wordrec::attempt_blob_chop ( TWERD * word,
TBLOB * blob,
int32_t  blob_number,
bool  italic_blob,
const GenericVector< SEAM * > &  seams 
)

Definition at line 211 of file chopper.cpp.

213  {
214  if (repair_unchopped_blobs)
215  preserve_outline_tree (blob->outlines);
216  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
217  // Insert it into the word.
218  word->blobs.insert(other_blob, blob_number + 1);
219 
220  SEAM *seam = nullptr;
221  if (prioritize_division) {
222  TPOINT location;
223  if (divisible_blob(blob, italic_blob, &location)) {
224  seam = new SEAM(0.0f, location);
225  }
226  }
227  if (seam == nullptr)
228  seam = pick_good_seam(blob);
229  if (chop_debug) {
230  if (seam != nullptr)
231  seam->Print("Good seam picked=");
232  else
233  tprintf("\n** no seam picked *** \n");
234  }
235  if (seam) {
236  seam->ApplySeam(italic_blob, blob, other_blob);
237  }
238 
239  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
240  seams, seam);
241  if (seam == nullptr) {
242  if (repair_unchopped_blobs)
243  restore_outline_tree(blob->outlines);
244  if (allow_blob_division && !prioritize_division) {
245  // If the blob can simply be divided into outlines, then do that.
246  TPOINT location;
247  if (divisible_blob(blob, italic_blob, &location)) {
248  other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
249  word->blobs.insert(other_blob, blob_number + 1);
250  seam = new SEAM(0.0f, location);
251  seam->ApplySeam(italic_blob, blob, other_blob);
252  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
253  seams, seam);
254  }
255  }
256  }
257  if (seam != nullptr) {
258  // Make sure this seam doesn't get chopped again.
259  seam->Finalize();
260  }
261  return seam;
262 }
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:917
void Print(const char *label) const
Definition: seam.cpp:154
void Finalize()
Definition: seam.h:110
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
SEAM * pick_good_seam(TBLOB *blob)
Definition: findseam.cpp:217
TESSLINE * outlines
Definition: blobs.h:379
Definition: blobs.h:263
bool prioritize_division
Definition: classify.h:428
void ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:118
static TBLOB * ShallowCopy(const TBLOB &src)
Definition: blobs.cpp:339
Definition: blobs.h:52
Definition: seam.h:38
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void insert(const T &t, int index)
bool allow_blob_division
Definition: classify.h:423
int repair_unchopped_blobs
Definition: wordrec.h:206
BLOB_CHOICE_LIST * tesseract::Wordrec::call_matcher ( TBLOB *  blob)

Definition at line 140 of file tface.cpp.

140  {
141  // Rotate the blob for classification if necessary.
142  TBLOB* rotated_blob = tessblob->ClassifyNormalizeIfNeeded();
143  if (rotated_blob == nullptr) {
144  rotated_blob = tessblob;
145  }
146  auto *ratings = new BLOB_CHOICE_LIST(); // matcher result
147  AdaptiveClassifier(rotated_blob, ratings);
148  if (rotated_blob != tessblob) {
149  delete rotated_blob;
150  }
151  return ratings;
152 }
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:350
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:191
Definition: blobs.h:263
void tesseract::Wordrec::CallFillLattice ( const MATRIX &  ratings,
const WERD_CHOICE_LIST &  best_choices,
const UNICHARSET &  unicharset,
BlamerBundle *  blamer_bundle 
)
inline

Definition at line 264 of file wordrec.h.

267  {
268  (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
269  }
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:485
void tesseract::Wordrec::cc_recog ( WERD_RES *  word)

Definition at line 125 of file tface.cpp.

125  {
126  getDict().reset_hyphen_vars(word->word->flag(W_EOL));
127  chop_word_main(word);
128  word->DebugWordChoices(getDict().stopper_debug_level >= 1,
129  getDict().word_to_debug.string());
130  ASSERT_HOST(word->StatesAllValid());
131 }
virtual Dict & getDict()
Definition: classify.h:107
void chop_word_main(WERD_RES *word)
Definition: chopper.cpp:391
end of line
Definition: werd.h:33
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:484
bool StatesAllValid()
Definition: pageres.cpp:462
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:28
#define ASSERT_HOST(x)
Definition: errcode.h:88
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
WERD * word
Definition: pageres.h:188
void tesseract::Wordrec::choose_best_seam ( SeamQueue *  seam_queue,
const SPLIT *  split,
PRIORITY  priority,
SEAM **  seam_result,
TBLOB *  blob,
SeamPile *  seam_pile 
)

Definition at line 105 of file findseam.cpp.

107  {
108  SEAM *seam;
109  char str[80];
110  float my_priority;
111  /* Add seam of split */
112  my_priority = priority;
113  if (split != nullptr) {
114  TPOINT split_point = split->point1->pos;
115  split_point += split->point2->pos;
116  split_point /= 2;
117  seam = new SEAM(my_priority, split_point, *split);
118  if (chop_debug > 1) seam->Print("Partial priority ");
119  add_seam_to_queue(my_priority, seam, seam_queue);
120 
121  if (my_priority > chop_good_split)
122  return;
123  }
124 
125  TBOX bbox = blob->bounding_box();
126  /* Queue loop */
127  while (!seam_queue->empty()) {
128  SeamPair seam_pair;
129  seam_queue->Pop(&seam_pair);
130  seam = seam_pair.extract_data();
131  /* Set full priority */
132  my_priority = seam->FullPriority(bbox.left(), bbox.right(),
133  chop_overlap_knob, chop_centered_maxwidth,
134  chop_center_knob, chop_width_change_knob);
135  if (chop_debug) {
136  sprintf (str, "Full my_priority %0.0f, ", my_priority);
137  seam->Print(str);
138  }
139 
140  if ((*seam_result == nullptr || (*seam_result)->priority() > my_priority) &&
141  my_priority < chop_ok_split) {
142  /* No crossing */
143  if (seam->IsHealthy(*blob, chop_min_outline_points,
144  chop_min_outline_area)) {
145  delete *seam_result;
146  *seam_result = new SEAM(*seam);
147  (*seam_result)->set_priority(my_priority);
148  } else {
149  delete seam;
150  seam = nullptr;
151  my_priority = BAD_PRIORITY;
152  }
153  }
154 
155  if (my_priority < chop_good_split) {
156  delete seam;
157  return; /* Made good answer */
158  }
159 
160  if (seam) {
161  /* Combine with others */
162  if (seam_pile->size() < chop_seam_pile_size) {
163  combine_seam(*seam_pile, seam, seam_queue);
164  SeamDecPair pair(seam_pair.key(), seam);
165  seam_pile->Push(&pair);
166  } else if (chop_new_seam_pile &&
167  seam_pile->size() == chop_seam_pile_size &&
168  seam_pile->PeekTop().key() > seam_pair.key()) {
169  combine_seam(*seam_pile, seam, seam_queue);
170  SeamDecPair pair;
171  seam_pile->Pop(&pair); // pop the worst.
172  // Replace the seam in pair (deleting the old one) with
173  // the new seam and score, then push back into the heap.
174  pair.set_key(seam_pair.key());
175  pair.set_data(seam);
176  seam_pile->Push(&pair);
177  } else {
178  delete seam;
179  }
180  }
181 
182  my_priority = seam_queue->empty() ? NO_FULL_PRIORITY
183  : seam_queue->PeekTop().key();
184  if ((my_priority > chop_ok_split) ||
185  (my_priority > chop_good_split && split))
186  return;
187  }
188 }
void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
Definition: findseam.cpp:198
void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams)
Definition: findseam.cpp:66
EDGEPT * point2
Definition: split.h:104
void Print(const char *label) const
Definition: seam.cpp:154
Definition: rect.h:34
void Push(Pair *entry)
Definition: genericheap.h:95
int chop_min_outline_area
Definition: wordrec.h:217
void set_data(Data *new_data)
Definition: kdpair.h:126
int chop_seam_pile_size
Definition: wordrec.h:214
double chop_overlap_knob
Definition: wordrec.h:219
TBOX bounding_box() const
Definition: blobs.cpp:472
int chop_min_outline_points
Definition: wordrec.h:213
TPOINT pos
Definition: blobs.h:165
const Key & key() const
Definition: kdpair.h:116
bool chop_new_seam_pile
Definition: wordrec.h:215
float priority() const
Definition: seam.h:59
bool IsHealthy(const TBLOB &blob, int min_points, int min_area) const
Definition: seam.cpp:66
double chop_good_split
Definition: wordrec.h:226
const Pair & PeekTop() const
Definition: genericheap.h:108
Definition: blobs.h:52
double chop_center_knob
Definition: wordrec.h:220
#define BAD_PRIORITY
Definition: findseam.cpp:53
int chop_centered_maxwidth
Definition: wordrec.h:222
Definition: seam.h:38
Data * extract_data()
Definition: kdpair.h:131
#define NO_FULL_PRIORITY
Definition: findseam.cpp:51
float FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth, double center_knob, double width_change_knob) const
Definition: seam.cpp:239
double chop_ok_split
Definition: wordrec.h:225
int16_t right() const
Definition: rect.h:79
int16_t left() const
Definition: rect.h:72
EDGEPT * point1
Definition: split.h:103
void set_key(const Key &new_key)
Definition: kdpair.h:119
bool Pop(Pair *entry)
Definition: genericheap.h:118
double chop_width_change_knob
Definition: wordrec.h:224
bool empty() const
Definition: genericheap.h:68
SEAM * tesseract::Wordrec::chop_numbered_blob ( TWERD *  word,
int32_t  blob_number,
bool  italic_blob,
const GenericVector< SEAM * > &  seams 
)

Definition at line 265 of file chopper.cpp.

267  {
268  return attempt_blob_chop(word, word->blobs[blob_number], blob_number,
269  italic_blob, seams);
270 }
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
Definition: chopper.cpp:211
SEAM * tesseract::Wordrec::chop_one_blob ( const GenericVector< TBOX > &  boxes,
const GenericVector< BLOB_CHOICE * > &  blob_choices,
WERD_RES *  word_res,
int *  blob_number 
)

Definition at line 371 of file chopper.cpp.

374  {
375  if (prioritize_division) {
376  return chop_overlapping_blob(boxes, true, word_res, blob_number);
377  } else {
378  return improve_one_blob(blob_choices, nullptr, false, true, word_res,
379  blob_number);
380  }
381 }
bool prioritize_division
Definition: classify.h:428
SEAM * chop_overlapping_blob(const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:273
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:327
SEAM * tesseract::Wordrec::chop_overlapping_blob ( const GenericVector< TBOX > &  boxes,
bool  italic_blob,
WERD_RES *  word_res,
int *  blob_number 
)

Definition at line 273 of file chopper.cpp.

275  {
276  TWERD *word = word_res->chopped_word;
277  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
278  TBLOB *blob = word->blobs[*blob_number];
279  TPOINT topleft, botright;
280  topleft.x = blob->bounding_box().left();
281  topleft.y = blob->bounding_box().top();
282  botright.x = blob->bounding_box().right();
283  botright.y = blob->bounding_box().bottom();
284 
285  TPOINT original_topleft, original_botright;
286  word_res->denorm.DenormTransform(nullptr, topleft, &original_topleft);
287  word_res->denorm.DenormTransform(nullptr, botright, &original_botright);
288 
289  TBOX original_box = TBOX(original_topleft.x, original_botright.y,
290  original_botright.x, original_topleft.y);
291 
292  bool almost_equal_box = false;
293  int num_overlap = 0;
294  for (int i = 0; i < boxes.size(); i++) {
295  if (original_box.overlap_fraction(boxes[i]) > 0.125)
296  num_overlap++;
297  if (original_box.almost_equal(boxes[i], 3))
298  almost_equal_box = true;
299  }
300 
301  TPOINT location;
302  if (divisible_blob(blob, italic_blob, &location) ||
303  (!almost_equal_box && num_overlap > 1)) {
304  SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
305  italic_blob, word_res->seam_array);
306  if (seam != nullptr)
307  return seam;
308  }
309  }
310 
311  *blob_number = -1;
312  return nullptr;
313 }
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:917
int16_t top() const
Definition: rect.h:58
Definition: rect.h:34
int16_t x
Definition: blobs.h:73
Definition: blobs.h:397
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
TBOX bounding_box() const
Definition: blobs.cpp:472
GenericVector< SEAM * > seam_array
Definition: pageres.h:216
Definition: blobs.h:263
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
int16_t y
Definition: blobs.h:74
Definition: blobs.h:52
Definition: seam.h:38
void DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const
Definition: normalis.cpp:390
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
Definition: chopper.cpp:211
int16_t right() const
Definition: rect.h:79
int16_t bottom() const
Definition: rect.h:65
int NumBlobs() const
Definition: blobs.h:427
int16_t left() const
Definition: rect.h:72
TWERD * chopped_word
Definition: pageres.h:214
int size() const
Definition: genericvector.h:70
DENORM denorm
Definition: pageres.h:203
void tesseract::Wordrec::chop_word_main ( WERD_RES *  word)

Definition at line 391 of file chopper.cpp.

391  {
392  int num_blobs = word->chopped_word->NumBlobs();
393  if (word->ratings == nullptr) {
394  word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
395  }
396  if (word->ratings->get(0, 0) == nullptr) {
397  // Run initial classification.
398  for (int b = 0; b < num_blobs; ++b) {
399  BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
400  "Initial:", word->chopped_word,
401  word->blamer_bundle);
402  word->ratings->put(b, b, choices);
403  }
404  } else {
405  // Blobs have been pre-classified. Set matrix cell for all blob choices
406  for (int col = 0; col < word->ratings->dimension(); ++col) {
407  for (int row = col; row < word->ratings->dimension() &&
408  row < col + word->ratings->bandwidth(); ++row) {
409  BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
410  if (choices != nullptr) {
411  BLOB_CHOICE_IT bc_it(choices);
412  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
413  bc_it.data()->set_matrix_cell(col, row);
414  }
415  }
416  }
417  }
418  }
419 
420  // Run Segmentation Search.
421  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
422  SegSearch(word, &best_choice_bundle, word->blamer_bundle);
423 
424  if (word->best_choice == nullptr) {
425  // SegSearch found no valid paths, so just use the leading diagonal.
427  }
428  word->RebuildBestState();
429  // If we finished without a hyphen at the end of the word, let the next word
430  // be found in the dictionary.
431  if (word->word->flag(W_EOL) &&
432  !getDict().has_hyphen_end(*word->best_choice)) {
433  getDict().reset_hyphen_vars(true);
434  }
435 
436  if (word->blamer_bundle != nullptr && this->fill_lattice_ != nullptr) {
437  CallFillLattice(*word->ratings, word->best_choices,
438  *word->uch_set, word->blamer_bundle);
439  }
440  if (wordrec_debug_level > 0) {
441  tprintf("Final Ratings Matrix:\n");
442  word->ratings->print(getDict().getUnicharset());
443  }
444  word->FilterWordChoices(getDict().stopper_debug_level);
445 }
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:517
virtual Dict & getDict()
Definition: classify.h:107
GenericVector< SEAM * > seam_array
Definition: pageres.h:216
WERD_CHOICE_LIST best_choices
Definition: pageres.h:242
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:902
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
T get(ICOORD pos) const
Definition: matrix.h:231
end of line
Definition: werd.h:33
const UNICHARSET * uch_set
Definition: pageres.h:205
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:264
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:50
int wordrec_max_join_chunks
Definition: wordrec.h:233
int wordrec_debug_level
Definition: wordrec.h:231
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:147
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
Definition: matrix.h:578
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:28
int NumBlobs() const
Definition: blobs.h:427
WERD_CHOICE * best_choice
Definition: pageres.h:234
MATRIX * ratings
Definition: pageres.h:230
int dimension() const
Definition: matrix.h:536
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
TWERD * chopped_word
Definition: pageres.h:214
void RebuildBestState()
Definition: pageres.cpp:812
int bandwidth() const
Definition: matrix.h:538
WERD * word
Definition: pageres.h:188
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:42
BlamerBundle * blamer_bundle
Definition: pageres.h:245
BLOB_CHOICE_LIST * tesseract::Wordrec::classify_blob ( TBLOB *  blob,
const char *  string,
C_COL  color,
BlamerBundle *  blamer_bundle 
)

Definition at line 54 of file wordclass.cpp.

56  {
57 #ifndef GRAPHICS_DISABLED
58  if (wordrec_display_all_blobs)
59  display_blob(blob, color);
60 #endif
61  // TODO(rays) collapse with call_matcher and move all to wordrec.cpp.
62  BLOB_CHOICE_LIST* choices = call_matcher(blob);
63  // If a blob with the same bounding box as one of the truth character
64  // bounding boxes is not classified as the corresponding truth character
65  // blame character classifier for incorrect answer.
66  if (blamer_bundle != nullptr) {
67  blamer_bundle->BlameClassifier(getDict().getUnicharset(),
68  blob->bounding_box(),
69  *choices,
70  wordrec_debug_blamer);
71  }
72  #ifndef GRAPHICS_DISABLED
73  if (classify_debug_level && string)
74  print_ratings_list(string, choices, getDict().getUnicharset());
75 
76  if (wordrec_blob_pause)
77  window_wait(blob_window);
78 #endif
79 
80  return choices;
81 }
virtual Dict & getDict()
Definition: classify.h:107
TBOX bounding_box() const
Definition: blobs.cpp:472
BLOB_CHOICE_LIST * call_matcher(TBLOB *blob)
Definition: tface.cpp:140
void display_blob(TBLOB *blob, C_COL color)
Definition: render.cpp:56
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
Definition: blamer.cpp:262
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:833
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
ScrollView * blob_window
Definition: render.cpp:35
bool wordrec_display_all_blobs
Definition: render.cpp:41
bool wordrec_blob_pause
Definition: render.cpp:45
bool wordrec_debug_blamer
Definition: wordrec.h:236
BLOB_CHOICE_LIST * tesseract::Wordrec::classify_piece ( const GenericVector< SEAM * > &  seams,
int16_t  start,
int16_t  end,
const char *  description,
TWERD *  word,
BlamerBundle *  blamer_bundle 
)
virtual

Definition at line 50 of file pieces.cpp.

55  {
56  if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end);
57  BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
58  White, blamer_bundle);
59  // Set the matrix_cell_ entries in all the BLOB_CHOICES.
60  BLOB_CHOICE_IT bc_it(choices);
61  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
62  bc_it.data()->set_matrix_cell(start, end);
63  }
64 
65  if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end);
66 
67  return (choices);
68 }
GenericVector< TBLOB * > blobs
Definition: blobs.h:438
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:210
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:188
Definition: callcpp.h:30
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:54
void tesseract::Wordrec::combine_seam ( const SeamPile &  seam_pile,
const SEAM *  seam,
SeamQueue *  seam_queue 
)

Definition at line 198 of file findseam.cpp.

199  {
200  for (int x = 0; x < seam_pile.size(); ++x) {
201  const SEAM *this_one = seam_pile.get(x).data();
202  if (seam->CombineableWith(*this_one, SPLIT_CLOSENESS, chop_ok_split)) {
203  SEAM *new_one = new SEAM(*seam);
204  new_one->CombineWith(*this_one);
205  if (chop_debug > 1) new_one->Print("Combo priority ");
206  add_seam_to_queue(new_one->priority(), new_one, seam_queue);
207  }
208  }
209 }
void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams)
Definition: findseam.cpp:66
void Print(const char *label) const
Definition: seam.cpp:154
void CombineWith(const SEAM &other)
Definition: seam.cpp:54
float priority() const
Definition: seam.h:59
Definition: seam.h:38
double chop_ok_split
Definition: wordrec.h:225
bool CombineableWith(const SEAM &other, int max_x_dist, float max_total_priority) const
Definition: seam.cpp:40
#define SPLIT_CLOSENESS
Definition: findseam.cpp:47
const Pair & get(int index) const
Definition: genericheap.h:87
int tesseract::Wordrec::dict_word ( const WERD_CHOICE &  word)

Definition at line 89 of file tface.cpp.

89  {
90  return getDict().valid_word(word);
91 }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:787
virtual Dict & getDict()
Definition: classify.h:107
void tesseract::Wordrec::DoSegSearch ( WERD_RES *  word_res)

Definition at line 36 of file segsearch.cpp.

36  {
37  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
38  // Run Segmentation Search.
39  SegSearch(word_res, &best_choice_bundle, nullptr);
40 }
MATRIX * ratings
Definition: pageres.h:230
int dimension() const
Definition: matrix.h:536
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:42
int tesseract::Wordrec::end_recog ( )

Definition at line 62 of file tface.cpp.

62  {
63  program_editdown (0);
64 
65  return (0);
66 }
void program_editdown(int32_t elasped_time)
Definition: tface.cpp:75
void tesseract::Wordrec::fill_filtered_fragment_list ( BLOB_CHOICE_LIST *  choices,
int  fragment_pos,
int  num_frag_parts,
BLOB_CHOICE_LIST *  filtered_choices 
)

Definition at line 99 of file pieces.cpp.

102  {
103  BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
104  BLOB_CHOICE_IT choices_it(choices);
105 
106  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
107  choices_it.forward()) {
108  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
109  const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
110 
111  if (frag != nullptr && frag->get_pos() == fragment_pos &&
112  frag->get_total() == num_frag_parts) {
113  // Recover the unichar_id of the unichar that this fragment is
114  // a part of
115  auto *b = new BLOB_CHOICE(*choices_it.data());
116  int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
117  b->set_unichar_id(original_unichar);
118  filtered_choices_it.add_to_end(b);
119  }
120  }
121 
122  filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
123 }
UNICHARSET unicharset
Definition: ccutil.h:71
const char * get_unichar() const
Definition: unicharset.h:70
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
int get_pos() const
Definition: unicharset.h:71
int UNICHAR_ID
Definition: unichar.h:34
int get_total() const
Definition: unicharset.h:72
void tesseract::Wordrec::FillLattice ( const MATRIX &  ratings,
const WERD_CHOICE_LIST &  best_choices,
const UNICHARSET &  unicharset,
BlamerBundle *  blamer_bundle 
)
void tesseract::Wordrec::get_fragment_lists ( int16_t  current_frag,
int16_t  current_row,
int16_t  start,
int16_t  num_frag_parts,
int16_t  num_blobs,
MATRIX *  ratings,
BLOB_CHOICE_LIST *  choice_lists 
)

Definition at line 275 of file pieces.cpp.

278  {
279  if (current_frag == num_frag_parts) {
280  merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
281  choice_lists, ratings);
282  return;
283  }
284 
285  for (int16_t x = current_row; x < num_blobs; x++) {
286  BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
287  if (choices == nullptr)
288  continue;
289 
290  fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
291  &choice_lists[current_frag]);
292  if (!choice_lists[current_frag].empty()) {
293  get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
294  num_blobs, ratings, choice_lists);
295  choice_lists[current_frag].clear();
296  }
297  }
298 }
void merge_and_put_fragment_lists(int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
Definition: pieces.cpp:132
T get(ICOORD pos) const
Definition: matrix.h:231
void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
Definition: pieces.cpp:99
void get_fragment_lists(int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:275
PRIORITY tesseract::Wordrec::grade_sharpness ( SPLIT *  split)

Definition at line 69 of file gradechop.cpp.

69  {
70  PRIORITY grade;
71 
72  grade = point_priority (split->point1) + point_priority (split->point2);
73 
74  if (grade < -360.0)
75  grade = 0;
76  else
77  grade += 360.0;
78 
79  grade *= chop_sharpness_knob; /* Values 0 to -360 */
80 
81  return (grade);
82 }
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:47
EDGEPT * point2
Definition: split.h:104
double chop_sharpness_knob
Definition: wordrec.h:223
float PRIORITY
Definition: seam.h:36
EDGEPT * point1
Definition: split.h:103
PRIORITY tesseract::Wordrec::grade_split_length ( SPLIT *  split)

Definition at line 46 of file gradechop.cpp.

46  {
47  PRIORITY grade;
48  float split_length;
49 
50  split_length =
52 
53  if (split_length <= 0)
54  grade = 0;
55  else
56  grade = sqrt (split_length) * chop_split_dist_knob;
57 
58  return (std::max(0.0f, grade));
59 }
EDGEPT * point2
Definition: split.h:104
double chop_split_dist_knob
Definition: wordrec.h:218
float PRIORITY
Definition: seam.h:36
EDGEPT * point1
Definition: split.h:103
int WeightedDistance(const EDGEPT &other, int x_factor) const
Definition: blobs.h:101
void tesseract::Wordrec::improve_by_chopping ( float  rating_cert_scale,
WERD_RES *  word,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle,
LMPainPoints *  pain_points,
GenericVector< SegSearchPending > *  pending 
)

Definition at line 454 of file chopper.cpp.

459  {
460  int blob_number;
461  do { // improvement loop.
462  // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
463  // one to chop.
464  GenericVector<BLOB_CHOICE*> blob_choices;
465  int num_blobs = word->ratings->dimension();
466  for (int i = 0; i < num_blobs; ++i) {
467  BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
468  if (choices == nullptr || choices->empty()) {
469  blob_choices.push_back(nullptr);
470  } else {
471  BLOB_CHOICE_IT bc_it(choices);
472  blob_choices.push_back(bc_it.data());
473  }
474  }
475  SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
476  false, false, word, &blob_number);
477  if (seam == nullptr) break;
478  // A chop has been made. We have to correct all the data structures to
479  // take into account the extra bottom-level blob.
480  // Put the seam into the seam_array and correct everything else on the
481  // word: ratings matrix (including matrix location in the BLOB_CHOICES),
482  // states in WERD_CHOICEs, and blob widths.
483  word->InsertSeam(blob_number, seam);
484  // Insert a new entry in the beam array.
485  best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
486  // Fixpts are outdated, but will get recalculated.
487  best_choice_bundle->fixpt.clear();
488  // Remap existing pain points.
489  pain_points->RemapForSplit(blob_number);
490  // Insert a new pending at the chop point.
491  pending->insert(SegSearchPending(), blob_number);
492 
493  // Classify the two newly created blobs using ProcessSegSearchPainPoint,
494  // as that updates the pending correctly and adds new pain points.
495  MATRIX_COORD pain_point(blob_number, blob_number);
496  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
497  pain_points, blamer_bundle);
498  pain_point.col = blob_number + 1;
499  pain_point.row = blob_number + 1;
500  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
501  pain_points, blamer_bundle);
502  if (language_model_->language_model_ngram_on) {
503  // N-gram evaluation depends on the number of blobs in a chunk, so we
504  // have to re-evaluate everything in the word.
505  ResetNGramSearch(word, best_choice_bundle, pending);
506  blob_number = 0;
507  }
508  // Run language model incrementally. (Except with the n-gram model on.)
509  UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
510  word, pain_points, best_choice_bundle, blamer_bundle);
511  } while (!language_model_->AcceptableChoiceFound() &&
512  word->ratings->dimension() < kMaxNumChunks);
513 
514  // If after running only the chopper best_choice is incorrect and no blame
515  // has been yet set, blame the classifier if best_choice is classifier's
516  // top choice and is a dictionary word (i.e. language model could not have
517  // helped). Otherwise blame the tradeoff between the classifier and
518  // the old language model (permuters).
519  if (word->blamer_bundle != nullptr &&
521  !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
522  bool valid_permuter = word->best_choice != nullptr &&
525  getDict().getUnicharset(),
526  valid_permuter,
528  }
529 }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
virtual Dict & getDict()
Definition: classify.h:107
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:374
T get(ICOORD pos) const
Definition: matrix.h:231
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:180
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:327
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
Definition: seam.h:38
int push_back(T object)
void insert(const T &t, int index)
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:248
WERD_CHOICE * best_choice
Definition: pageres.h:234
MATRIX * ratings
Definition: pageres.h:230
int dimension() const
Definition: matrix.h:536
bool wordrec_debug_blamer
Definition: wordrec.h:236
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:116
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
Definition: segsearch.cpp:311
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:422
uint8_t permuter() const
Definition: ratngs.h:346
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:465
BlamerBundle * blamer_bundle
Definition: pageres.h:245
SEAM * tesseract::Wordrec::improve_one_blob ( const GenericVector< BLOB_CHOICE * > &  blob_choices,
DANGERR *  fixpt,
bool  split_next_to_fragment,
bool  italic_blob,
WERD_RES *  word,
int *  blob_number 
)

Definition at line 327 of file chopper.cpp.

332  {
333  float rating_ceiling = FLT_MAX;
334  SEAM *seam = nullptr;
335  do {
336  *blob_number = select_blob_to_split_from_fixpt(fixpt);
337  if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number);
338  bool split_point_from_dict = (*blob_number != -1);
339  if (split_point_from_dict) {
340  fixpt->clear();
341  } else {
342  *blob_number = select_blob_to_split(blob_choices, rating_ceiling,
343  split_next_to_fragment);
344  }
345  if (chop_debug) tprintf("blob_number = %d\n", *blob_number);
346  if (*blob_number == -1)
347  return nullptr;
348 
349  // TODO(rays) it may eventually help to allow italic_blob to be true,
350  seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob,
351  word->seam_array);
352  if (seam != nullptr)
353  return seam; // Success!
354  if (blob_choices[*blob_number] == nullptr)
355  return nullptr;
356  if (!split_point_from_dict) {
357  // We chopped the worst rated blob, try something else next time.
358  rating_ceiling = blob_choices[*blob_number]->rating();
359  }
360  } while (true);
361  return seam;
362 }
int select_blob_to_split_from_fixpt(DANGERR *fixpt)
Definition: chopper.cpp:626
GenericVector< SEAM * > seam_array
Definition: pageres.h:216
Definition: seam.h:38
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
TWERD * chopped_word
Definition: pageres.h:214
SEAM * chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
Definition: chopper.cpp:265
int select_blob_to_split(const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
Definition: chopper.cpp:538
void tesseract::Wordrec::InitBlamerForSegSearch ( WERD_RES *  word_res,
LMPainPoints *  pain_points,
BlamerBundle *  blamer_bundle,
STRING *  blamer_debug 
)
protected

Definition at line 328 of file segsearch.cpp.

331  {
332  pain_points->Clear(); // Clear pain points heap.
333  TessResultCallback2<bool, int, int>* pp_cb = NewPermanentTessCallback(
334  pain_points, &LMPainPoints::GenerateForBlamer,
335  static_cast<double>(segsearch_max_char_wh_ratio), word_res);
336  blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings,
337  getDict().WildcardID(), wordrec_debug_blamer,
338  blamer_debug, pp_cb);
339  delete pp_cb;
340 }
virtual Dict & getDict()
Definition: classify.h:107
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2< bool, int, int > *pp_cb)
Definition: blamer.cpp:479
bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res, int col, int row)
WERD_CHOICE * best_choice
Definition: pageres.h:234
MATRIX * ratings
Definition: pageres.h:230
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
bool wordrec_debug_blamer
Definition: wordrec.h:236
void tesseract::Wordrec::InitialSegSearch ( WERD_RES *  word_res,
LMPainPoints *  pain_points,
GenericVector< SegSearchPending > *  pending,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

Definition at line 136 of file segsearch.cpp.

139  {
140  if (segsearch_debug_level > 0) {
141  tprintf("Starting SegSearch on ratings matrix%s:\n",
142  wordrec_enable_assoc ? " (with assoc)" : "");
143  word_res->ratings->print(getDict().getUnicharset());
144  }
145 
146  pain_points->GenerateInitial(word_res);
147 
148  // Compute scaling factor that will help us recover blob outline length
149  // from classifier rating and certainty for the blob.
150  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
151 
152  language_model_->InitForWord(prev_word_best_choice_,
153  assume_fixed_pitch_char_segment,
154  segsearch_max_char_wh_ratio, rating_cert_scale);
155 
156  // Initialize blamer-related information: map character boxes recorded in
157  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
158  // ratings matrix. We expect this step to succeed, since when running the
159  // chopper we checked that the correct chops are present.
160  if (blamer_bundle != nullptr) {
161  blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
162  wordrec_debug_blamer);
163  }
164 
165  // pending[col] tells whether there is update work to do to combine
166  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
167  // As the language model state is updated, pending entries are modified to
168  // minimize duplication of work. It is important that during the update the
169  // children are considered in the non-decreasing order of their column, since
170  // this guarantees that all the parents would be up to date before an update
171  // of a child is done.
172  pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
173 
174  // Search the ratings matrix for the initial best path.
175  (*pending)[0].SetColumnClassified();
176  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
177  pain_points, best_choice_bundle, blamer_bundle);
178 }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
bool wordrec_enable_assoc
Definition: wordrec.h:199
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
virtual Dict & getDict()
Definition: classify.h:107
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:230
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
void init_to_size(int size, const T &t)
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:180
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:412
MATRIX * ratings
Definition: pageres.h:230
int dimension() const
Definition: matrix.h:536
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
TWERD * chopped_word
Definition: pageres.h:214
bool wordrec_debug_blamer
Definition: wordrec.h:236
int segsearch_debug_level
Definition: wordrec.h:238
double certainty_scale
Definition: dict.h:617
bool tesseract::Wordrec::is_inside_angle ( EDGEPT *  pt)

Definition at line 71 of file chop.cpp.

71  {
72  return angle_change(pt->prev, pt, pt->next) < chop_inside_angle;
73 }
int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
Definition: chop.cpp:81
EDGEPT * next
Definition: blobs.h:171
EDGEPT * prev
Definition: blobs.h:172
void tesseract::Wordrec::merge_and_put_fragment_lists ( int16_t  row,
int16_t  column,
int16_t  num_frag_parts,
BLOB_CHOICE_LIST *  choice_lists,
MATRIX *  ratings 
)

Definition at line 132 of file pieces.cpp.

135  {
136  auto *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
137 
138  for (int i = 0; i < num_frag_parts; i++) {
139  choice_lists_it[i].set_to_list(&choice_lists[i]);
140  choice_lists_it[i].mark_cycle_pt();
141  }
142 
143  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
144  if (merged_choice == nullptr)
145  merged_choice = new BLOB_CHOICE_LIST;
146 
147  bool end_of_list = false;
148  BLOB_CHOICE_IT merged_choice_it(merged_choice);
149  while (!end_of_list) {
150  // Find the maximum unichar_id of the current entry the iterators
151  // are pointing at
152  UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
153  for (int i = 0; i < num_frag_parts; i++) {
154  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
155  if (max_unichar_id < unichar_id) {
156  max_unichar_id = unichar_id;
157  }
158  }
159 
160  // Move the each iterators until it gets to an entry that has a
161  // value greater than or equal to max_unichar_id
162  for (int i = 0; i < num_frag_parts; i++) {
163  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
164  while (!choice_lists_it[i].cycled_list() &&
165  unichar_id < max_unichar_id) {
166  choice_lists_it[i].forward();
167  unichar_id = choice_lists_it[i].data()->unichar_id();
168  }
169  if (choice_lists_it[i].cycled_list()) {
170  end_of_list = true;
171  break;
172  }
173  }
174 
175  if (end_of_list)
176  break;
177 
178  // Checks if the fragments are parts of the same character
179  UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
180  bool same_unichar = true;
181  for (int i = 1; i < num_frag_parts; i++) {
182  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
183  if (unichar_id != first_unichar_id) {
184  same_unichar = false;
185  break;
186  }
187  }
188 
189  if (same_unichar) {
190  // Add the merged character to the result
191  UNICHAR_ID merged_unichar_id = first_unichar_id;
192  GenericVector<ScoredFont> merged_fonts =
193  choice_lists_it[0].data()->fonts();
194  float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
195  float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
196  float positive_yshift = 0, negative_yshift = 0;
197  int merged_script_id = choice_lists_it[0].data()->script_id();
198  BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();
199 
200  float merged_rating = 0, merged_certainty = 0;
201  for (int i = 0; i < num_frag_parts; i++) {
202  float rating = choice_lists_it[i].data()->rating();
203  float certainty = choice_lists_it[i].data()->certainty();
204 
205  if (i == 0 || certainty < merged_certainty)
206  merged_certainty = certainty;
207  merged_rating += rating;
208 
209  choice_lists_it[i].forward();
210  if (choice_lists_it[i].cycled_list())
211  end_of_list = true;
212  IntersectRange(choice_lists_it[i].data()->min_xheight(),
213  choice_lists_it[i].data()->max_xheight(),
214  &merged_min_xheight, &merged_max_xheight);
215  float yshift = choice_lists_it[i].data()->yshift();
216  if (yshift > positive_yshift) positive_yshift = yshift;
217  if (yshift < negative_yshift) negative_yshift = yshift;
218  // Use the min font rating over the parts.
219  // TODO(rays) font lists are unsorted. Need to be faster?
220  const GenericVector<ScoredFont>& frag_fonts =
221  choice_lists_it[i].data()->fonts();
222  for (int f = 0; f < frag_fonts.size(); ++f) {
223  int merged_f = 0;
224  for (merged_f = 0; merged_f < merged_fonts.size() &&
225  merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id;
226  ++merged_f) {}
227  if (merged_f == merged_fonts.size()) {
228  merged_fonts.push_back(frag_fonts[f]);
229  } else if (merged_fonts[merged_f].score > frag_fonts[f].score) {
230  merged_fonts[merged_f].score = frag_fonts[f].score;
231  }
232  }
233  }
234 
235  float merged_yshift = positive_yshift != 0
236  ? (negative_yshift != 0 ? 0 : positive_yshift)
237  : negative_yshift;
238  auto* choice = new BLOB_CHOICE(merged_unichar_id,
239  merged_rating,
240  merged_certainty,
241  merged_script_id,
242  merged_min_xheight,
243  merged_max_xheight,
244  merged_yshift,
245  classifier);
246  choice->set_fonts(merged_fonts);
247  merged_choice_it.add_to_end(choice);
248  }
249  }
250 
251  if (classify_debug_level)
252  print_ratings_list("Merged Fragments", merged_choice,
253  unicharset);
254 
255  if (merged_choice->empty())
256  delete merged_choice;
257  else
258  ratings->put(row, column, merged_choice);
259 
260  delete [] choice_lists_it;
261 }
UNICHARSET unicharset
Definition: ccutil.h:71
T get(ICOORD pos) const
Definition: matrix.h:231
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:833
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:145
int push_back(T object)
int UNICHAR_ID
Definition: unichar.h:34
int size() const
Definition: genericvector.h:70
BlobChoiceClassifier
Definition: ratngs.h:41
void tesseract::Wordrec::merge_fragments ( MATRIX *  ratings,
int16_t  num_blobs 
)

Definition at line 307 of file pieces.cpp.

307  {
308  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
309  for (int16_t start = 0; start < num_blobs; start++) {
310  for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
311  frag_parts++) {
312  get_fragment_lists(0, start, start, frag_parts, num_blobs,
313  ratings, choice_lists);
314  }
315  }
316 
317  // Delete fragments from the rating matrix
318  for (int16_t x = 0; x < num_blobs; x++) {
319  for (int16_t y = x; y < num_blobs; y++) {
320  BLOB_CHOICE_LIST *choices = ratings->get(x, y);
321  if (choices != nullptr) {
322  BLOB_CHOICE_IT choices_it(choices);
323  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
324  choices_it.forward()) {
325  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
326  const CHAR_FRAGMENT *frag =
327  unicharset.get_fragment(choice_unichar_id);
328  if (frag != nullptr)
329  delete choices_it.extract();
330  }
331  }
332  }
333  }
334 }
UNICHARSET unicharset
Definition: ccutil.h:71
T get(ICOORD pos) const
Definition: matrix.h:231
static const int kMaxChunks
Definition: unicharset.h:55
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
void get_fragment_lists(int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:275
int UNICHAR_ID
Definition: unichar.h:34
bool tesseract::Wordrec::near_point ( EDGEPT *  point,
EDGEPT *  line_pt_0,
EDGEPT *  line_pt_1,
EDGEPT **  near_pt 
)

Definition at line 40 of file outlines.cpp.

42  {
43  TPOINT p;
44 
45  float slope;
46  float intercept;
47 
48  float x0 = line_pt_0->pos.x;
49  float x1 = line_pt_1->pos.x;
50  float y0 = line_pt_0->pos.y;
51  float y1 = line_pt_1->pos.y;
52 
53  if (x0 == x1) {
54  /* Handle vertical line */
55  p.x = static_cast<int16_t>(x0);
56  p.y = point->pos.y;
57  }
58  else {
59  /* Slope and intercept */
60  slope = (y0 - y1) / (x0 - x1);
61  intercept = y1 - x1 * slope;
62 
63  /* Find perpendicular */
64  p.x = static_cast<int16_t>((point->pos.x + (point->pos.y - intercept) * slope) /
65  (slope * slope + 1));
66  p.y = static_cast<int16_t>(slope * p.x + intercept);
67  }
68 
69  if (is_on_line (p, line_pt_0->pos, line_pt_1->pos) &&
70  (!same_point (p, line_pt_0->pos)) && (!same_point (p, line_pt_1->pos))) {
71  /* Intersection on line */
72  *near_pt = make_edgept(p.x, p.y, line_pt_1, line_pt_0);
73  return true;
74  } else { /* Intersection not on line */
75  *near_pt = closest(point, line_pt_0, line_pt_1);
76  return false;
77  }
78 }
int16_t x
Definition: blobs.h:73
#define closest(test_p, p1, p2)
Definition: outlines.h:67
TPOINT pos
Definition: blobs.h:165
#define same_point(p1, p2)
Definition: outlines.h:45
#define is_on_line(p, p0, p1)
Definition: outlines.h:116
int16_t y
Definition: blobs.h:74
Definition: blobs.h:52
EDGEPT * make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev)
Definition: split.cpp:139
void tesseract::Wordrec::new_max_point ( EDGEPT *  local_max,
PointHeap *  points 
)

Definition at line 237 of file chop.cpp.

237  {
238  int16_t dir;
239 
240  dir = direction (local_max);
241 
242  if (dir > 0) {
243  add_point_to_list(points, local_max);
244  return;
245  }
246 
247  if (dir == 0 && point_priority (local_max) < 0) {
248  add_point_to_list(points, local_max);
249  return;
250  }
251 }
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:47
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:57
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:38
void tesseract::Wordrec::new_min_point ( EDGEPT *  local_min,
PointHeap *  points 
)

Definition at line 213 of file chop.cpp.

213  {
214  int16_t dir;
215 
216  dir = direction (local_min);
217 
218  if (dir < 0) {
219  add_point_to_list(points, local_min);
220  return;
221  }
222 
223  if (dir == 0 && point_priority (local_min) < 0) {
224  add_point_to_list(points, local_min);
225  return;
226  }
227 }
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:47
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:57
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:38
EDGEPT * tesseract::Wordrec::pick_close_point ( EDGEPT *  critical_point,
EDGEPT *  vertical_point,
int *  best_dist 
)

Definition at line 116 of file chop.cpp.

118  {
119  EDGEPT *best_point = nullptr;
120  int this_distance;
121  int found_better;
122 
123  do {
124  found_better = false;
125 
126  this_distance = edgept_dist (critical_point, vertical_point);
127  if (this_distance <= *best_dist) {
128 
129  if (!(same_point (critical_point->pos, vertical_point->pos) ||
130  same_point (critical_point->pos, vertical_point->next->pos) ||
131  (best_point && same_point (best_point->pos, vertical_point->pos)) ||
132  is_exterior_point (critical_point, vertical_point))) {
133  *best_dist = this_distance;
134  best_point = vertical_point;
135  if (chop_vertical_creep)
136  found_better = true;
137  }
138  }
139  vertical_point = vertical_point->next;
140  }
141  while (found_better == true);
142 
143  return (best_point);
144 }
EDGEPT * next
Definition: blobs.h:171
TPOINT pos
Definition: blobs.h:165
#define same_point(p1, p2)
Definition: outlines.h:45
#define is_exterior_point(edge, point)
Definition: outlines.h:93
Definition: blobs.h:78
bool chop_vertical_creep
Definition: wordrec.h:210
#define edgept_dist(p1, p2)
Definition: outlines.h:83
SEAM * tesseract::Wordrec::pick_good_seam ( TBLOB *  blob)

Definition at line 217 of file findseam.cpp.

217  {
218  SeamPile seam_pile(chop_seam_pile_size);
219  EDGEPT *points[MAX_NUM_POINTS];
220  EDGEPT_CLIST new_points;
221  SEAM *seam = nullptr;
222  TESSLINE *outline;
223  int16_t num_points = 0;
224 
225 #ifndef GRAPHICS_DISABLED
226  if (chop_debug > 2)
227  wordrec_display_splits.set_value(true);
228 
229  draw_blob_edges(blob);
230 #endif
231 
232  PointHeap point_heap(MAX_NUM_POINTS);
233  for (outline = blob->outlines; outline; outline = outline->next)
234  prioritize_points(outline, &point_heap);
235 
236  while (!point_heap.empty() && num_points < MAX_NUM_POINTS) {
237  points[num_points++] = point_heap.PeekTop().data;
238  point_heap.Pop(nullptr);
239  }
240 
241  /* Initialize queue */
242  SeamQueue seam_queue(MAX_NUM_SEAMS);
243 
244  try_point_pairs(points, num_points, &seam_queue, &seam_pile, &seam, blob);
245  try_vertical_splits(points, num_points, &new_points,
246  &seam_queue, &seam_pile, &seam, blob);
247 
248  if (seam == nullptr) {
249  choose_best_seam(&seam_queue, nullptr, BAD_PRIORITY, &seam, blob, &seam_pile);
250  } else if (seam->priority() > chop_good_split) {
251  choose_best_seam(&seam_queue, nullptr, seam->priority(), &seam, blob,
252  &seam_pile);
253  }
254 
255  EDGEPT_C_IT it(&new_points);
256  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
257  EDGEPT *inserted_point = it.data();
258  if (seam == nullptr || !seam->UsesPoint(inserted_point)) {
259  for (outline = blob->outlines; outline; outline = outline->next) {
260  if (outline->loop == inserted_point) {
261  outline->loop = outline->loop->next;
262  }
263  }
264  remove_edgept(inserted_point);
265  }
266  }
267 
268  if (seam) {
269  if (seam->priority() > chop_ok_split) {
270  delete seam;
271  seam = nullptr;
272  }
273 #ifndef GRAPHICS_DISABLED
274  else if (wordrec_display_splits) {
275  seam->Mark(edge_window);
276  if (chop_debug > 2) {
277  update_edge_window();
278  edge_window_wait();
279  }
280  }
281 #endif
282  }
283 
284  if (chop_debug)
285  wordrec_display_splits.set_value(false);
286 
287  return (seam);
288 }
void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:336
#define update_edge_window()
Definition: plotedges.h:44
TESSLINE * next
Definition: blobs.h:260
#define MAX_NUM_SEAMS
Definition: findseam.cpp:49
int chop_seam_pile_size
Definition: wordrec.h:214
EDGEPT * next
Definition: blobs.h:171
TESSLINE * outlines
Definition: blobs.h:379
void prioritize_points(TESSLINE *outline, PointHeap *points)
Definition: chop.cpp:154
EDGEPT * loop
Definition: blobs.h:259
#define MAX_NUM_POINTS
Definition: chop.h:33
void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:298
float priority() const
Definition: seam.h:59
ScrollView * edge_window
Definition: plotedges.cpp:35
double chop_good_split
Definition: wordrec.h:226
void draw_blob_edges(TBLOB *blob)
Definition: plotedges.cpp:69
#define BAD_PRIORITY
Definition: findseam.cpp:53
bool UsesPoint(const EDGEPT *point) const
Definition: seam.h:82
Definition: blobs.h:78
Definition: seam.h:38
void remove_edgept(EDGEPT *point)
Definition: split.cpp:200
double chop_ok_split
Definition: wordrec.h:225
bool wordrec_display_splits
Definition: split.cpp:41
#define edge_window_wait()
Definition: plotedges.h:56
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:105
void Mark(ScrollView *window) const
Definition: seam.cpp:180
PRIORITY tesseract::Wordrec::point_priority ( EDGEPT *  point)

Definition at line 47 of file chop.cpp.

47  {
48  return static_cast<PRIORITY>(angle_change(point->prev, point, point->next));
49 }
int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
Definition: chop.cpp:81
EDGEPT * next
Definition: blobs.h:171
float PRIORITY
Definition: seam.h:36
EDGEPT * prev
Definition: blobs.h:172
void tesseract::Wordrec::prioritize_points ( TESSLINE *  outline,
PointHeap *  points 
)

Definition at line 154 of file chop.cpp.

154  {
155  EDGEPT *this_point;
156  EDGEPT *local_min = nullptr;
157  EDGEPT *local_max = nullptr;
158 
159  this_point = outline->loop;
160  local_min = this_point;
161  local_max = this_point;
162  do {
163  if (this_point->vec.y < 0) {
164  /* Look for minima */
165  if (local_max != nullptr)
166  new_max_point(local_max, points);
167  else if (is_inside_angle (this_point))
168  add_point_to_list(points, this_point);
169  local_max = nullptr;
170  local_min = this_point->next;
171  }
172  else if (this_point->vec.y > 0) {
173  /* Look for maxima */
174  if (local_min != nullptr)
175  new_min_point(local_min, points);
176  else if (is_inside_angle (this_point))
177  add_point_to_list(points, this_point);
178  local_min = nullptr;
179  local_max = this_point->next;
180  }
181  else {
182  /* Flat area */
183  if (local_max != nullptr) {
184  if (local_max->prev->vec.y != 0) {
185  new_max_point(local_max, points);
186  }
187  local_max = this_point->next;
188  local_min = nullptr;
189  }
190  else {
191  if (local_min->prev->vec.y != 0) {
192  new_min_point(local_min, points);
193  }
194  local_min = this_point->next;
195  local_max = nullptr;
196  }
197  }
198 
199  /* Next point */
200  this_point = this_point->next;
201  }
202  while (this_point != outline->loop);
203 }
EDGEPT * next
Definition: blobs.h:171
EDGEPT * loop
Definition: blobs.h:259
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:57
int16_t y
Definition: blobs.h:74
Definition: blobs.h:78
void new_min_point(EDGEPT *local_min, PointHeap *points)
Definition: chop.cpp:213
bool is_inside_angle(EDGEPT *pt)
Definition: chop.cpp:71
void new_max_point(EDGEPT *local_max, PointHeap *points)
Definition: chop.cpp:237
VECTOR vec
Definition: blobs.h:166
EDGEPT * prev
Definition: blobs.h:172
void tesseract::Wordrec::ProcessSegSearchPainPoint ( float  pain_point_priority,
const MATRIX_COORD &  pain_point,
const char *  pain_point_type,
GenericVector< SegSearchPending > *  pending,
WERD_RES *  word_res,
LMPainPoints *  pain_points,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 248 of file segsearch.cpp.

252  {
253  if (segsearch_debug_level > 0) {
254  tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
255  pain_point_type, pain_point_priority,
256  pain_point.col, pain_point.row);
257  }
258  ASSERT_HOST(pain_points != nullptr);
259  MATRIX *ratings = word_res->ratings;
260  // Classify blob [pain_point.col pain_point.row]
261  if (!pain_point.Valid(*ratings)) {
262  ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
263  }
264  ASSERT_HOST(pain_point.Valid(*ratings));
265  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
266  pain_point.col, pain_point.row,
267  pain_point_type,
268  word_res->chopped_word,
269  blamer_bundle);
270  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
271  if (lst == nullptr) {
272  ratings->put(pain_point.col, pain_point.row, classified);
273  } else {
274  // We can not delete old BLOB_CHOICEs, since they might contain
275  // ViterbiStateEntries that are parents of other "active" entries.
276  // Thus if the matrix cell already contains classifications we add
277  // the new ones to the beginning of the list.
278  BLOB_CHOICE_IT it(lst);
279  it.add_list_before(classified);
280  delete classified; // safe to delete, since empty after add_list_before()
281  classified = nullptr;
282  }
283 
284  if (segsearch_debug_level > 0) {
285  print_ratings_list("Updated ratings matrix with a new entry:",
286  ratings->get(pain_point.col, pain_point.row),
287  getDict().getUnicharset());
288  ratings->print(getDict().getUnicharset());
289  }
290 
291  // Insert initial "pain points" to join the newly classified blob
292  // with its left and right neighbors.
293  if (classified != nullptr && !classified->empty()) {
294  if (pain_point.col > 0) {
295  pain_points->GeneratePainPoint(
296  pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
297  true, segsearch_max_char_wh_ratio, word_res);
298  }
299  if (pain_point.row + 1 < ratings->dimension()) {
300  pain_points->GeneratePainPoint(
301  pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
302  true, segsearch_max_char_wh_ratio, word_res);
303  }
304  }
305  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
306 }
virtual Dict & getDict()
Definition: classify.h:107
GenericVector< SEAM * > seam_array
Definition: pageres.h:216
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
T get(ICOORD pos) const
Definition: matrix.h:231
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:833
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:50
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
Definition: matrix.h:578
MATRIX * ratings
Definition: pageres.h:230
#define ASSERT_HOST(x)
Definition: errcode.h:88
int dimension() const
Definition: matrix.h:536
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
TWERD * chopped_word
Definition: pageres.h:214
bool Valid(const MATRIX &m) const
Definition: matrix.h:618
int segsearch_debug_level
Definition: wordrec.h:238
void tesseract::Wordrec::program_editdown ( int32_t  elasped_time)

Definition at line 75 of file tface.cpp.

75  {
76 #ifndef DISABLED_LEGACY_ENGINE
77  EndAdaptiveClassifier();
78 #endif // ndef DISABLED_LEGACY_ENGINE
79  getDict().End();
80 }
virtual Dict & getDict()
Definition: classify.h:107
void End()
Definition: dict.cpp:381
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:459
void tesseract::Wordrec::program_editup ( const char *  textbase,
TessdataManager *  init_classifier,
TessdataManager *  init_dict 
)

Definition at line 40 of file tface.cpp.

42  {
43  if (textbase != nullptr) imagefile = textbase;
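44 #ifndef DISABLED_LEGACY_ENGINE
45  InitFeatureDefs(&feature_defs_);
46  InitAdaptiveClassifier(init_classifier);
47  if (init_dict) {
48  getDict().SetupForLoad(Dict::GlobalDawgCache());
49  getDict().Load(lang, init_dict);
50  getDict().FinishLoad();
51  }
52  pass2_ok_split = chop_ok_split;
53 #endif // ndef DISABLED_LEGACY_ENGINE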
54 }
virtual Dict & getDict()
Definition: classify.h:107
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:547
bool FinishLoad()
Definition: dict.cpp:360
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:201
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:219
double chop_ok_split
Definition: wordrec.h:225
STRING lang
Definition: ccutil.h:69
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:527
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:193
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:112
STRING imagefile
Definition: ccutil.h:73
PRIORITY pass2_ok_split
Definition: wordrec.h:477
void tesseract::Wordrec::ResetNGramSearch ( WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
GenericVector< SegSearchPending > *  pending 
)
protected

Definition at line 311 of file segsearch.cpp.

313  {
314  // TODO(rays) More refactoring required here.
315  // Delete existing viterbi states.
316  for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {
317  best_choice_bundle->beam[col]->Clear();
318  }
319  // Reset best_choice_bundle.
320  word_res->ClearWordChoices();
321  best_choice_bundle->best_vse = nullptr;
322  // Clear out all existing pendings and add a new one for the first column.
323  (*pending)[0].SetColumnClassified();
324  for (int i = 1; i < pending->size(); ++i)
325  (*pending)[i].Clear();
326 }
void ClearWordChoices()
Definition: pageres.cpp:1176
int size() const
Definition: genericvector.h:70
void tesseract::Wordrec::SaveAltChoices ( const LIST &  best_choices,
WERD_RES *  word 
)
void tesseract::Wordrec::SegSearch ( WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

Definition at line 42 of file segsearch.cpp.

44  {
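45  LMPainPoints pain_points(segsearch_max_pain_points,
46  segsearch_max_char_wh_ratio,
47  assume_fixed_pitch_char_segment,
48  &getDict(), segsearch_debug_level);
49  // Compute scaling factor that will help us recover blob outline length
50  // from classifier rating and certainty for the blob.
51  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
52  GenericVector<SegSearchPending> pending;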
53  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
54  blamer_bundle);
55 
56  if (!SegSearchDone(0)) { // find a better choice
57  if (chop_enable && word_res->chopped_word != nullptr) {
58  improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
59  blamer_bundle, &pain_points, &pending);
60  }
61  if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);
62 
63  if (blamer_bundle != nullptr &&
64  !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
65  blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
66  }
67  }
68  // Keep trying to find a better path by fixing the "pain points".
69 
70  MATRIX_COORD pain_point;
71  float pain_point_priority;
72  int num_futile_classifications = 0;
73  STRING blamer_debug;
74  while (wordrec_enable_assoc &&
75  (!SegSearchDone(num_futile_classifications) ||
76  (blamer_bundle != nullptr &&
77  blamer_bundle->GuidedSegsearchStillGoing()))) {
78  // Get the next valid "pain point".
79  bool found_nothing = true;
80  LMPainPointsType pp_type;
81  while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
82  LM_PPTYPE_NUM) {
83  if (!pain_point.Valid(*word_res->ratings)) {
84  word_res->ratings->IncreaseBandSize(
85  pain_point.row - pain_point.col + 1);
86  }
87  if (pain_point.Valid(*word_res->ratings) &&
88  !word_res->ratings->Classified(pain_point.col, pain_point.row,
89  getDict().WildcardID())) {
90  found_nothing = false;
91  break;
92  }
93  }
94  if (found_nothing) {
95  if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
96  break;
97  }
98  ProcessSegSearchPainPoint(pain_point_priority, pain_point,
99  LMPainPoints::PainPointDescription(pp_type),
100  &pending, word_res, &pain_points, blamer_bundle);
101 
102  UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
103  word_res, &pain_points, best_choice_bundle,
104  blamer_bundle);
105  if (!best_choice_bundle->updated) ++num_futile_classifications;
106 
107  if (segsearch_debug_level > 0) {
108  tprintf("num_futile_classifications %d\n", num_futile_classifications);
109  }
110 
111  best_choice_bundle->updated = false; // reset updated
112 
113  // See if it's time to terminate SegSearch or time for starting a guided
114  // search for the true path to find the blame for the incorrect best_choice.
115  if (SegSearchDone(num_futile_classifications) &&
116  blamer_bundle != nullptr &&
117  blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
118  InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
119  &blamer_debug);
120  }
121  } // end while loop exploring alternative paths
122  if (blamer_bundle != nullptr) {
123  blamer_bundle->FinishSegSearch(word_res->best_choice,
124  wordrec_debug_blamer, &blamer_debug);
125  }
126 
127  if (segsearch_debug_level > 0) {
128  tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
129  language_model_->AcceptableChoiceFound());
130  }
131 }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
bool wordrec_enable_assoc
Definition: wordrec.h:199
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:136
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
Definition: chopper.cpp:454
int segsearch_max_pain_points
Definition: wordrec.h:240
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:491
Definition: strngs.h:45
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
Definition: blamer.cpp:514
static const char * PainPointDescription(LMPainPointsType type)
virtual Dict & getDict()
Definition: classify.h:107
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:230
bool Classified(int col, int row, int wildcard_id) const
Definition: matrix.cpp:36
GenericVector< SEAM * > seam_array
Definition: pageres.h:216
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:466
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:180
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:248
WERD_CHOICE * best_choice
Definition: pageres.h:234
MATRIX * ratings
Definition: pageres.h:230
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
TWERD * chopped_word
Definition: pageres.h:214
bool Valid(const MATRIX &m) const
Definition: matrix.h:618
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:509
bool wordrec_debug_blamer
Definition: wordrec.h:236
int segsearch_debug_level
Definition: wordrec.h:238
static void PrintSeams(const char *label, const GenericVector< SEAM * > &seams)
Definition: seam.cpp:167
void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
Definition: segsearch.cpp:328
double certainty_scale
Definition: dict.h:617
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:116
bool tesseract::Wordrec::SegSearchDone ( int  num_futile_classifications)
inline protected

Definition at line 491 of file wordrec.h.

491  {
492  return (language_model_->AcceptableChoiceFound() ||
493  num_futile_classifications >=
494  segsearch_max_futile_classifications);
495  }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
int segsearch_max_futile_classifications
Definition: wordrec.h:242
int tesseract::Wordrec::select_blob_to_split ( const GenericVector< BLOB_CHOICE * > &  blob_choices,
float  rating_ceiling,
bool  split_next_to_fragment 
)

Definition at line 538 of file chopper.cpp.

540  {
541  BLOB_CHOICE *blob_choice;
542  int x;
543  float worst = -FLT_MAX;
544  int worst_index = -1;
545  float worst_near_fragment = -FLT_MAX;
546  int worst_index_near_fragment = -1;
547  const CHAR_FRAGMENT **fragments = nullptr;
548 
549  if (chop_debug) {
550  if (rating_ceiling < FLT_MAX)
551  tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
552  else
553  tprintf("rating_ceiling = No Limit\n");
554  }
555 
556  if (split_next_to_fragment && blob_choices.size() > 0) {
557  fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
558  if (blob_choices[0] != nullptr) {
559  fragments[0] = getDict().getUnicharset().get_fragment(
560  blob_choices[0]->unichar_id());
561  } else {
562  fragments[0] = nullptr;
563  }
564  }
565 
566  for (x = 0; x < blob_choices.size(); ++x) {
567  if (blob_choices[x] == nullptr) {
568  delete[] fragments;
569  return x;
570  } else {
571  blob_choice = blob_choices[x];
572  // Populate fragments for the following position.
573  if (split_next_to_fragment && x+1 < blob_choices.size()) {
574  if (blob_choices[x + 1] != nullptr) {
575  fragments[x + 1] = getDict().getUnicharset().get_fragment(
576  blob_choices[x + 1]->unichar_id());
577  } else {
578  fragments[x + 1] = nullptr;
579  }
580  }
581  if (blob_choice->rating() < rating_ceiling &&
582  blob_choice->certainty() < tessedit_certainty_threshold) {
583  // Update worst and worst_index.
584  if (blob_choice->rating() > worst) {
585  worst_index = x;
586  worst = blob_choice->rating();
587  }
588  if (split_next_to_fragment) {
589  // Update worst_near_fragment and worst_index_near_fragment.
590  bool expand_following_fragment =
591  (x + 1 < blob_choices.size() &&
592  fragments[x+1] != nullptr && !fragments[x+1]->is_beginning());
593  bool expand_preceding_fragment =
594  (x > 0 && fragments[x-1] != nullptr && !fragments[x-1]->is_ending());
595  if ((expand_following_fragment || expand_preceding_fragment) &&
596  blob_choice->rating() > worst_near_fragment) {
597  worst_index_near_fragment = x;
598  worst_near_fragment = blob_choice->rating();
599  if (chop_debug) {
600  tprintf("worst_index_near_fragment=%d"
601  " expand_following_fragment=%d"
602  " expand_preceding_fragment=%d\n",
603  worst_index_near_fragment,
604  expand_following_fragment,
605  expand_preceding_fragment);
606  }
607  }
608  }
609  }
610  }
611  }
612  delete[] fragments;
613  // TODO(daria): maybe a threshold of badness for
614  // worst_near_fragment would be useful.
615  return worst_index_near_fragment != -1 ?
616  worst_index_near_fragment : worst_index;
617 }
int length() const
Definition: genericvector.h:84
float rating() const
Definition: ratngs.h:80
virtual Dict & getDict()
Definition: classify.h:107
bool is_ending() const
Definition: unicharset.h:108
double tessedit_certainty_threshold
Definition: wordrec.h:207
float certainty() const
Definition: ratngs.h:83
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int size() const
Definition: genericvector.h:70
bool is_beginning() const
Definition: unicharset.h:105
int tesseract::Wordrec::select_blob_to_split_from_fixpt ( DANGERR *  fixpt)

Definition at line 626 of file chopper.cpp.

626  {
627  if (!fixpt)
628  return -1;
629  for (int i = 0; i < fixpt->size(); i++) {
630  if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end &&
631  (*fixpt)[i].dangerous &&
632  (*fixpt)[i].correct_is_ngram) {
633  return (*fixpt)[i].begin;
634  }
635  }
636  return -1;
637 }
int size() const
Definition: genericvector.h:70
void tesseract::Wordrec::set_pass1 ( )

Definition at line 101 of file tface.cpp.

101  {
102  chop_ok_split.set_value(70.0);
103  language_model_->getParamsModel().SetPass(ParamsModel::PTRAIN_PASS1);
104  SettupPass1();
105 }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
double chop_ok_split
Definition: wordrec.h:225
void tesseract::Wordrec::set_pass2 ( )

Definition at line 113 of file tface.cpp.

113  {
114  chop_ok_split.set_value(pass2_ok_split);
115  language_model_->getParamsModel().SetPass(ParamsModel::PTRAIN_PASS2);
116  SettupPass2();
117 }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
double chop_ok_split
Definition: wordrec.h:225
PRIORITY pass2_ok_split
Definition: wordrec.h:477
void tesseract::Wordrec::try_point_pairs ( EDGEPT *  points[MAX_NUM_POINTS],
int16_t  num_points,
SeamQueue *  seam_queue,
SeamPile *  seam_pile,
SEAM **  seam,
TBLOB *  blob 
)

Definition at line 298 of file findseam.cpp.

303  {
304  int16_t x;
305  int16_t y;
306  PRIORITY priority;
307 
308  for (x = 0; x < num_points; x++) {
309  for (y = x + 1; y < num_points; y++) {
310  if (points[y] &&
311  points[x]->WeightedDistance(*points[y], chop_x_y_weight) <
312  chop_split_length &&
313  points[x] != points[y]->next && points[y] != points[x]->next &&
314  !is_exterior_point(points[x], points[y]) &&
315  !is_exterior_point(points[y], points[x])) {
316  SPLIT split(points[x], points[y]);
317  priority = partial_split_priority(&split);
318 
319  choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);
320  }
321  }
322  }
323 }
Definition: split.h:37
#define partial_split_priority(split)
Definition: findseam.cpp:41
float PRIORITY
Definition: seam.h:36
#define is_exterior_point(edge, point)
Definition: outlines.h:93
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:105
void tesseract::Wordrec::try_vertical_splits ( EDGEPT *  points[MAX_NUM_POINTS],
int16_t  num_points,
EDGEPT_CLIST *  new_points,
SeamQueue *  seam_queue,
SeamPile *  seam_pile,
SEAM **  seam,
TBLOB *  blob 
)

Definition at line 336 of file findseam.cpp.

342  {
343  EDGEPT *vertical_point = nullptr;
344  int16_t x;
345  PRIORITY priority;
346  TESSLINE *outline;
347 
348  for (x = 0; x < num_points; x++) {
349  vertical_point = nullptr;
350  for (outline = blob->outlines; outline; outline = outline->next) {
351  vertical_projection_point(points[x], outline->loop,
352  &vertical_point, new_points);
353  }
354 
355  if (vertical_point && points[x] != vertical_point->next &&
356  vertical_point != points[x]->next &&
357  points[x]->WeightedDistance(*vertical_point, chop_x_y_weight) <
358  chop_split_length) {
359  SPLIT split(points[x], vertical_point);
360  priority = partial_split_priority(&split);
361  choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);
362  }
363  }
364 }
Definition: split.h:37
TESSLINE * next
Definition: blobs.h:260
#define partial_split_priority(split)
Definition: findseam.cpp:41
void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
Definition: chop.cpp:266
EDGEPT * next
Definition: blobs.h:171
TESSLINE * outlines
Definition: blobs.h:379
float PRIORITY
Definition: seam.h:36
EDGEPT * loop
Definition: blobs.h:259
Definition: blobs.h:78
int WeightedDistance(const EDGEPT &other, int x_factor) const
Definition: blobs.h:101
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:105
void tesseract::Wordrec::UpdateSegSearchNodes ( float  rating_cert_scale,
int  starting_col,
GenericVector< SegSearchPending > *  pending,
WERD_RES *  word_res,
LMPainPoints *  pain_points,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 180 of file segsearch.cpp.

187  {
188  MATRIX *ratings = word_res->ratings;
189  ASSERT_HOST(ratings->dimension() == pending->size());
190  ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
191  for (int col = starting_col; col < ratings->dimension(); ++col) {
192  if (!(*pending)[col].WorkToDo()) continue;
193  int first_row = col;
194  int last_row = std::min(ratings->dimension() - 1,
195  col + ratings->bandwidth() - 1);
196  if ((*pending)[col].SingleRow() >= 0) {
197  first_row = last_row = (*pending)[col].SingleRow();
198  }
199  if (segsearch_debug_level > 0) {
200  tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
201  col, first_row, last_row,
202  (*pending)[col].IsRowJustClassified(INT32_MAX));
203  }
204  // Iterate over the pending list for this column.
205  for (int row = first_row; row <= last_row; ++row) {
206  // Update language model state of this child+parent pair.
207  BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
208  LanguageModelState *parent_node =
209  col == 0 ? nullptr : best_choice_bundle->beam[col - 1];
210  if (current_node != nullptr &&
211  language_model_->UpdateState((*pending)[col].IsRowJustClassified(row),
212  col, row, current_node, parent_node,
213  pain_points, word_res,
214  best_choice_bundle, blamer_bundle) &&
215  row + 1 < ratings->dimension()) {
216  // Since the language model state of this entry changed, process all
217  // the child column.
218  (*pending)[row + 1].RevisitWholeColumn();
219  if (segsearch_debug_level > 0) {
220  tprintf("Added child col=%d to pending\n", row + 1);
221  }
222  } // end if UpdateState.
223  } // end for row.
224  } // end for col.
225  if (best_choice_bundle->best_vse != nullptr) {
226  ASSERT_HOST(word_res->StatesAllValid());
227  if (best_choice_bundle->best_vse->updated) {
228  pain_points->GenerateFromPath(rating_cert_scale,
229  best_choice_bundle->best_vse, word_res);
230  if (!best_choice_bundle->fixpt.empty()) {
231  pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
232  best_choice_bundle->best_vse, word_res);
233  }
234  }
235  }
236  // The segsearch is completed. Reset all updated flags on all VSEs and reset
237  // all pendings.
238  for (int col = 0; col < pending->size(); ++col) {
239  (*pending)[col].Clear();
240  ViterbiStateEntry_IT
241  vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
242  for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
243  vse_it.data()->updated = false;
244  }
245  }
246 }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
T get(ICOORD pos) const
Definition: matrix.h:231
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
Definition: matrix.h:578
MATRIX * ratings
Definition: pageres.h:230
#define ASSERT_HOST(x)
Definition: errcode.h:88
int dimension() const
Definition: matrix.h:536
int bandwidth() const
Definition: matrix.h:538
int segsearch_debug_level
Definition: wordrec.h:238
int size() const
Definition: genericvector.h:70
void tesseract::Wordrec::vertical_projection_point ( EDGEPT *  split_point,
EDGEPT *  target_point,
EDGEPT **  best_point,
EDGEPT_CLIST *  new_points 
)

Definition at line 266 of file chop.cpp.

268  {
269  EDGEPT *p; /* Iterator */
270  EDGEPT *this_edgept; /* Iterator */
271  EDGEPT_C_IT new_point_it(new_points);
272  int x = split_point->pos.x; /* X value of vertical */
273  int best_dist = LARGE_DISTANCE;/* Best point found */
274 
275  if (*best_point != nullptr)
276  best_dist = edgept_dist(split_point, *best_point);
277 
278  p = target_point;
279  /* Look at each edge point */
280  do {
281  if (((p->pos.x <= x && x <= p->next->pos.x) ||
282  (p->next->pos.x <= x && x <= p->pos.x)) &&
283  !same_point(split_point->pos, p->pos) &&
284  !same_point(split_point->pos, p->next->pos) &&
285  !p->IsChopPt() &&
286  (*best_point == nullptr || !same_point((*best_point)->pos, p->pos))) {
287 
288  if (near_point(split_point, p, p->next, &this_edgept)) {
289  new_point_it.add_before_then_move(this_edgept);
290  }
291 
292  if (*best_point == nullptr)
293  best_dist = edgept_dist (split_point, this_edgept);
294 
295  this_edgept =
296  pick_close_point(split_point, this_edgept, &best_dist);
297  if (this_edgept)
298  *best_point = this_edgept;
299  }
300 
301  p = p->next;
302  }
303  while (p != target_point);
304 }
int16_t x
Definition: blobs.h:73
EDGEPT * next
Definition: blobs.h:171
TPOINT pos
Definition: blobs.h:165
#define same_point(p1, p2)
Definition: outlines.h:45
#define LARGE_DISTANCE
Definition: outlines.h:32
bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
Definition: outlines.cpp:40
Definition: blobs.h:78
EDGEPT * pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
Definition: chop.cpp:116
#define edgept_dist(p1, p2)
Definition: outlines.h:83
bool IsChopPt() const
Definition: blobs.h:161

Member Data Documentation

bool tesseract::Wordrec::assume_fixed_pitch_char_segment = false

"include fixed-pitch heuristics in char segmentation"

Definition at line 230 of file wordrec.h.

GenericVector<int> tesseract::Wordrec::blame_reasons_

Definition at line 483 of file wordrec.h.

double tesseract::Wordrec::chop_center_knob = 0.15

"Split center adjustment"

Definition at line 220 of file wordrec.h.

int tesseract::Wordrec::chop_centered_maxwidth = 90

"Width of (smaller) chopped blobs above which we don't care that a chop is not near the center."

Definition at line 222 of file wordrec.h.

int tesseract::Wordrec::chop_debug = 0

"Chop debug"

Definition at line 208 of file wordrec.h.

bool tesseract::Wordrec::chop_enable = 1

"Chop enable"

Definition at line 209 of file wordrec.h.

double tesseract::Wordrec::chop_good_split = 50.0

"Good split limit"

Definition at line 226 of file wordrec.h.

int tesseract::Wordrec::chop_inside_angle = -50

"Min Inside Angle Bend"

Definition at line 216 of file wordrec.h.

int tesseract::Wordrec::chop_min_outline_area = 2000

"Min Outline Area"

Definition at line 217 of file wordrec.h.

int tesseract::Wordrec::chop_min_outline_points = 6

"Min Number of Points on Outline"

Definition at line 213 of file wordrec.h.

bool tesseract::Wordrec::chop_new_seam_pile = 1

"Use new seam_pile"

Definition at line 215 of file wordrec.h.

double tesseract::Wordrec::chop_ok_split = 100.0

"OK split limit"

Definition at line 225 of file wordrec.h.

double tesseract::Wordrec::chop_overlap_knob = 0.9

"Split overlap adjustment"

Definition at line 219 of file wordrec.h.

int tesseract::Wordrec::chop_same_distance = 2

"Same distance"

Definition at line 212 of file wordrec.h.

int tesseract::Wordrec::chop_seam_pile_size = 150

"Max number of seams in seam_pile"

Definition at line 214 of file wordrec.h.

double tesseract::Wordrec::chop_sharpness_knob = 0.06

"Split sharpness adjustment"

Definition at line 223 of file wordrec.h.

double tesseract::Wordrec::chop_split_dist_knob = 0.5

"Split length adjustment"

Definition at line 218 of file wordrec.h.

int tesseract::Wordrec::chop_split_length = 10000

"Split Length"

Definition at line 211 of file wordrec.h.

bool tesseract::Wordrec::chop_vertical_creep = 0

"Vertical creep"

Definition at line 210 of file wordrec.h.

double tesseract::Wordrec::chop_width_change_knob = 5.0

"Width change adjustment"

Definition at line 224 of file wordrec.h.

int tesseract::Wordrec::chop_x_y_weight = 3

"X / Y length weight"

Definition at line 227 of file wordrec.h.

void(Wordrec::* tesseract::Wordrec::fill_lattice_) (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

Definition at line 485 of file wordrec.h.

bool tesseract::Wordrec::force_word_assoc = false

"force associator to run regardless of what enable_assoc is. This is used for CJK where component grouping is necessary."

Definition at line 202 of file wordrec.h.

bool tesseract::Wordrec::fragments_guide_chopper = false

"Use information from fragments to guide chopping process"

Definition at line 205 of file wordrec.h.

std::unique_ptr<LanguageModel> tesseract::Wordrec::language_model_

Definition at line 476 of file wordrec.h.

bool tesseract::Wordrec::merge_fragments_in_matrix = true

"Merge the fragments in the ratings matrix and delete them after merging"

Definition at line 197 of file wordrec.h.

PRIORITY tesseract::Wordrec::pass2_ok_split

Definition at line 477 of file wordrec.h.

WERD_CHOICE* tesseract::Wordrec::prev_word_best_choice_

Definition at line 481 of file wordrec.h.

int tesseract::Wordrec::repair_unchopped_blobs = 1

"Fix blobs that aren't chopped"

Definition at line 206 of file wordrec.h.

bool tesseract::Wordrec::save_alt_choices = true

"Save alternative paths found during chopping and segmentation search"

Definition at line 247 of file wordrec.h.

int tesseract::Wordrec::segment_adjust_debug = 0

"Segmentation adjustment debug"

Definition at line 228 of file wordrec.h.

int tesseract::Wordrec::segsearch_debug_level = 0

"SegSearch debug level"

Definition at line 238 of file wordrec.h.

double tesseract::Wordrec::segsearch_max_char_wh_ratio = 2.0

"Maximum character width-to-height ratio"

Definition at line 244 of file wordrec.h.

int tesseract::Wordrec::segsearch_max_futile_classifications = 10

"Maximum number of pain point classifications per word."

Definition at line 242 of file wordrec.h.

int tesseract::Wordrec::segsearch_max_pain_points = 2000

"Maximum number of pain points stored in the queue"

Definition at line 240 of file wordrec.h.

double tesseract::Wordrec::tessedit_certainty_threshold = -2.25

"Good blob limit"

Definition at line 207 of file wordrec.h.

bool tesseract::Wordrec::wordrec_debug_blamer = false

"Print blamer debug messages"

Definition at line 236 of file wordrec.h.

int tesseract::Wordrec::wordrec_debug_level = 0

"Debug level for wordrec"

Definition at line 231 of file wordrec.h.

bool tesseract::Wordrec::wordrec_enable_assoc = true

"Associator Enable"

Definition at line 199 of file wordrec.h.

int tesseract::Wordrec::wordrec_max_join_chunks = 4

"Max number of broken pieces to associate"

Definition at line 233 of file wordrec.h.

bool tesseract::Wordrec::wordrec_no_block = false

"Don't output block information"

Definition at line 198 of file wordrec.h.

bool tesseract::Wordrec::wordrec_run_blamer = false

"Try to set the blame for errors"

Definition at line 237 of file wordrec.h.

bool tesseract::Wordrec::wordrec_skip_no_truth_words = false

"Only run OCR for words that had truth recorded in BlamerBundle"

Definition at line 235 of file wordrec.h.

double tesseract::Wordrec::wordrec_worst_state = 1

"Worst segmentation state"

Definition at line 203 of file wordrec.h.


The documentation for this class was generated from the following files: