tesseract 3.04.01

tesseract::Classify Class Reference

#include <classify.h>

Inheritance diagram for tesseract::Classify:
tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Wordrec tesseract::Tesseract

List of all members.

Public Member Functions

 Classify ()
virtual ~Classify ()
DictgetDict ()
const ShapeTableshape_table () const
void SetStaticClassifier (ShapeClassifier *static_classifier)
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
bool LargeSpeckle (const TBLOB &blob)
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
void ReadNewCutoffs (FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
ADAPT_TEMPLATES ReadAdaptedTemplates (FILE *File)
FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
void FreeNormProtos ()
NORM_PROTOSReadNormProtos (FILE *File, inT64 end_offset)
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
void LearnWord (const char *fontname, WERD_RES *word)
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
void InitAdaptiveClassifier (bool load_pre_trained_templates)
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8 *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uinT8 *cn_factors)
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
void RemoveBadMatches (ADAPT_RESULTS *Results)
void SetAdaptiveThreshold (FLOAT32 Threshold)
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
int ShapeIDToClassID (int shape_id) const
UNICHAR_IDBaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
UNICHAR_IDGetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
bool AdaptableWord (WERD_RES *word)
void EndAdaptiveClassifier ()
void SettupPass1 ()
void SettupPass2 ()
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
void ClassifyAsNoise (ADAPT_RESULTS *Results)
void ResetAdaptiveClassifierInternal ()
void SwitchAdaptiveClassifier ()
void StartBackupAdaptiveClassifier ()
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uinT8 *pruner_norm_array, uinT8 *char_norm_array)
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
bool AdaptiveClassifierIsFull () const
bool AdaptiveClassifierIsEmpty () const
bool LooksLikeGarbage (TBLOB *blob)
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
void ClearCharNormArray (uinT8 *char_norm_array)
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
INT_TEMPLATES ReadIntTemplates (FILE *File)
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
void ShowMatchDisplay ()
UnicityTable< FontInfo > & get_fontinfo_table ()
const UnicityTable< FontInfo > & get_fontinfo_table () const
UnicityTable< FontSet > & get_fontset_table ()
void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
bool WriteTRFile (const STRING &filename)

Static Public Member Functions

static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)

Public Attributes

bool allow_blob_division = true
bool prioritize_division = FALSE
int tessedit_single_match = FALSE
bool classify_enable_learning = true
int classify_debug_level = 0
int classify_norm_method = character
double classify_char_norm_range = 0.2
double classify_min_norm_scale_x = 0.0
double classify_max_norm_scale_x = 0.325
double classify_min_norm_scale_y = 0.0
double classify_max_norm_scale_y = 0.325
double classify_max_rating_ratio = 1.5
double classify_max_certainty_margin = 5.5
bool tess_cn_matching = 0
bool tess_bn_matching = 0
bool classify_enable_adaptive_matcher = 1
bool classify_use_pre_adapted_templates = 0
bool classify_save_adapted_templates = 0
bool classify_enable_adaptive_debugger = 0
bool classify_nonlinear_norm = 0
int matcher_debug_level = 0
int matcher_debug_flags = 0
int classify_learning_debug_level = 0
double matcher_good_threshold = 0.125
double matcher_reliable_adaptive_result = 0.0
double matcher_perfect_threshold = 0.02
double matcher_bad_match_pad = 0.15
double matcher_rating_margin = 0.1
double matcher_avg_noise_size = 12.0
int matcher_permanent_classes_min = 1
int matcher_min_examples_for_prototyping = 3
int matcher_sufficient_examples_for_prototyping = 5
double matcher_clustering_max_angle_delta = 0.015
double classify_misfit_junk_penalty = 0.0
double rating_scale = 1.5
double certainty_scale = 20.0
double tessedit_class_miss_scale = 0.00390625
double classify_adapted_pruning_factor = 2.5
double classify_adapted_pruning_threshold = -1.0
int classify_adapt_proto_threshold = 230
int classify_adapt_feature_threshold = 230
bool disable_character_fragments = TRUE
double classify_character_fragments_garbage_certainty_threshold = -3.0
bool classify_debug_character_fragments = FALSE
bool matcher_debug_separate_windows = FALSE
char * classify_learn_debug_str = ""
int classify_class_pruner_threshold = 229
int classify_class_pruner_multiplier = 15
int classify_cp_cutoff_strength = 7
int classify_integer_matcher_multiplier = 10
INT_TEMPLATES PreTrainedTemplates
ADAPT_TEMPLATES AdaptedTemplates
ADAPT_TEMPLATES BackupAdaptedTemplates
BIT_VECTOR AllProtosOn
BIT_VECTOR AllConfigsOn
BIT_VECTOR AllConfigsOff
BIT_VECTOR TempProtoMask
bool EnableLearning
NORM_PROTOSNormProtos
UnicityTable< FontInfofontinfo_table_
UnicityTable< FontSetfontset_table_
int il1_adaption_test = 0
bool classify_bln_numeric_mode = 0
double speckle_large_max_size = 0.30
double speckle_rating_penalty = 10.0

Protected Attributes

IntegerMatcher im_
FEATURE_DEFS_STRUCT feature_defs_
ShapeTableshape_table_

Detailed Description

Definition at line 61 of file classify.h.


Constructor & Destructor Documentation

tesseract::Classify::Classify ( )

Definition at line 35 of file classify.cpp.

    : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping",
                  this->params()),
      BOOL_MEMBER(prioritize_division, FALSE,
                  "Prioritize blob division over chopping", this->params()),
      INT_MEMBER(tessedit_single_match, FALSE, "Top choice only from CP",
                 this->params()),
      BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier",
                  this->params()),
      INT_MEMBER(classify_debug_level, 0, "Classify debug level",
                 this->params()),
      INT_MEMBER(classify_norm_method, character, "Normalization Method   ...",
                 this->params()),
      double_MEMBER(classify_char_norm_range, 0.2,
                    "Character Normalization Range ...", this->params()),
      double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...",
                    this->params()), /* PREV DEFAULT 0.1 */
      double_MEMBER(classify_max_norm_scale_x, 0.325,
                    "Max char x-norm scale ...",
                    this->params()), /* PREV DEFAULT 0.3 */
      double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...",
                    this->params()), /* PREV DEFAULT 0.1 */
      double_MEMBER(classify_max_norm_scale_y, 0.325,
                    "Max char y-norm scale ...",
                    this->params()), /* PREV DEFAULT 0.3 */
      double_MEMBER(classify_max_rating_ratio, 1.5,
                    "Veto ratio between classifier ratings", this->params()),
      double_MEMBER(classify_max_certainty_margin, 5.5,
                    "Veto difference between classifier certainties",
                    this->params()),
      BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
                  this->params()),
      BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
                  this->params()),
      BOOL_MEMBER(classify_enable_adaptive_matcher, 1,
                  "Enable adaptive classifier", this->params()),
      BOOL_MEMBER(classify_use_pre_adapted_templates, 0,
                  "Use pre-adapted classifier templates", this->params()),
      BOOL_MEMBER(classify_save_adapted_templates, 0,
                  "Save adapted templates to a file", this->params()),
      BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
                  this->params()),
      BOOL_MEMBER(classify_nonlinear_norm, 0,
                  "Non-linear stroke-density normalization", this->params()),
      INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
      INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
      INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
                 this->params()),
      double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
                    this->params()),
      double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)",
                    this->params()),
      double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
                    this->params()),
      double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
                    this->params()),
      double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
                    this->params()),
      double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
                    this->params()),
      INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
                 this->params()),
      INT_MEMBER(matcher_min_examples_for_prototyping, 3,
                 "Reliable Config Threshold", this->params()),
      INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
                 "Enable adaption even if the ambiguities have not been seen",
                 this->params()),
      double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
                    "Maximum angle delta for prototype clustering",
                    this->params()),
      double_MEMBER(classify_misfit_junk_penalty, 0.0,
                    "Penalty to apply when a non-alnum is vertically out of "
                    "its expected textline position",
                    this->params()),
      double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
      double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
                    this->params()),
      double_MEMBER(tessedit_class_miss_scale, 0.00390625,
                    "Scale factor for features not used", this->params()),
      double_MEMBER(
          classify_adapted_pruning_factor, 2.5,
          "Prune poor adapted results this much worse than best result",
          this->params()),
      double_MEMBER(classify_adapted_pruning_threshold, -1.0,
                    "Threshold at which classify_adapted_pruning_factor starts",
                    this->params()),
      INT_MEMBER(classify_adapt_proto_threshold, 230,
                 "Threshold for good protos during adaptive 0-255",
                 this->params()),
      INT_MEMBER(classify_adapt_feature_threshold, 230,
                 "Threshold for good features during adaptive 0-255",
                 this->params()),
      BOOL_MEMBER(disable_character_fragments, TRUE,
                  "Do not include character fragments in the"
                  " results of the classifier",
                  this->params()),
      double_MEMBER(classify_character_fragments_garbage_certainty_threshold,
                    -3.0,
                    "Exclude fragments that do not look like whole"
                    " characters from training and adaption",
                    this->params()),
      BOOL_MEMBER(classify_debug_character_fragments, FALSE,
                  "Bring up graphical debugging windows for fragments training",
                  this->params()),
      BOOL_MEMBER(matcher_debug_separate_windows, FALSE,
                  "Use two different windows for debugging the matching: "
                  "One for the protos and one for the features.",
                  this->params()),
      STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
                    this->params()),
      INT_MEMBER(classify_class_pruner_threshold, 229,
                 "Class Pruner Threshold 0-255", this->params()),
      INT_MEMBER(classify_class_pruner_multiplier, 15,
                 "Class Pruner Multiplier 0-255:       ", this->params()),
      INT_MEMBER(classify_cp_cutoff_strength, 7,
                 "Class Pruner CutoffStrength:         ", this->params()),
      INT_MEMBER(classify_integer_matcher_multiplier, 10,
                 "Integer Matcher Multiplier  0-255:   ", this->params()),
      EnableLearning(true),
      INT_MEMBER(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word",
                 this->params()),
      BOOL_MEMBER(classify_bln_numeric_mode, 0,
                  "Assume the input is numbers [0-9].", this->params()),
      double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
                    this->params()),
      double_MEMBER(speckle_rating_penalty, 10.0,
                    "Penalty to add to worst rating for noise", this->params()),
      shape_table_(NULL),
      dict_(this),
      static_classifier_(NULL) {
  fontinfo_table_.set_compare_callback(
      NewPermanentTessCallback(CompareFontInfo));
  fontinfo_table_.set_clear_callback(
      NewPermanentTessCallback(FontInfoDeleteCallback));
  fontset_table_.set_compare_callback(
      NewPermanentTessCallback(CompareFontSet));
  fontset_table_.set_clear_callback(
      NewPermanentTessCallback(FontSetDeleteCallback));
  AdaptedTemplates = NULL;
  BackupAdaptedTemplates = NULL;
  PreTrainedTemplates = NULL;
  AllProtosOn = NULL;
  AllConfigsOn = NULL;
  AllConfigsOff = NULL;
  TempProtoMask = NULL;
  NormProtos = NULL;

  NumAdaptationsFailed = 0;

  learn_debug_win_ = NULL;
  learn_fragmented_word_debug_win_ = NULL;
  learn_fragments_debug_win_ = NULL;

  CharNormCutoffs = new uinT16[MAX_NUM_CLASSES];
  BaselineCutoffs = new uinT16[MAX_NUM_CLASSES];
}
tesseract::Classify::~Classify ( ) [virtual]

Definition at line 192 of file classify.cpp.

                    {
  EndAdaptiveClassifier();
  delete learn_debug_win_;
  delete learn_fragmented_word_debug_win_;
  delete learn_fragments_debug_win_;
  delete[] CharNormCutoffs;
  delete[] BaselineCutoffs;
}

Member Function Documentation

bool tesseract::Classify::AdaptableWord ( WERD_RES word)

Return TRUE if the specified word is acceptable for adaptation.

Globals: none

Parameters:
wordcurrent word
Returns:
TRUE or FALSE
Note:
Exceptions: none
History: Thu May 30 14:25:06 1991, DSJ, Created.

Definition at line 850 of file adaptmatch.cpp.

                                           {
  if (word->best_choice == NULL) return false;
  int BestChoiceLength = word->best_choice->length();
  float adaptable_score =
    getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
  return   // rules that apply in general - simplest to compute first
      BestChoiceLength > 0 &&
      BestChoiceLength == word->rebuild_word->NumBlobs() &&
      BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
      // This basically ensures that the word is at least a dictionary match
      // (freq word, user word, system dawg word, etc).
      // Since all the other adjustments will make adjust factor higher
      // than higher than adaptable_score=1.1+0.05=1.15
      // Since these are other flags that ensure that the word is dict word,
      // this check could be at times redundant.
      word->best_choice->adjust_factor() <= adaptable_score &&
      // Make sure that alternative choices are not dictionary words.
      word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
}
void tesseract::Classify::AdaptiveClassifier ( TBLOB Blob,
BLOB_CHOICE_LIST *  Choices 
)

This routine calls the adaptive matcher which returns (in an array) the class id of each class matched.

It also returns the number of classes matched. For each class matched it places the best rating found for that class into the Ratings array.

Bad matches are then removed so that they don't need to be sorted. The remaining good matches are then sorted and converted to choices.

This routine also performs some simple speckle filtering.

Note:
Exceptions: none
History: Mon Mar 11 10:00:58 1991, DSJ, Created.
Parameters:
Blobblob to be classified
[out]ChoicesList of choices found by adaptive matcher. filled on return with the choices found by the class pruner and the ratings therefrom. Also contains the detailed results of the integer matcher.

Definition at line 185 of file adaptmatch.cpp.

                                                                        {
  assert(Choices != NULL);
  ADAPT_RESULTS *Results = new ADAPT_RESULTS;
  Results->Initialize();

  ASSERT_HOST(AdaptedTemplates != NULL);

  DoAdaptiveMatch(Blob, Results);

  RemoveBadMatches(Results);
  Results->match.sort(&UnicharRating::SortDescendingRating);
  RemoveExtraPuncs(Results);
  Results->ComputeBest();
  ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results,
                          Choices);

  // TODO(rays) Move to before ConvertMatchesToChoices!
  if (LargeSpeckle(*Blob) || Choices->length() == 0)
    AddLargeSpeckleTo(Results->BlobLength, Choices);

  if (matcher_debug_level >= 1) {
    tprintf("AD Matches =  ");
    PrintAdaptiveMatchResults(*Results);
  }

#ifndef GRAPHICS_DISABLED
  if (classify_enable_adaptive_debugger)
    DebugAdaptiveClassifier(Blob, Results);
#endif

  delete Results;
}                                /* AdaptiveClassifier */
bool tesseract::Classify::AdaptiveClassifierIsEmpty ( ) const [inline]

Definition at line 285 of file classify.h.

                                         {
    return AdaptedTemplates->NumPermClasses == 0;
  }
bool tesseract::Classify::AdaptiveClassifierIsFull ( ) const [inline]

Definition at line 284 of file classify.h.

{ return NumAdaptationsFailed > 0; }
void tesseract::Classify::AdaptToChar ( TBLOB Blob,
CLASS_ID  ClassId,
int  FontinfoId,
FLOAT32  Threshold,
ADAPT_TEMPLATES  adaptive_templates 
)
Parameters:
Blobblob to add to templates for ClassId
ClassIdclass to add blob to
FontinfoIdfont information from pre-trained templates
Thresholdminimum match rating to existing template
adaptive_templatescurrent set of adapted templates

Globals:

  • AllProtosOn dummy mask to match against all protos
  • AllConfigsOn dummy mask to match against all configs
Returns:
none
Note:
Exceptions: none
History: Thu Mar 14 09:36:03 1991, DSJ, Created.

Definition at line 886 of file adaptmatch.cpp.

                                                               {
  int NumFeatures;
  INT_FEATURE_ARRAY IntFeatures;
  UnicharRating int_result;
  INT_CLASS IClass;
  ADAPT_CLASS Class;
  TEMP_CONFIG TempConfig;
  FEATURE_SET FloatFeatures;
  int NewTempConfigId;

  if (!LegalClassId (ClassId))
    return;

  int_result.unichar_id = ClassId;
  Class = adaptive_templates->Class[ClassId];
  assert(Class != NULL);
  if (IsEmptyAdaptedClass(Class)) {
    InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
  } else {
    IClass = ClassForClassId(adaptive_templates->Templates, ClassId);

    NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
    if (NumFeatures <= 0)
      return;

    // Only match configs with the matching font.
    BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
    for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
      if (GetFontinfoId(Class, cfg) == FontinfoId) {
        SET_BIT(MatchingFontConfigs, cfg);
      } else {
        reset_bit(MatchingFontConfigs, cfg);
      }
    }
    im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
              NumFeatures, IntFeatures,
              &int_result, classify_adapt_feature_threshold,
              NO_DEBUG, matcher_debug_separate_windows);
    FreeBitVector(MatchingFontConfigs);

    SetAdaptiveThreshold(Threshold);

    if (1.0f - int_result.rating <= Threshold) {
      if (ConfigIsPermanent(Class, int_result.config)) {
        if (classify_learning_debug_level >= 1)
          tprintf("Found good match to perm config %d = %4.1f%%.\n",
                  int_result.config, int_result.rating * 100.0);
        FreeFeatureSet(FloatFeatures);
        return;
      }

      TempConfig = TempConfigFor(Class, int_result.config);
      IncreaseConfidence(TempConfig);
      if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
        Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
      }
      if (classify_learning_debug_level >= 1)
        tprintf("Increasing reliability of temp config %d to %d.\n",
                int_result.config, TempConfig->NumTimesSeen);

      if (TempConfigReliable(ClassId, TempConfig)) {
        MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
        UpdateAmbigsGroup(ClassId, Blob);
      }
    } else {
      if (classify_learning_debug_level >= 1) {
        tprintf("Found poor match to temp config %d = %4.1f%%.\n",
                int_result.config, int_result.rating * 100.0);
        if (classify_learning_debug_level > 2)
          DisplayAdaptedChar(Blob, IClass);
      }
      NewTempConfigId =
          MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId,
                                 NumFeatures, IntFeatures, FloatFeatures);
      if (NewTempConfigId >= 0 &&
          TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
        MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
        UpdateAmbigsGroup(ClassId, Blob);
      }

#ifndef GRAPHICS_DISABLED
      if (classify_learning_debug_level > 1) {
        DisplayAdaptedChar(Blob, IClass);
      }
#endif
    }
    FreeFeatureSet(FloatFeatures);
  }
}                                /* AdaptToChar */
void tesseract::Classify::AddLargeSpeckleTo ( int  blob_length,
BLOB_CHOICE_LIST *  choices 
)

Definition at line 212 of file classify.cpp.

                                                                           {
    BLOB_CHOICE_IT bc_it(choices);
  // If there is no classifier result, we will use the worst possible certainty
  // and corresponding rating.
  float certainty = -getDict().certainty_scale;
  float rating = rating_scale * blob_length;
  if (!choices->empty() && blob_length > 0) {
    bc_it.move_to_last();
    BLOB_CHOICE* worst_choice = bc_it.data();
    // Add speckle_rating_penalty to worst rating, matching old value.
    rating = worst_choice->rating() + speckle_rating_penalty;
    // Compute the rating to correspond to the certainty. (Used to be kept
    // the same, but that messes up the language model search.)
    certainty = -rating * getDict().certainty_scale /
        (rating_scale * blob_length);
  }
  BLOB_CHOICE* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
                                             -1, 0.0f, MAX_FLOAT32, 0,
                                             BCC_SPECKLE_CLASSIFIER);
  bc_it.add_to_end(blob_choice);
}
void tesseract::Classify::AddNewResult ( const UnicharRating new_result,
ADAPT_RESULTS results 
)

This routine adds the result of a classification into Results. If the new rating is much worse than the current best rating, it is not entered into results because it would end up being stripped later anyway. If the new rating is better than the old rating for the class, it replaces the old rating. If this is the first rating for the class, the class is added to the list of matched classes in Results. If the new rating is better than the best so far, it becomes the best so far.

Globals:

Parameters:
new_resultnew result to add
[out]resultsresults to add new result to
Note:
Exceptions: none
History: Tue Mar 12 18:19:29 1991, DSJ, Created.

Definition at line 1029 of file adaptmatch.cpp.

                                                    {
  int old_match = FindScoredUnichar(new_result.unichar_id, *results);

  if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
      (old_match < results->match.size() &&
       new_result.rating <= results->match[old_match].rating))
    return;  // New one not good enough.

  if (!unicharset.get_fragment(new_result.unichar_id))
    results->HasNonfragment = true;

  if (old_match < results->match.size()) {
    results->match[old_match].rating = new_result.rating;
  } else {
    results->match.push_back(new_result);
  }

  if (new_result.rating > results->best_rating &&
      // Ensure that fragments do not affect best rating, class and config.
      // This is needed so that at least one non-fragmented character is
      // always present in the results.
      // TODO(daria): verify that this helps accuracy and does not
      // hurt performance.
      !unicharset.get_fragment(new_result.unichar_id)) {
    results->best_match_index = old_match;
    results->best_rating = new_result.rating;
    results->best_unichar_id = new_result.unichar_id;
  }
}                                /* AddNewResult */
void tesseract::Classify::AmbigClassifier ( const GenericVector< INT_FEATURE_STRUCT > &  int_features,
const INT_FX_RESULT_STRUCT fx_info,
const TBLOB blob,
INT_TEMPLATES  templates,
ADAPT_CLASS classes,
UNICHAR_ID ambiguities,
ADAPT_RESULTS results 
)

This routine is identical to CharNormClassifier() except that it does no class pruning. It simply matches the unknown blob against the classes listed in Ambiguities.

Globals:

Parameters:
blobblob to be classified
templatesbuilt-in templates to classify against
classesadapted class templates
ambiguitiesarray of unichar id's to match against
[out]resultsplace to put match results
int_features
fx_info
Note:
Exceptions: none
History: Tue Mar 12 19:40:36 1991, DSJ, Created.

Definition at line 1083 of file adaptmatch.cpp.

                            {
  if (int_features.empty()) return;
  uinT8* CharNormArray = new uinT8[unicharset.size()];
  UnicharRating int_result;

  results->BlobLength = GetCharNormFeature(fx_info, templates, NULL,
                                           CharNormArray);
  bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
  if (debug)
    tprintf("AM Matches =  ");

  int top = blob->bounding_box().top();
  int bottom = blob->bounding_box().bottom();
  while (*ambiguities >= 0) {
    CLASS_ID class_id = *ambiguities;

    int_result.unichar_id = class_id;
    im_.Match(ClassForClassId(templates, class_id),
              AllProtosOn, AllConfigsOn,
              int_features.size(), &int_features[0],
              &int_result,
              classify_adapt_feature_threshold, NO_DEBUG,
              matcher_debug_separate_windows);

    ExpandShapesAndApplyCorrections(NULL, debug, class_id, bottom, top, 0,
                                    results->BlobLength,
                                    classify_integer_matcher_multiplier,
                                    CharNormArray, &int_result, results);
    ambiguities++;
  }
  delete [] CharNormArray;
}                                /* AmbigClassifier */
UNICHAR_ID * tesseract::Classify::BaselineClassifier ( TBLOB Blob,
const GenericVector< INT_FEATURE_STRUCT > &  int_features,
const INT_FX_RESULT_STRUCT fx_info,
ADAPT_TEMPLATES  Templates,
ADAPT_RESULTS Results 
)

This routine extracts baseline normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Globals:

  • BaselineCutoffs expected num features for each class
Parameters:
Blobblob to be classified
Templatescurrent set of adapted templates
Resultsplace to put match results
int_features
fx_info
Returns:
Array of possible ambiguous chars that should be checked.
Note:
Exceptions: none
History: Tue Mar 12 19:38:03 1991, DSJ, Created.

Definition at line 1305 of file adaptmatch.cpp.

                                                       {
  if (int_features.empty()) return NULL;
  uinT8* CharNormArray = new uinT8[unicharset.size()];
  ClearCharNormArray(CharNormArray);

  Results->BlobLength = IntCastRounded(fx_info.Length / kStandardFeatureLength);
  PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0],
               CharNormArray, BaselineCutoffs, &Results->CPResults);

  if (matcher_debug_level >= 2 || classify_debug_level > 1)
    tprintf("BL Matches =  ");

  MasterMatcher(Templates->Templates, int_features.size(), &int_features[0],
                CharNormArray,
                Templates->Class, matcher_debug_flags, 0,
                Blob->bounding_box(), Results->CPResults, Results);

  delete [] CharNormArray;
  CLASS_ID ClassId = Results->best_unichar_id;
  if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0)
    return NULL;

  return Templates->Class[ClassId]->
      Config[Results->match[Results->best_match_index].config].Perm->Ambigs;
}                                /* BaselineClassifier */
int tesseract::Classify::CharNormClassifier ( TBLOB blob,
const TrainingSample sample,
ADAPT_RESULTS adapt_results 
)

This routine extracts character normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Parameters:
blobblob to be classified
sampletemplates to classify unknown against
adapt_resultsplace to put match results

Globals:

  • CharNormCutoffs expected num features for each class
  • AllProtosOn mask that enables all protos
  • AllConfigsOn mask that enables all configs
Note:
Exceptions: none
History: Tue Mar 12 16:02:52 1991, DSJ, Created.

Definition at line 1354 of file adaptmatch.cpp.

                                                               {
  // This is the length that is used for scaling ratings vs certainty.
  adapt_results->BlobLength =
      IntCastRounded(sample.outline_length() / kStandardFeatureLength);
  GenericVector<UnicharRating> unichar_results;
  static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0,
                                            -1, &unichar_results);
  // Convert results to the format used internally by AdaptiveClassifier.
  for (int r = 0; r < unichar_results.size(); ++r) {
    AddNewResult(unichar_results[r], adapt_results);
  }
  return sample.num_features();
}                                /* CharNormClassifier */
int tesseract::Classify::CharNormTrainingSample ( bool  pruner_only,
int  keep_this,
const TrainingSample sample,
GenericVector< UnicharRating > *  results 
)

Definition at line 1372 of file adaptmatch.cpp.

                                                                            {
  results->clear();
  ADAPT_RESULTS* adapt_results = new ADAPT_RESULTS();
  adapt_results->Initialize();
  // Compute the bounding box of the features.
  int num_features = sample.num_features();
  // Only the top and bottom of the blob_box are used by MasterMatcher, so
  // fabricate right and left using top and bottom.
  TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
                sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
  // Compute the char_norm_array from the saved cn_feature.
  FEATURE norm_feature = sample.GetCNFeature();
  uinT8* char_norm_array = new uinT8[unicharset.size()];
  int num_pruner_classes = MAX(unicharset.size(),
                               PreTrainedTemplates->NumClasses);
  uinT8* pruner_norm_array = new uinT8[num_pruner_classes];
  adapt_results->BlobLength =
      static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
  ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
                        pruner_norm_array);

  PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(),
               pruner_norm_array,
               shape_table_ != NULL ? &shapetable_cutoffs_[0] : CharNormCutoffs,
               &adapt_results->CPResults);
  delete [] pruner_norm_array;
  if (keep_this >= 0) {
    adapt_results->CPResults[0].Class = keep_this;
    adapt_results->CPResults.truncate(1);
  }
  if (pruner_only) {
    // Convert pruner results to output format.
    for (int i = 0; i < adapt_results->CPResults.size(); ++i) {
      int class_id = adapt_results->CPResults[i].Class;
      results->push_back(
          UnicharRating(class_id, 1.0f - adapt_results->CPResults[i].Rating));
    }
  } else {
    MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
                  char_norm_array,
                  NULL, matcher_debug_flags,
                  classify_integer_matcher_multiplier,
                  blob_box, adapt_results->CPResults, adapt_results);
    // Convert master matcher results to output format.
    for (int i = 0; i < adapt_results->match.size(); i++) {
      results->push_back(adapt_results->match[i]);
    }
    results->sort(&UnicharRating::SortDescendingRating);
  }
  delete [] char_norm_array;
  delete adapt_results;
  return num_features;
}                                /* CharNormTrainingSample */
int tesseract::Classify::ClassAndConfigIDToFontOrShapeID ( int  class_id,
int  int_result_config 
) const

Definition at line 2283 of file adaptmatch.cpp.

                                                                           {
  int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
  // Older inttemps have no font_ids.
  if (font_set_id < 0)
    return kBlankFontinfoId;
  const FontSet &fs = fontset_table_.get(font_set_id);
  ASSERT_HOST(int_result_config >= 0 && int_result_config < fs.size);
  return fs.configs[int_result_config];
}
STRING tesseract::Classify::ClassIDToDebugStr ( const INT_TEMPLATES_STRUCT templates,
int  class_id,
int  config_id 
) const

Definition at line 2270 of file adaptmatch.cpp.

                                                                      {
  STRING class_string;
  if (templates == PreTrainedTemplates && shape_table_ != NULL) {
    int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
    class_string = shape_table_->DebugStr(shape_id);
  } else {
    class_string = unicharset.debug_str(class_id);
  }
  return class_string;
}
void tesseract::Classify::ClassifyAsNoise ( ADAPT_RESULTS results)

This routine computes a rating which reflects the likelihood that the blob being classified is a noise blob. NOTE: assumes that the blob length has already been computed and placed into Results.

Parameters:
resultsresults to add noise classification to

Globals:

  • matcher_avg_noise_size avg. length of a noise blob
Note:
Exceptions: none
History: Tue Mar 12 18:36:52 1991, DSJ, Created.

Definition at line 1445 of file adaptmatch.cpp.

                                                     {
  float rating = results->BlobLength / matcher_avg_noise_size;
  rating *= rating;
  rating /= 1.0 + rating;

  AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
}                                /* ClassifyAsNoise */
void tesseract::Classify::ClearCharNormArray ( uinT8 char_norm_array)

For each class in the unicharset, clears the corresponding entry in char_norm_array. char_norm_array is indexed by unichar_id.

Globals:

  • none
Parameters:
char_norm_arrayarray to be cleared
Note:
Exceptions: none
History: Wed Feb 20 11:20:54 1991, DSJ, Created.

Definition at line 48 of file float2int.cpp.

                                                        {
  memset(char_norm_array, 0, sizeof(*char_norm_array) * unicharset.size());
}                                /* ClearCharNormArray */
void tesseract::Classify::ComputeCharNormArrays ( FEATURE_STRUCT norm_feature,
INT_TEMPLATES_STRUCT templates,
uinT8 char_norm_array,
uinT8 pruner_array 
)

Definition at line 1747 of file adaptmatch.cpp.

                                                          {
  ComputeIntCharNormArray(*norm_feature, char_norm_array);
  if (pruner_array != NULL) {
    if (shape_table_ == NULL) {
      ComputeIntCharNormArray(*norm_feature, pruner_array);
    } else {
      memset(pruner_array, MAX_UINT8,
             templates->NumClasses * sizeof(pruner_array[0]));
      // Each entry in the pruner norm array is the MIN of all the entries of
      // the corresponding unichars in the CharNormArray.
      for (int id = 0; id < templates->NumClasses; ++id) {
        int font_set_id = templates->Class[id]->font_set_id;
        const FontSet &fs = fontset_table_.get(font_set_id);
        for (int config = 0; config < fs.size; ++config) {
          const Shape& shape = shape_table_->GetShape(fs.configs[config]);
          for (int c = 0; c < shape.size(); ++c) {
            if (char_norm_array[shape[c].unichar_id] < pruner_array[id])
              pruner_array[id] = char_norm_array[shape[c].unichar_id];
          }
        }
      }
    }
  }
  FreeFeature(norm_feature);
}
double tesseract::Classify::ComputeCorrectedRating ( bool  debug,
int  unichar_id,
double  cp_rating,
double  im_rating,
int  feature_misses,
int  bottom,
int  top,
int  blob_length,
int  matcher_multiplier,
const uinT8 cn_factors 
)

Definition at line 1240 of file adaptmatch.cpp.

                                                                 {
  // Compute class feature corrections.
  double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length,
                                              cn_factors[unichar_id],
                                              matcher_multiplier);
  double miss_penalty = tessedit_class_miss_scale * feature_misses;
  double vertical_penalty = 0.0;
  // Penalize non-alnums for being vertical misfits.
  if (!unicharset.get_isalpha(unichar_id) &&
      !unicharset.get_isdigit(unichar_id) &&
      cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
    int min_bottom, max_bottom, min_top, max_top;
    unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
                              &min_top, &max_top);
    if (debug) {
      tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
              top, min_top, max_top, bottom, min_bottom, max_bottom);
    }
    if (top < min_top || top > max_top ||
        bottom < min_bottom || bottom > max_bottom) {
      vertical_penalty = classify_misfit_junk_penalty;
    }
  }
  double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
  if (result < WORST_POSSIBLE_RATING)
    result = WORST_POSSIBLE_RATING;
  if (debug) {
    tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
            unicharset.id_to_unichar(unichar_id),
            result * 100.0,
            cp_rating * 100.0,
            (1.0 - im_rating) * 100.0,
            (cn_corrected - (1.0 - im_rating)) * 100.0,
            cn_factors[unichar_id],
            miss_penalty * 100.0,
            vertical_penalty * 100.0);
  }
  return result;
}
void tesseract::Classify::ComputeIntCharNormArray ( const FEATURE_STRUCT norm_feature,
uinT8 char_norm_array 
)

For each class in unicharset, computes the match between norm_feature and the normalization protos for that class. Converts this number to the range from 0 - 255 and stores it into char_norm_array. CharNormArray is indexed by unichar_id.

Globals:

  • PreTrainedTemplates current set of built-in templates
Parameters:
norm_featurecharacter normalization feature
[out]char_norm_arrayplace to put results of size unicharset.size()
Note:
Exceptions: none
History: Wed Feb 20 11:20:54 1991, DSJ, Created.

Definition at line 69 of file float2int.cpp.

                                                               {
  for (int i = 0; i < unicharset.size(); i++) {
    if (i < PreTrainedTemplates->NumClasses) {
      int norm_adjust = static_cast<int>(INT_CHAR_NORM_RANGE *
        ComputeNormMatch(i, norm_feature, FALSE));
      char_norm_array[i] = ClipToRange(norm_adjust, 0, MAX_INT_CHAR_NORM);
    } else {
      // Classes with no templates (eg. ambigs & ligatures) default
      // to worst match.
      char_norm_array[i] = MAX_INT_CHAR_NORM;
    }
  }
}                                /* ComputeIntCharNormArray */
void tesseract::Classify::ComputeIntFeatures ( FEATURE_SET  Features,
INT_FEATURE_ARRAY  IntFeatures 
)

This routine converts each floating point pico-feature in Features into integer format and saves it into IntFeatures.

Globals:

  • none
Parameters:
Featuresfloating point pico-features to be converted
[out]IntFeaturesarray to put converted features into
Note:
Exceptions: none
History: Wed Feb 20 10:58:45 1991, DSJ, Created.

Definition at line 100 of file float2int.cpp.

                                                                 {
  int Fid;
  FEATURE Feature;
  FLOAT32 YShift;

  if (classify_norm_method == baseline)
    YShift = BASELINE_Y_SHIFT;
  else
    YShift = Y_SHIFT;

  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
    Feature = Features->Features[Fid];

    IntFeatures[Fid].X =
        Bucket8For(Feature->Params[PicoFeatX], X_SHIFT, INT_FEAT_RANGE);
    IntFeatures[Fid].Y =
        Bucket8For(Feature->Params[PicoFeatY], YShift, INT_FEAT_RANGE);
    IntFeatures[Fid].Theta = CircBucketFor(Feature->Params[PicoFeatDir],
                                           ANGLE_SHIFT, INT_FEAT_RANGE);
    IntFeatures[Fid].CP_misses = 0;
  }
}                                /* ComputeIntFeatures */
FLOAT32 tesseract::Classify::ComputeNormMatch ( CLASS_ID  ClassId,
const FEATURE_STRUCT feature,
BOOL8  DebugMatch 
)

This routine compares Features against each character normalization proto for ClassId and returns the match rating of the best match.

Parameters:
ClassIdid of class to match against
featurecharacter normalization feature
DebugMatchcontrols dump of debug info

Globals: NormProtos character normalization prototypes

Returns:
Best match rating for Feature against protos of ClassId.
Note:
Exceptions: none
History: Wed Dec 19 16:56:12 1990, DSJ, Created.

Definition at line 88 of file normmatch.cpp.

                                                     {
  LIST Protos;
  FLOAT32 BestMatch;
  FLOAT32 Match;
  FLOAT32 Delta;
  PROTOTYPE *Proto;
  int ProtoId;

  if (ClassId >= NormProtos->NumProtos) {
    ClassId = NO_CLASS;
  }

  /* handle requests for classification as noise */
  if (ClassId == NO_CLASS) {
    /* kludge - clean up constants and make into control knobs later */
    Match = (feature.Params[CharNormLength] *
      feature.Params[CharNormLength] * 500.0 +
      feature.Params[CharNormRx] *
      feature.Params[CharNormRx] * 8000.0 +
      feature.Params[CharNormRy] *
      feature.Params[CharNormRy] * 8000.0);
    return (1.0 - NormEvidenceOf (Match));
  }

  BestMatch = MAX_FLOAT32;
  Protos = NormProtos->Protos[ClassId];

  if (DebugMatch) {
    tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
  }

  ProtoId = 0;
  iterate(Protos) {
    Proto = (PROTOTYPE *) first_node (Protos);
    Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
    Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
    if (DebugMatch) {
      tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
              Proto->Mean[CharNormY], Delta,
              Proto->Weight.Elliptical[CharNormY], Match);
    }
    Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
    Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
    if (DebugMatch) {
      tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
              Proto->Mean[CharNormRx], Delta,
              Proto->Weight.Elliptical[CharNormRx], Match);
    }
    // Ry is width! See intfx.cpp.
    Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
    if (DebugMatch) {
      tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
              Proto->Mean[CharNormRy], Delta,
              Proto->Weight.Elliptical[CharNormRy]);
    }
    Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
    Delta *= kWidthErrorWeighting;
    Match += Delta;
    if (DebugMatch) {
      tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
              Match, Match / classify_norm_adj_midpoint,
              NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
    }

    if (Match < BestMatch)
      BestMatch = Match;

    ProtoId++;
  }
  return 1.0 - NormEvidenceOf(BestMatch);
}                                /* ComputeNormMatch */
void tesseract::Classify::ConvertMatchesToChoices ( const DENORM denorm,
const TBOX box,
ADAPT_RESULTS Results,
BLOB_CHOICE_LIST *  Choices 
)

The function converts the given match ratings to the list of blob choices with ratings and certainties (used by the context checkers). If character fragments are present in the results, this function also makes sure that there is at least one non-fragmented classification included. For each classification result check the unicharset for "definite" ambiguities and modify the resulting Choices accordingly.

Definition at line 1459 of file adaptmatch.cpp.

                                                                  {
  assert(Choices != NULL);
  FLOAT32 Rating;
  FLOAT32 Certainty;
  BLOB_CHOICE_IT temp_it;
  bool contains_nonfrag = false;
  temp_it.set_to_list(Choices);
  int choices_length = 0;
  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
  // number of returned results, but with a shape_table_ we want to have room
  // for at least the biggest shape (which might contain hundreds of Indic
  // grapheme fragments) and more, so use double the size of the biggest shape
  // if that is more than the default.
  int max_matches = MAX_MATCHES;
  if (shape_table_ != NULL) {
    max_matches = shape_table_->MaxNumUnichars() * 2;
    if (max_matches < MAX_MATCHES)
      max_matches = MAX_MATCHES;
  }

  float best_certainty = -MAX_FLOAT32;
  for (int i = 0; i < Results->match.size(); i++) {
    const UnicharRating& result = Results->match[i];
    bool adapted = result.adapted;
    bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != NULL);
    if (temp_it.length()+1 == max_matches &&
        !contains_nonfrag && current_is_frag) {
      continue;  // look for a non-fragmented character to fill the
                 // last spot in Choices if only fragments are present
    }
    // BlobLength can never be legally 0, this means recognition failed.
    // But we must return a classification result because some invoking
    // functions (chopper/permuter) do not anticipate a null blob choice.
    // So we need to assign a poor, but not infinitely bad score.
    if (Results->BlobLength == 0) {
      Certainty = -20;
      Rating = 100;    // should be -certainty * real_blob_length
    } else {
      Rating = Certainty = (1.0f - result.rating);
      Rating *= rating_scale * Results->BlobLength;
      Certainty *= -(getDict().certainty_scale);
    }
    // Adapted results, by their very nature, should have good certainty.
    // Those that don't are at best misleading, and often lead to errors,
    // so don't accept adapted results that are too far behind the best result,
    // whether adapted or static.
    // TODO(rays) find some way of automatically tuning these constants.
    if (Certainty > best_certainty) {
      best_certainty = MIN(Certainty, classify_adapted_pruning_threshold);
    } else if (adapted &&
               Certainty / classify_adapted_pruning_factor < best_certainty) {
      continue;  // Don't accept bad adapted results.
    }

    float min_xheight, max_xheight, yshift;
    denorm.XHeightRange(result.unichar_id, unicharset, box,
                        &min_xheight, &max_xheight, &yshift);
    BLOB_CHOICE* choice =
        new BLOB_CHOICE(result.unichar_id, Rating, Certainty,
                        unicharset.get_script(result.unichar_id),
                        min_xheight, max_xheight, yshift,
                        adapted ? BCC_ADAPTED_CLASSIFIER
                                : BCC_STATIC_CLASSIFIER);
    choice->set_fonts(result.fonts);
    temp_it.add_to_end(choice);
    contains_nonfrag |= !current_is_frag;  // update contains_nonfrag
    choices_length++;
    if (choices_length >= max_matches) break;
  }
  Results->match.truncate(choices_length);
}  // ConvertMatchesToChoices
void tesseract::Classify::ConvertProto ( PROTO  Proto,
int  ProtoId,
INT_CLASS  Class 
)

This routine converts Proto to integer format and installs it as ProtoId in Class.

Parameters:
Protofloating-pt proto to be converted to integer format
ProtoIdid of proto
Classinteger class to add converted proto to
Returns:
none
Note:
Globals: none
Exceptions: none
History: Fri Feb 8 11:22:43 1991, DSJ, Created.

Definition at line 522 of file intproto.cpp.

                                                                     {
  INT_PROTO P;
  FLOAT32 Param;

  assert(ProtoId < Class->NumProtos);

  P = ProtoForProtoId(Class, ProtoId);

  Param = Proto->A * 128;
  P->A = TruncateParam(Param, -128, 127, NULL);

  Param = -Proto->B * 256;
  P->B = TruncateParam(Param, 0, 255, NULL);

  Param = Proto->C * 128;
  P->C = TruncateParam(Param, -128, 127, NULL);

  Param = Proto->Angle * 256;
  if (Param < 0 || Param >= 256)
    P->Angle = 0;
  else
    P->Angle = (uinT8) Param;

  /* round proto length to nearest integer number of pico-features */
  Param = (Proto->Length / GetPicoFeatureLength()) + 0.5;
  Class->ProtoLengths[ProtoId] = TruncateParam(Param, 1, 255, NULL);
  if (classify_learning_debug_level >= 2)
    cprintf("Converted ffeat to (A=%d,B=%d,C=%d,L=%d)",
            P->A, P->B, P->C, Class->ProtoLengths[ProtoId]);
}                                /* ConvertProto */
INT_TEMPLATES tesseract::Classify::CreateIntTemplates ( CLASSES  FloatProtos,
const UNICHARSET target_unicharset 
)

This routine converts from the old floating point format to the new integer format.

Parameters:
FloatProtosprototypes in old floating pt format
target_unicharsetthe UNICHARSET to use
Returns:
New set of training templates in integer format.
Note:
Globals: none
Exceptions: none
History: Thu Feb 7 14:40:42 1991, DSJ, Created.

Definition at line 564 of file intproto.cpp.

                                                              {
  INT_TEMPLATES IntTemplates;
  CLASS_TYPE FClass;
  INT_CLASS IClass;
  int ClassId;
  int ProtoId;
  int ConfigId;

  IntTemplates = NewIntTemplates();

  for (ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
    FClass = &(FloatProtos[ClassId]);
    if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
        strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
      cprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
              target_unicharset.id_to_unichar(ClassId));
    }
    assert(UnusedClassIdIn(IntTemplates, ClassId));
    IClass = NewIntClass(FClass->NumProtos, FClass->NumConfigs);
    FontSet fs;
    fs.size = FClass->font_set.size();
    fs.configs = new int[fs.size];
    for (int i = 0; i < fs.size; ++i) {
      fs.configs[i] = FClass->font_set.get(i);
    }
    if (this->fontset_table_.contains(fs)) {
      IClass->font_set_id = this->fontset_table_.get_id(fs);
      delete[] fs.configs;
    } else {
      IClass->font_set_id = this->fontset_table_.push_back(fs);
    }
    AddIntClass(IntTemplates, ClassId, IClass);

    for (ProtoId = 0; ProtoId < FClass->NumProtos; ProtoId++) {
      AddIntProto(IClass);
      ConvertProto(ProtoIn(FClass, ProtoId), ProtoId, IClass);
      AddProtoToProtoPruner(ProtoIn(FClass, ProtoId), ProtoId, IClass,
                            classify_learning_debug_level >= 2);
      AddProtoToClassPruner(ProtoIn(FClass, ProtoId), ClassId, IntTemplates);
    }

    for (ConfigId = 0; ConfigId < FClass->NumConfigs; ConfigId++) {
      AddIntConfig(IClass);
      ConvertConfig(FClass->Configurations[ConfigId], ConfigId, IClass);
    }
  }
  return (IntTemplates);
}                                /* CreateIntTemplates */
void tesseract::Classify::DebugAdaptiveClassifier ( TBLOB blob,
ADAPT_RESULTS Results 
)
Parameters:
blobblob whose classification is being debugged
Resultsresults of match being debugged

Globals: none

Note:
Exceptions: none
History: Wed Mar 13 16:44:41 1991, DSJ, Created.

Definition at line 1546 of file adaptmatch.cpp.

                                                               {
  if (static_classifier_ == NULL) return;
  INT_FX_RESULT_STRUCT fx_info;
  GenericVector<INT_FEATURE_STRUCT> bl_features;
  TrainingSample* sample =
      BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
  if (sample == NULL) return;
  static_classifier_->DebugDisplay(*sample, blob->denorm().pix(),
                                   Results->best_unichar_id);
}                                /* DebugAdaptiveClassifier */
void tesseract::Classify::DisplayAdaptedChar ( TBLOB blob,
INT_CLASS_STRUCT int_class 
)

Definition at line 978 of file adaptmatch.cpp.

                                                                          {
#ifndef GRAPHICS_DISABLED
  INT_FX_RESULT_STRUCT fx_info;
  GenericVector<INT_FEATURE_STRUCT> bl_features;
  TrainingSample* sample =
      BlobToTrainingSample(*blob, classify_nonlinear_norm, &fx_info,
                           &bl_features);
  if (sample == NULL) return;

  UnicharRating int_result;
  im_.Match(int_class, AllProtosOn, AllConfigsOn,
            bl_features.size(), &bl_features[0],
            &int_result, classify_adapt_feature_threshold,
            NO_DEBUG, matcher_debug_separate_windows);
  tprintf("Best match to temp config %d = %4.1f%%.\n",
          int_result.config, int_result.rating * 100.0);
  if (classify_learning_debug_level >= 2) {
    uinT32 ConfigMask;
    ConfigMask = 1 << int_result.config;
    ShowMatchDisplay();
    im_.Match(int_class, AllProtosOn, (BIT_VECTOR)&ConfigMask,
              bl_features.size(), &bl_features[0],
              &int_result, classify_adapt_feature_threshold,
              6 | 0x19, matcher_debug_separate_windows);
    UpdateMatchDisplay();
  }
#endif
}
void tesseract::Classify::DoAdaptiveMatch ( TBLOB Blob,
ADAPT_RESULTS Results 
)

This routine performs an adaptive classification. If we have not yet adapted to enough classes, a simple classification to the pre-trained templates is performed. Otherwise, we match the blob against the adapted templates. If the adapted templates do not match well, we try a match against the pre-trained templates. If an adapted template match is found, we do a match to any pre-trained templates which could be ambiguous. The results from all of these classifications are merged together into Results.

Parameters:
Blobblob to be classified
Resultsplace to put match results

Globals:

  • PreTrainedTemplates built-in training templates
  • AdaptedTemplates templates adapted for this page
  • matcher_reliable_adaptive_result rating limit for a great match
Note:
Exceptions: none
History: Tue Mar 12 08:50:11 1991, DSJ, Created.

Definition at line 1582 of file adaptmatch.cpp.

                                                                  {
  UNICHAR_ID *Ambiguities;

  INT_FX_RESULT_STRUCT fx_info;
  GenericVector<INT_FEATURE_STRUCT> bl_features;
  TrainingSample* sample =
      BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info,
                           &bl_features);
  if (sample == NULL) return;

  if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min ||
      tess_cn_matching) {
    CharNormClassifier(Blob, *sample, Results);
  } else {
    Ambiguities = BaselineClassifier(Blob, bl_features, fx_info,
                                     AdaptedTemplates, Results);
    if ((!Results->match.empty() &&
         MarginalMatch(Results->best_rating,
                       matcher_reliable_adaptive_result) &&
         !tess_bn_matching) ||
        Results->match.empty()) {
      CharNormClassifier(Blob, *sample, Results);
    } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
      AmbigClassifier(bl_features, fx_info, Blob,
                      PreTrainedTemplates,
                      AdaptedTemplates->Class,
                      Ambiguities,
                      Results);
    }
  }

  // Force the blob to be classified as noise
  // if the results contain only fragments.
  // TODO(daria): verify that this is better than
  // just adding a NULL classification.
  if (!Results->HasNonfragment || Results->match.empty())
    ClassifyAsNoise(Results);
  delete sample;
}   /* DoAdaptiveMatch */
void tesseract::Classify::EndAdaptiveClassifier ( )

This routine performs cleanup operations on the adaptive classifier. It should be called before the program is terminated. Its main function is to save the adapted templates to a file.

Globals:

Note:
Exceptions: none
History: Tue Mar 19 14:37:06 1991, DSJ, Created.

Definition at line 456 of file adaptmatch.cpp.

                                     {
  STRING Filename;
  FILE *File;

  if (AdaptedTemplates != NULL &&
      classify_enable_adaptive_matcher && classify_save_adapted_templates) {
    Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
    File = fopen (Filename.string(), "wb");
    if (File == NULL)
      cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
    else {
      cprintf ("\nSaving adapted templates to %s ...", Filename.string());
      fflush(stdout);
      WriteAdaptedTemplates(File, AdaptedTemplates);
      cprintf ("\n");
      fclose(File);
    }
  }

  if (AdaptedTemplates != NULL) {
    free_adapted_templates(AdaptedTemplates);
    AdaptedTemplates = NULL;
  }
  if (BackupAdaptedTemplates != NULL) {
    free_adapted_templates(BackupAdaptedTemplates);
    BackupAdaptedTemplates = NULL;
  }

  if (PreTrainedTemplates != NULL) {
    free_int_templates(PreTrainedTemplates);
    PreTrainedTemplates = NULL;
  }
  getDict().EndDangerousAmbigs();
  FreeNormProtos();
  if (AllProtosOn != NULL) {
    FreeBitVector(AllProtosOn);
    FreeBitVector(AllConfigsOn);
    FreeBitVector(AllConfigsOff);
    FreeBitVector(TempProtoMask);
    AllProtosOn = NULL;
    AllConfigsOn = NULL;
    AllConfigsOff = NULL;
    TempProtoMask = NULL;
  }
  delete shape_table_;
  shape_table_ = NULL;
  if (static_classifier_ != NULL) {
    delete static_classifier_;
    static_classifier_ = NULL;
  }
}                                /* EndAdaptiveClassifier */
void tesseract::Classify::ExpandShapesAndApplyCorrections ( ADAPT_CLASS classes,
bool  debug,
int  class_id,
int  bottom,
int  top,
float  cp_rating,
int  blob_length,
int  matcher_multiplier,
const uinT8 cn_factors,
UnicharRating int_result,
ADAPT_RESULTS final_results 
)

Definition at line 1166 of file adaptmatch.cpp.

                                                             {
  if (classes != NULL) {
    // Adapted result. Convert configs to fontinfo_ids.
    int_result->adapted = true;
    for (int f = 0; f < int_result->fonts.size(); ++f) {
      int_result->fonts[f].fontinfo_id =
          GetFontinfoId(classes[class_id], int_result->fonts[f].fontinfo_id);
    }
  } else {
    // Pre-trained result. Map fonts using font_sets_.
    int_result->adapted = false;
    for (int f = 0; f < int_result->fonts.size(); ++f) {
      int_result->fonts[f].fontinfo_id =
          ClassAndConfigIDToFontOrShapeID(class_id,
                                          int_result->fonts[f].fontinfo_id);
    }
    if (shape_table_ != NULL) {
      // Two possible cases:
      // 1. Flat shapetable. All unichar-ids of the shapes referenced by
      // int_result->fonts are the same. In this case build a new vector of
      // mapped fonts and replace the fonts in int_result.
      // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
      // by int_result. In this case, build a vector of UnicharRating to
      // gather together different font-ids for each unichar. Also covers case1.
      GenericVector<UnicharRating> mapped_results;
      for (int f = 0; f < int_result->fonts.size(); ++f) {
        int shape_id = int_result->fonts[f].fontinfo_id;
        const Shape& shape = shape_table_->GetShape(shape_id);
        for (int c = 0; c < shape.size(); ++c) {
          int unichar_id = shape[c].unichar_id;
          if (!unicharset.get_enabled(unichar_id)) continue;
          // Find the mapped_result for unichar_id.
          int r = 0;
          for (r = 0; r < mapped_results.size() &&
               mapped_results[r].unichar_id != unichar_id; ++r) {}
          if (r == mapped_results.size()) {
            mapped_results.push_back(*int_result);
            mapped_results[r].unichar_id = unichar_id;
            mapped_results[r].fonts.truncate(0);
          }
          for (int i = 0; i < shape[c].font_ids.size(); ++i) {
            mapped_results[r].fonts.push_back(
                ScoredFont(shape[c].font_ids[i], int_result->fonts[f].score));
          }
        }
      }
      for (int m = 0; m < mapped_results.size(); ++m) {
        mapped_results[m].rating =
            ComputeCorrectedRating(debug, mapped_results[m].unichar_id,
                                   cp_rating, int_result->rating,
                                   int_result->feature_misses, bottom, top,
                                   blob_length, matcher_multiplier, cn_factors);
        AddNewResult(mapped_results[m], final_results);
      }
      return;
    }
  }
  if (unicharset.get_enabled(class_id)) {
    int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating,
                                                int_result->rating,
                                                int_result->feature_misses,
                                                bottom, top, blob_length,
                                                matcher_multiplier, cn_factors);
    AddNewResult(*int_result, final_results);
  }
}
void tesseract::Classify::ExtractFeatures ( const TBLOB blob,
bool  nonlinear_norm,
GenericVector< INT_FEATURE_STRUCT > *  bl_features,
GenericVector< INT_FEATURE_STRUCT > *  cn_features,
INT_FX_RESULT_STRUCT results,
GenericVector< int > *  outline_cn_counts 
) [static]

Definition at line 445 of file intfx.cpp.

                                                                      {
  DENORM bl_denorm, cn_denorm;
  tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm,
                                        &bl_denorm, &cn_denorm, results);
  if (outline_cn_counts != NULL)
    outline_cn_counts->truncate(0);
  // Iterate the outlines.
  for (TESSLINE* ol = blob.outlines; ol != NULL; ol = ol->next) {
    // Iterate the polygon.
    EDGEPT* loop_pt = ol->FindBestStartPt();
    EDGEPT* pt = loop_pt;
    if (pt == NULL) continue;
    do {
      if (pt->IsHidden()) continue;
      // Find a run of equal src_outline.
      EDGEPT* last_pt = pt;
      do {
        last_pt = last_pt->next;
      } while (last_pt != loop_pt && !last_pt->IsHidden() &&
               last_pt->src_outline == pt->src_outline);
      last_pt = last_pt->prev;
      // Until the adaptive classifier can be weaned off polygon segments,
      // we have to force extraction from the polygon for the bl_features.
      ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength,
                             true, bl_features);
      ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength,
                             false, cn_features);
      pt = last_pt;
    } while ((pt = pt->next) != loop_pt);
    if (outline_cn_counts != NULL)
      outline_cn_counts->push_back(cn_features->size());
  }
  results->NumBL = bl_features->size();
  results->NumCN = cn_features->size();
  results->YBottom = blob.bounding_box().bottom();
  results->YTop = blob.bounding_box().top();
  results->Width = blob.bounding_box().width();
}
FEATURE_SET tesseract::Classify::ExtractIntCNFeatures ( const TBLOB blob,
const INT_FX_RESULT_STRUCT fx_info 
)
Parameters:
blobblob to extract features from
fx_info
Returns:
Integer character-normalized features for blob.
Note:
Exceptions: none
History: 8/8/2011, rays, Created.

Definition at line 230 of file picofeat.cpp.

                                                            {
  INT_FX_RESULT_STRUCT local_fx_info(fx_info);
  GenericVector<INT_FEATURE_STRUCT> bl_features;
  tesseract::TrainingSample* sample = tesseract::BlobToTrainingSample(
      blob, false, &local_fx_info, &bl_features);
  if (sample == NULL) return NULL;

  int num_features = sample->num_features();
  const INT_FEATURE_STRUCT* features = sample->features();
  FEATURE_SET feature_set = NewFeatureSet(num_features);
  for (int f = 0; f < num_features; ++f) {
    FEATURE feature = NewFeature(&IntFeatDesc);

    feature->Params[IntX] = features[f].X;
    feature->Params[IntY] = features[f].Y;
    feature->Params[IntDir] = features[f].Theta;
    AddFeature(feature_set, feature);
  }
  delete sample;

  return feature_set;
}                                /* ExtractIntCNFeatures */
FEATURE_SET tesseract::Classify::ExtractIntGeoFeatures ( const TBLOB blob,
const INT_FX_RESULT_STRUCT fx_info 
)
Parameters:
blobblob to extract features from
fx_info
Returns:
Geometric (top/bottom/width) features for blob.
Note:
Exceptions: none
History: 8/8/2011, rays, Created.

Definition at line 262 of file picofeat.cpp.

                                                            {
  INT_FX_RESULT_STRUCT local_fx_info(fx_info);
  GenericVector<INT_FEATURE_STRUCT> bl_features;
  tesseract::TrainingSample* sample = tesseract::BlobToTrainingSample(
      blob, false, &local_fx_info, &bl_features);
  if (sample == NULL) return NULL;

  FEATURE_SET feature_set = NewFeatureSet(1);
  FEATURE feature = NewFeature(&IntFeatDesc);

  feature->Params[GeoBottom] = sample->geo_feature(GeoBottom);
  feature->Params[GeoTop] = sample->geo_feature(GeoTop);
  feature->Params[GeoWidth] = sample->geo_feature(GeoWidth);
  AddFeature(feature_set, feature);
  delete sample;

  return feature_set;
}                                /* ExtractIntGeoFeatures */
FEATURE_SET tesseract::Classify::ExtractOutlineFeatures ( TBLOB Blob)

Convert each segment in the outline to a feature and return the features.

Parameters:
Blobblob to extract pico-features from
Returns:
Outline-features for Blob.
Note:
Globals: none
Exceptions: none
History:
  • 11/13/90, DSJ, Created.
  • 05/24/91, DSJ, Updated for either char or baseline normalize.

Definition at line 47 of file outfeat.cpp.

                                                        {
  LIST Outlines;
  LIST RemainingOutlines;
  MFOUTLINE Outline;
  FEATURE_SET FeatureSet;
  FLOAT32 XScale, YScale;

  FeatureSet = NewFeatureSet (MAX_OUTLINE_FEATURES);
  if (Blob == NULL)
    return (FeatureSet);

  Outlines = ConvertBlob (Blob);

  NormalizeOutlines(Outlines, &XScale, &YScale);
  RemainingOutlines = Outlines;
  iterate(RemainingOutlines) {
    Outline = (MFOUTLINE) first_node (RemainingOutlines);
    ConvertToOutlineFeatures(Outline, FeatureSet);
  }
  if (classify_norm_method == baseline)
    NormalizeOutlineX(FeatureSet);
  FreeOutlines(Outlines);
  return (FeatureSet);
}                                /* ExtractOutlineFeatures */
FEATURE_SET tesseract::Classify::ExtractPicoFeatures ( TBLOB Blob)

Operation: Dummy for now.

Globals:

  • classify_norm_method normalization method currently specified
    Parameters:
    Blobblob to extract pico-features from
    Returns:
    Pico-features for Blob.
    Note:
    Exceptions: none
    History: 9/4/90, DSJ, Created.

Definition at line 67 of file picofeat.cpp.

                                                     {
  LIST Outlines;
  LIST RemainingOutlines;
  MFOUTLINE Outline;
  FEATURE_SET FeatureSet;
  FLOAT32 XScale, YScale;

  FeatureSet = NewFeatureSet(MAX_PICO_FEATURES);
  Outlines = ConvertBlob(Blob);
  NormalizeOutlines(Outlines, &XScale, &YScale);
  RemainingOutlines = Outlines;
  iterate(RemainingOutlines) {
    Outline = (MFOUTLINE) first_node (RemainingOutlines);
    ConvertToPicoFeatures2(Outline, FeatureSet);
  }
  if (classify_norm_method == baseline)
    NormalizePicoX(FeatureSet);
  FreeOutlines(Outlines);
  return (FeatureSet);

}                                /* ExtractPicoFeatures */
void tesseract::Classify::FreeNormProtos ( )

Definition at line 162 of file normmatch.cpp.

                              {
  if (NormProtos != NULL) {
    for (int i = 0; i < NormProtos->NumProtos; i++)
      FreeProtoList(&NormProtos->Protos[i]);
    Efree(NormProtos->Protos);
    Efree(NormProtos->ParamDesc);
    Efree(NormProtos);
    NormProtos = NULL;
  }
}
UnicityTable<FontInfo>& tesseract::Classify::get_fontinfo_table ( ) [inline]

Definition at line 345 of file classify.h.

                                               {
    return fontinfo_table_;
  }
const UnicityTable<FontInfo>& tesseract::Classify::get_fontinfo_table ( ) const [inline]

Definition at line 348 of file classify.h.

                                                           {
    return fontinfo_table_;
  }
UnicityTable<FontSet>& tesseract::Classify::get_fontset_table ( ) [inline]

Definition at line 351 of file classify.h.

                                             {
    return fontset_table_;
  }
int tesseract::Classify::GetAdaptiveFeatures ( TBLOB Blob,
INT_FEATURE_ARRAY  IntFeatures,
FEATURE_SET FloatFeatures 
)

This routine sets up the feature extractor to extract baseline normalized pico-features.

The extracted pico-features are converted to integer form and placed in IntFeatures. The original floating-pt. features are returned in FloatFeatures.

Globals: none

Parameters:
Blobblob to extract features from
[out]IntFeaturesarray to fill with integer features
[out]FloatFeaturesplace to return actual floating-pt features
Returns:
Number of pico-features returned (0 if an error occurred)
Note:
Exceptions: none
History: Tue Mar 12 17:55:18 1991, DSJ, Created.

Definition at line 812 of file adaptmatch.cpp.

                                                              {
  FEATURE_SET Features;
  int NumFeatures;

  classify_norm_method.set_value(baseline);
  Features = ExtractPicoFeatures(Blob);

  NumFeatures = Features->NumFeatures;
  if (NumFeatures > UNLIKELY_NUM_FEAT) {
    FreeFeatureSet(Features);
    return 0;
  }

  ComputeIntFeatures(Features, IntFeatures);
  *FloatFeatures = Features;

  return NumFeatures;
}                                /* GetAdaptiveFeatures */
UNICHAR_ID * tesseract::Classify::GetAmbiguities ( TBLOB Blob,
CLASS_ID  CorrectClass 
)

This routine matches blob to the built-in templates to find out if there are any classes other than the correct class which are potential ambiguities.

Parameters:
Blobblob to get classification ambiguities for
CorrectClasscorrect class for Blob

Globals:

  • CurrentRatings used by qsort compare routine
  • PreTrainedTemplates built-in templates
Returns:
String containing all possible ambiguous classes.
Note:
Exceptions: none
History: Fri Mar 15 08:08:22 1991, DSJ, Created.

Definition at line 1639 of file adaptmatch.cpp.

                                                            {
  ADAPT_RESULTS *Results = new ADAPT_RESULTS();
  UNICHAR_ID *Ambiguities;
  int i;

  Results->Initialize();
  INT_FX_RESULT_STRUCT fx_info;
  GenericVector<INT_FEATURE_STRUCT> bl_features;
  TrainingSample* sample =
      BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info,
                           &bl_features);
  if (sample == NULL) {
    delete Results;
    return NULL;
  }

  CharNormClassifier(Blob, *sample, Results);
  delete sample;
  RemoveBadMatches(Results);
  Results->match.sort(&UnicharRating::SortDescendingRating);

  /* copy the class id's into an string of ambiguities - don't copy if
     the correct class is the only class id matched */
  Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
  if (Results->match.size() > 1 ||
      (Results->match.size() == 1 &&
          Results->match[0].unichar_id != CorrectClass)) {
    for (i = 0; i < Results->match.size(); i++)
      Ambiguities[i] = Results->match[i].unichar_id;
    Ambiguities[i] = -1;
  } else {
    Ambiguities[0] = -1;
  }

  delete Results;
  return Ambiguities;
}                              /* GetAmbiguities */
int tesseract::Classify::GetCharNormFeature ( const INT_FX_RESULT_STRUCT fx_info,
INT_TEMPLATES  templates,
uinT8 pruner_norm_array,
uinT8 char_norm_array 
)

This routine calls the integer (Hardware) feature extractor if it has not been called before for this blob.

The results from the feature extractor are placed into globals so that they can be used in other routines without re-extracting the features.

It then copies the char norm features into the IntFeatures array provided by the caller.

Parameters:
templatesused to compute char norm adjustments
pruner_norm_arrayArray of factors from blob normalization process
char_norm_arrayarray to fill with dummy char norm adjustments
fx_infoGlobals:
Returns:
Number of features extracted or 0 if an error occurred.
Note:
Exceptions: none
History: Tue May 28 10:40:52 1991, DSJ, Created.

Definition at line 1727 of file adaptmatch.cpp.

                                                         {
  FEATURE norm_feature = NewFeature(&CharNormDesc);
  float baseline = kBlnBaselineOffset;
  float scale = MF_SCALE_FACTOR;
  norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
  norm_feature->Params[CharNormLength] =
      fx_info.Length * scale / LENGTH_COMPRESSION;
  norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
  norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
  // Deletes norm_feature.
  ComputeCharNormArrays(norm_feature, templates, char_norm_array,
                        pruner_norm_array);
  return IntCastRounded(fx_info.Length / kStandardFeatureLength);
}                              /* GetCharNormFeature */
CLASS_ID tesseract::Classify::GetClassToDebug ( const char *  Prompt,
bool *  adaptive_on,
bool *  pretrained_on,
int *  shape_id 
)

This routine prompts the user with Prompt and waits for the user to enter something in the debug window.

Parameters:
Promptprompt to print while waiting for input from window
adaptive_on
pretrained_on
shape_id
Returns:
Character entered in the debug window.
Note:
Globals: none
Exceptions: none
History: Thu Mar 21 16:55:13 1991, DSJ, Created.

Definition at line 1405 of file intproto.cpp.

                                                                       {
  tprintf("%s\n", Prompt);
  SVEvent* ev;
  SVEventType ev_type;
  int unichar_id = INVALID_UNICHAR_ID;
  // Wait until a click or popup event.
  do {
    ev = IntMatchWindow->AwaitEvent(SVET_ANY);
    ev_type = ev->type;
    if (ev_type == SVET_POPUP) {
      if (ev->command_id == IDA_SHAPE_INDEX) {
        if (shape_table_ != NULL) {
          *shape_id = atoi(ev->parameter);
          *adaptive_on = false;
          *pretrained_on = true;
          if (*shape_id >= 0 && *shape_id < shape_table_->NumShapes()) {
            int font_id;
            shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id,
                                                 &font_id);
            tprintf("Shape %d, first unichar=%d, font=%d\n",
                    *shape_id, unichar_id, font_id);
            return unichar_id;
          }
          tprintf("Shape index '%s' not found in shape table\n", ev->parameter);
        } else {
          tprintf("No shape table loaded!\n");
        }
      } else {
        if (unicharset.contains_unichar(ev->parameter)) {
          unichar_id = unicharset.unichar_to_id(ev->parameter);
          if (ev->command_id == IDA_ADAPTIVE) {
            *adaptive_on = true;
            *pretrained_on = false;
            *shape_id = -1;
          } else if (ev->command_id == IDA_STATIC) {
            *adaptive_on = false;
            *pretrained_on = true;
          } else {
            *adaptive_on = true;
            *pretrained_on = true;
          }
          if (ev->command_id == IDA_ADAPTIVE || shape_table_ == NULL) {
            *shape_id = -1;
            return unichar_id;
          }
          for (int s = 0; s < shape_table_->NumShapes(); ++s) {
            if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
              tprintf("%s\n", shape_table_->DebugStr(s).string());
            }
          }
        } else {
          tprintf("Char class '%s' not found in unicharset",
                  ev->parameter);
        }
      }
    }
    delete ev;
  } while (ev_type != SVET_CLICK);
  return 0;
}                                /* GetClassToDebug */
Dict& tesseract::Classify::getDict ( ) [inline]

Definition at line 65 of file classify.h.

                  {
    return dict_;
  }
int tesseract::Classify::GetFontinfoId ( ADAPT_CLASS  Class,
uinT8  ConfigId 
)

Definition at line 190 of file adaptive.cpp.

                                                             {
  return (ConfigIsPermanent(Class, ConfigId) ?
      PermConfigFor(Class, ConfigId)->FontinfoId :
      TempConfigFor(Class, ConfigId)->FontinfoId);
}
void tesseract::Classify::InitAdaptedClass ( TBLOB Blob,
CLASS_ID  ClassId,
int  FontinfoId,
ADAPT_CLASS  Class,
ADAPT_TEMPLATES  Templates 
)

This routine creates a new adapted class and uses Blob as the model for the first config in that class.

Parameters:
Blobblob to model new class after
ClassIdid of the class to be initialized
FontinfoIdfont information inferred from pre-trained templates
Classadapted class to be initialized
Templatesadapted templates to add new class to

Globals:

Note:
Exceptions: none
History: Thu Mar 14 12:49:39 1991, DSJ, Created.

Definition at line 717 of file adaptmatch.cpp.

                                                           {
  FEATURE_SET Features;
  int Fid, Pid;
  FEATURE Feature;
  int NumFeatures;
  TEMP_PROTO TempProto;
  PROTO Proto;
  INT_CLASS IClass;
  TEMP_CONFIG Config;

  classify_norm_method.set_value(baseline);
  Features = ExtractOutlineFeatures(Blob);
  NumFeatures = Features->NumFeatures;
  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
    FreeFeatureSet(Features);
    return;
  }

  Config = NewTempConfig(NumFeatures - 1, FontinfoId);
  TempConfigFor(Class, 0) = Config;

  /* this is a kludge to construct cutoffs for adapted templates */
  if (Templates == AdaptedTemplates)
    BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];

  IClass = ClassForClassId (Templates->Templates, ClassId);

  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
    Pid = AddIntProto (IClass);
    assert (Pid != NO_PROTO);

    Feature = Features->Features[Fid];
    TempProto = NewTempProto ();
    Proto = &(TempProto->Proto);

    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
       ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
       instead of the -0.25 to 0.75 used in baseline normalization */
    Proto->Angle = Feature->Params[OutlineFeatDir];
    Proto->X = Feature->Params[OutlineFeatX];
    Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
    Proto->Length = Feature->Params[OutlineFeatLength];
    FillABC(Proto);

    TempProto->ProtoId = Pid;
    SET_BIT (Config->Protos, Pid);

    ConvertProto(Proto, Pid, IClass);
    AddProtoToProtoPruner(Proto, Pid, IClass,
                          classify_learning_debug_level >= 2);

    Class->TempProtos = push (Class->TempProtos, TempProto);
  }
  FreeFeatureSet(Features);

  AddIntConfig(IClass);
  ConvertConfig (AllProtosOn, 0, IClass);

  if (classify_learning_debug_level >= 1) {
    tprintf("Added new class '%s' with class id %d and %d protos.\n",
            unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
    if (classify_learning_debug_level > 1)
      DisplayAdaptedChar(Blob, IClass);
  }

  if (IsEmptyAdaptedClass(Class))
    (Templates->NumNonEmptyClasses)++;
}                                /* InitAdaptedClass */
void tesseract::Classify::InitAdaptiveClassifier ( bool  load_pre_trained_templates)

This routine reads in the training information needed by the adaptive classifier and saves it into global variables. Parameters: load_pre_trained_templates Indicates whether the pre-trained templates (inttemp, normproto and pffmtable components) should be lodaded. Should only be set to true if the necessary classifier components are present in the [lang].traineddata file. Globals: BuiltInTemplatesFile file to get built-in temps from BuiltInCutoffsFile file to get avg. feat per class from classify_use_pre_adapted_templates enables use of pre-adapted templates

Note:
History: Mon Mar 11 12:49:34 1991, DSJ, Created.

Definition at line 527 of file adaptmatch.cpp.

                                                                     {
  if (!classify_enable_adaptive_matcher)
    return;
  if (AllProtosOn != NULL)
    EndAdaptiveClassifier();  // Don't leak with multiple inits.

  // If there is no language_data_path_prefix, the classifier will be
  // adaptive only.
  if (language_data_path_prefix.length() > 0 &&
      load_pre_trained_templates) {
    ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_INTTEMP));
    PreTrainedTemplates =
      ReadIntTemplates(tessdata_manager.GetDataFilePtr());
    if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded inttemp\n");

    if (tessdata_manager.SeekToStart(TESSDATA_SHAPE_TABLE)) {
      shape_table_ = new ShapeTable(unicharset);
      if (!shape_table_->DeSerialize(tessdata_manager.swap(),
                                     tessdata_manager.GetDataFilePtr())) {
        tprintf("Error loading shape table!\n");
        delete shape_table_;
        shape_table_ = NULL;
      } else if (tessdata_manager.DebugLevel() > 0) {
        tprintf("Successfully loaded shape table!\n");
      }
    }

    ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_PFFMTABLE));
    ReadNewCutoffs(tessdata_manager.GetDataFilePtr(),
                   tessdata_manager.swap(),
                   tessdata_manager.GetEndOffset(TESSDATA_PFFMTABLE),
                   CharNormCutoffs);
    if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded pffmtable\n");

    ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_NORMPROTO));
    NormProtos =
      ReadNormProtos(tessdata_manager.GetDataFilePtr(),
                     tessdata_manager.GetEndOffset(TESSDATA_NORMPROTO));
    if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded normproto\n");
    static_classifier_ = new TessClassifier(false, this);
  }

  im_.Init(&classify_debug_level);
  InitIntegerFX();

  AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
  AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
  AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
  TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
  set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
  set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
  zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));

  for (int i = 0; i < MAX_NUM_CLASSES; i++) {
     BaselineCutoffs[i] = 0;
  }

  if (classify_use_pre_adapted_templates) {
    FILE *File;
    STRING Filename;

    Filename = imagefile;
    Filename += ADAPT_TEMPLATE_SUFFIX;
    File = fopen(Filename.string(), "rb");
    if (File == NULL) {
      AdaptedTemplates = NewAdaptedTemplates(true);
    } else {
      cprintf("\nReading pre-adapted templates from %s ...\n",
              Filename.string());
      fflush(stdout);
      AdaptedTemplates = ReadAdaptedTemplates(File);
      cprintf("\n");
      fclose(File);
      PrintAdaptedTemplates(stdout, AdaptedTemplates);

      for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
        BaselineCutoffs[i] = CharNormCutoffs[i];
      }
    }
  } else {
    if (AdaptedTemplates != NULL)
      free_adapted_templates(AdaptedTemplates);
    AdaptedTemplates = NewAdaptedTemplates(true);
  }
}                                /* InitAdaptiveClassifier */
bool tesseract::Classify::LargeSpeckle ( const TBLOB blob)

Definition at line 235 of file classify.cpp.

                                             {
  double speckle_size = kBlnXHeight * speckle_large_max_size;
  TBOX bbox = blob.bounding_box();
  return bbox.width() < speckle_size && bbox.height() < speckle_size;
}
void tesseract::Classify::LearnBlob ( const STRING fontname,
TBLOB Blob,
const DENORM cn_denorm,
const INT_FX_RESULT_STRUCT fx_info,
const char *  blob_text 
)

Definition at line 69 of file blobclass.cpp.

                                                {
  CHAR_DESC CharDesc = NewCharDescription(feature_defs_);
  CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
  CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
  CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
  CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);

  if (ValidCharDescription(feature_defs_, CharDesc)) {
    // Label the features with a class name and font name.
    tr_file_data_ += "\n";
    tr_file_data_ += fontname;
    tr_file_data_ += " ";
    tr_file_data_ += blob_text;
    tr_file_data_ += "\n";

    // write micro-features to file and clean up
    WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
  } else {
    tprintf("Blob learned was invalid!\n");
  }
  FreeCharDescription(CharDesc);
}                                // LearnBlob
void tesseract::Classify::LearnPieces ( const char *  fontname,
int  start,
int  length,
float  threshold,
CharSegmentationType  segmentation,
const char *  correct_text,
WERD_RES word 
)

Definition at line 368 of file adaptmatch.cpp.

                                                                     {
  // TODO(daria) Remove/modify this if/when we want
  // to train and/or adapt to n-grams.
  if (segmentation != CST_WHOLE &&
      (segmentation != CST_FRAGMENT || disable_character_fragments))
    return;

  if (length > 1) {
    SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start,
                     start + length - 1);
  }
  TBLOB* blob = word->chopped_word->blobs[start];
  // Rotate the blob if needed for classification.
  TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded();
  if (rotated_blob == NULL)
    rotated_blob = blob;

  #ifndef GRAPHICS_DISABLED
  // Draw debug windows showing the blob that is being learned if needed.
  if (strcmp(classify_learn_debug_str.string(), correct_text) == 0) {
    RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600,
                       word->chopped_word->bounding_box());
    rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
    learn_debug_win_->Update();
    window_wait(learn_debug_win_);
  }
  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
    ASSERT_HOST(learn_fragments_debug_win_ != NULL);  // set up in LearnWord
    blob->plot(learn_fragments_debug_win_,
               ScrollView::BLUE, ScrollView::BROWN);
    learn_fragments_debug_win_->Update();
  }
  #endif  // GRAPHICS_DISABLED

  if (fontname != NULL) {
    classify_norm_method.set_value(character);  // force char norm spc 30/11/93
    tess_bn_matching.set_value(false);    // turn it off
    tess_cn_matching.set_value(false);
    DENORM bl_denorm, cn_denorm;
    INT_FX_RESULT_STRUCT fx_info;
    SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm,
                     &bl_denorm, &cn_denorm, &fx_info);
    LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
  } else if (unicharset.contains_unichar(correct_text)) {
    UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
    int font_id = word->fontinfo != NULL
                ? fontinfo_table_.get_id(*word->fontinfo)
                : 0;
    if (classify_learning_debug_level >= 1)
      tprintf("Adapting to char = %s, thr= %g font_id= %d\n",
              unicharset.id_to_unichar(class_id), threshold, font_id);
    // If filename is not NULL we are doing recognition
    // (as opposed to training), so we must have already set word fonts.
    AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
    if (BackupAdaptedTemplates != NULL) {
      // Adapt the backup templates too. They will be used if the primary gets
      // too full.
      AdaptToChar(rotated_blob, class_id, font_id, threshold,
                  BackupAdaptedTemplates);
    }
  } else if (classify_debug_level >= 1) {
    tprintf("Can't adapt to %s not in unicharset\n", correct_text);
  }
  if (rotated_blob != blob) {
    delete rotated_blob;
  }

  SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start,
                    start + length - 1);
}  // LearnPieces.
void tesseract::Classify::LearnWord ( const char *  fontname,
WERD_RES word 
)

Definition at line 244 of file adaptmatch.cpp.

                                                             {
  int word_len = word->correct_text.size();
  if (word_len == 0) return;

  float* thresholds = NULL;
  if (fontname == NULL) {
    // Adaption mode.
    if (!EnableLearning || word->best_choice == NULL)
      return;  // Can't or won't adapt.

    if (classify_learning_debug_level >= 1)
      tprintf("\n\nAdapting to word = %s\n",
              word->best_choice->debug_string().string());
    thresholds = new float[word_len];
    word->ComputeAdaptionThresholds(certainty_scale,
                                    matcher_perfect_threshold,
                                    matcher_good_threshold,
                                    matcher_rating_margin, thresholds);
  }
  int start_blob = 0;

  #ifndef GRAPHICS_DISABLED
  if (classify_debug_character_fragments) {
    if (learn_fragmented_word_debug_win_ != NULL) {
      window_wait(learn_fragmented_word_debug_win_);
    }
    RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
                       word->chopped_word->bounding_box());
    RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
                       word->chopped_word->bounding_box());
    word->chopped_word->plot(learn_fragmented_word_debug_win_);
    ScrollView::Update();
  }
  #endif  // GRAPHICS_DISABLED

  for (int ch = 0; ch < word_len; ++ch) {
    if (classify_debug_character_fragments) {
      tprintf("\nLearning %s\n",  word->correct_text[ch].string());
    }
    if (word->correct_text[ch].length() > 0) {
      float threshold = thresholds != NULL ? thresholds[ch] : 0.0f;

      LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
                  CST_WHOLE, word->correct_text[ch].string(), word);

      if (word->best_state[ch] > 1 && !disable_character_fragments) {
        // Check that the character breaks into meaningful fragments
        // that each match a whole character with at least
        // classify_character_fragments_garbage_certainty_threshold
        bool garbage = false;
        int frag;
        for (frag = 0; frag < word->best_state[ch]; ++frag) {
          TBLOB* frag_blob = word->chopped_word->blobs[start_blob + frag];
          if (classify_character_fragments_garbage_certainty_threshold < 0) {
            garbage |= LooksLikeGarbage(frag_blob);
          }
        }
        // Learn the fragments.
        if (!garbage) {
          bool pieces_all_natural = word->PiecesAllNatural(start_blob,
              word->best_state[ch]);
          if (pieces_all_natural || !prioritize_division) {
            for (frag = 0; frag < word->best_state[ch]; ++frag) {
              GenericVector<STRING> tokens;
              word->correct_text[ch].split(' ', &tokens);

              tokens[0] = CHAR_FRAGMENT::to_string(
                  tokens[0].string(), frag, word->best_state[ch],
                  pieces_all_natural);

              STRING full_string;
              for (int i = 0; i < tokens.size(); i++) {
                full_string += tokens[i];
                if (i != tokens.size() - 1)
                  full_string += ' ';
              }
              LearnPieces(fontname, start_blob + frag, 1, threshold,
                          CST_FRAGMENT, full_string.string(), word);
            }
          }
        }
      }

      // TODO(rays): re-enable this part of the code when we switch to the
      // new classifier that needs to see examples of garbage.
      /*
      if (word->best_state[ch] > 1) {
        // If the next blob is good, make junk with the rightmost fragment.
        if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
          LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
                      word->best_state[ch + 1] + 1,
                      threshold, CST_IMPROPER, INVALID_UNICHAR, word);
        }
        // If the previous blob is good, make junk with the leftmost fragment.
        if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
          LearnPieces(fontname, start_blob - word->best_state[ch - 1],
                      word->best_state[ch - 1] + 1,
                      threshold, CST_IMPROPER, INVALID_UNICHAR, word);
        }
      }
      // If the next blob is good, make a join with it.
      if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
        STRING joined_text = word->correct_text[ch];
        joined_text += word->correct_text[ch + 1];
        LearnPieces(fontname, start_blob,
                    word->best_state[ch] + word->best_state[ch + 1],
                    threshold, CST_NGRAM, joined_text.string(), word);
      }
      */
    }
    start_blob += word->best_state[ch];
  }
  delete [] thresholds;
}  // LearnWord.
bool tesseract::Classify::LooksLikeGarbage ( TBLOB blob)

Definition at line 1680 of file adaptmatch.cpp.

                                           {
  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST();
  AdaptiveClassifier(blob, ratings);
  BLOB_CHOICE_IT ratings_it(ratings);
  const UNICHARSET &unicharset = getDict().getUnicharset();
  if (classify_debug_character_fragments) {
    print_ratings_list("======================\nLooksLikeGarbage() got ",
                       ratings, unicharset);
  }
  for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list();
       ratings_it.forward()) {
    if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != NULL) {
      continue;
    }
    float certainty = ratings_it.data()->certainty();
    delete ratings;
    return certainty <
            classify_character_fragments_garbage_certainty_threshold;
  }
  delete ratings;
  return true;  // no whole characters in ratings
}
int tesseract::Classify::MakeNewTemporaryConfig ( ADAPT_TEMPLATES  Templates,
CLASS_ID  ClassId,
int  FontinfoId,
int  NumFeatures,
INT_FEATURE_ARRAY  Features,
FEATURE_SET  FloatFeatures 
)
Parameters:
Templatesadapted templates to add new config to
ClassIdclass id to associate with new config
FontinfoIdfont information inferred from pre-trained templates
NumFeaturesnumber of features in IntFeatures
Featuresfeatures describing model for new config
FloatFeaturesfloating-pt representation of features
Returns:
The id of the new config created, a negative integer in case of error.
Note:
Exceptions: none
History: Fri Mar 15 08:49:46 1991, DSJ, Created.

Definition at line 1791 of file adaptmatch.cpp.

                                                      {
  INT_CLASS IClass;
  ADAPT_CLASS Class;
  PROTO_ID OldProtos[MAX_NUM_PROTOS];
  FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
  int NumOldProtos;
  int NumBadFeatures;
  int MaxProtoId, OldMaxProtoId;
  int BlobLength = 0;
  int MaskSize;
  int ConfigId;
  TEMP_CONFIG Config;
  int i;
  int debug_level = NO_DEBUG;

  if (classify_learning_debug_level >= 3)
    debug_level =
        PRINT_MATCH_SUMMARY | PRINT_FEATURE_MATCHES | PRINT_PROTO_MATCHES;

  IClass = ClassForClassId(Templates->Templates, ClassId);
  Class = Templates->Class[ClassId];

  if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
    ++NumAdaptationsFailed;
    if (classify_learning_debug_level >= 1)
      cprintf("Cannot make new temporary config: maximum number exceeded.\n");
    return -1;
  }

  OldMaxProtoId = IClass->NumProtos - 1;

  NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff,
                                    BlobLength, NumFeatures, Features,
                                    OldProtos, classify_adapt_proto_threshold,
                                    debug_level);

  MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
  zero_all_bits(TempProtoMask, MaskSize);
  for (i = 0; i < NumOldProtos; i++)
    SET_BIT(TempProtoMask, OldProtos[i]);

  NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn,
                                       BlobLength, NumFeatures, Features,
                                       BadFeatures,
                                       classify_adapt_feature_threshold,
                                       debug_level);

  MaxProtoId = MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures,
                                 IClass, Class, TempProtoMask);
  if (MaxProtoId == NO_PROTO) {
    ++NumAdaptationsFailed;
    if (classify_learning_debug_level >= 1)
      cprintf("Cannot make new temp protos: maximum number exceeded.\n");
    return -1;
  }

  ConfigId = AddIntConfig(IClass);
  ConvertConfig(TempProtoMask, ConfigId, IClass);
  Config = NewTempConfig(MaxProtoId, FontinfoId);
  TempConfigFor(Class, ConfigId) = Config;
  copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);

  if (classify_learning_debug_level >= 1)
    cprintf("Making new temp config %d fontinfo id %d"
            " using %d old and %d new protos.\n",
            ConfigId, Config->FontinfoId,
            NumOldProtos, MaxProtoId - OldMaxProtoId);

  return ConfigId;
}                              /* MakeNewTemporaryConfig */
PROTO_ID tesseract::Classify::MakeNewTempProtos ( FEATURE_SET  Features,
int  NumBadFeat,
FEATURE_ID  BadFeat[],
INT_CLASS  IClass,
ADAPT_CLASS  Class,
BIT_VECTOR  TempProtoMask 
)

This routine finds sets of sequential bad features that all have the same angle and converts each set into a new temporary proto. The temp proto is added to the proto pruner for IClass, pushed onto the list of temp protos in Class, and added to TempProtoMask.

Parameters:
Featuresfloating-pt features describing new character
NumBadFeatnumber of bad features to turn into protos
BadFeatfeature id's of bad features
IClassinteger class templates to add new protos to
Classadapted class templates to add new protos to
TempProtoMaskproto mask to add new protos to

Globals: none

Returns:
Max proto id in class after all protos have been added. Exceptions: none History: Fri Mar 15 11:39:38 1991, DSJ, Created.

Definition at line 1888 of file adaptmatch.cpp.

                                                               {
  FEATURE_ID *ProtoStart;
  FEATURE_ID *ProtoEnd;
  FEATURE_ID *LastBad;
  TEMP_PROTO TempProto;
  PROTO Proto;
  FEATURE F1, F2;
  FLOAT32 X1, X2, Y1, Y2;
  FLOAT32 A1, A2, AngleDelta;
  FLOAT32 SegmentLength;
  PROTO_ID Pid;

  for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat;
       ProtoStart < LastBad; ProtoStart = ProtoEnd) {
    F1 = Features->Features[*ProtoStart];
    X1 = F1->Params[PicoFeatX];
    Y1 = F1->Params[PicoFeatY];
    A1 = F1->Params[PicoFeatDir];

    for (ProtoEnd = ProtoStart + 1,
         SegmentLength = GetPicoFeatureLength();
         ProtoEnd < LastBad;
         ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
      F2 = Features->Features[*ProtoEnd];
      X2 = F2->Params[PicoFeatX];
      Y2 = F2->Params[PicoFeatY];
      A2 = F2->Params[PicoFeatDir];

      AngleDelta = fabs(A1 - A2);
      if (AngleDelta > 0.5)
        AngleDelta = 1.0 - AngleDelta;

      if (AngleDelta > matcher_clustering_max_angle_delta ||
          fabs(X1 - X2) > SegmentLength ||
          fabs(Y1 - Y2) > SegmentLength)
        break;
    }

    F2 = Features->Features[*(ProtoEnd - 1)];
    X2 = F2->Params[PicoFeatX];
    Y2 = F2->Params[PicoFeatY];
    A2 = F2->Params[PicoFeatDir];

    Pid = AddIntProto(IClass);
    if (Pid == NO_PROTO)
      return (NO_PROTO);

    TempProto = NewTempProto();
    Proto = &(TempProto->Proto);

    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
       ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
       instead of the -0.25 to 0.75 used in baseline normalization */
    Proto->Length = SegmentLength;
    Proto->Angle = A1;
    Proto->X = (X1 + X2) / 2.0;
    Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
    FillABC(Proto);

    TempProto->ProtoId = Pid;
    SET_BIT(TempProtoMask, Pid);

    ConvertProto(Proto, Pid, IClass);
    AddProtoToProtoPruner(Proto, Pid, IClass,
                          classify_learning_debug_level >= 2);

    Class->TempProtos = push(Class->TempProtos, TempProto);
  }
  return IClass->NumProtos - 1;
}                              /* MakeNewTempProtos */
void tesseract::Classify::MakePermanent ( ADAPT_TEMPLATES  Templates,
CLASS_ID  ClassId,
int  ConfigId,
TBLOB Blob 
)
Parameters:
Templatescurrent set of adaptive templates
ClassIdclass containing config to be made permanent
ConfigIdconfig to be made permanent
Blobcurrent blob being adapted to

Globals: none

Note:
Exceptions: none
History: Thu Mar 14 15:54:08 1991, DSJ, Created.

Definition at line 1977 of file adaptmatch.cpp.

                                          {
  UNICHAR_ID *Ambigs;
  TEMP_CONFIG Config;
  ADAPT_CLASS Class;
  PROTO_KEY ProtoKey;

  Class = Templates->Class[ClassId];
  Config = TempConfigFor(Class, ConfigId);

  MakeConfigPermanent(Class, ConfigId);
  if (Class->NumPermConfigs == 0)
    Templates->NumPermClasses++;
  Class->NumPermConfigs++;

  // Initialize permanent config.
  Ambigs = GetAmbiguities(Blob, ClassId);
  PERM_CONFIG Perm = (PERM_CONFIG) alloc_struct(sizeof(PERM_CONFIG_STRUCT),
                                                "PERM_CONFIG_STRUCT");
  Perm->Ambigs = Ambigs;
  Perm->FontinfoId = Config->FontinfoId;

  // Free memory associated with temporary config (since ADAPTED_CONFIG
  // is a union we need to clean up before we record permanent config).
  ProtoKey.Templates = Templates;
  ProtoKey.ClassId = ClassId;
  ProtoKey.ConfigId = ConfigId;
  Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
  FreeTempConfig(Config);

  // Record permanent config.
  PermConfigFor(Class, ConfigId) = Perm;

  if (classify_learning_debug_level >= 1) {
    tprintf("Making config %d for %s (ClassId %d) permanent:"
            " fontinfo id %d, ambiguities '",
            ConfigId, getDict().getUnicharset().debug_str(ClassId).string(),
            ClassId, PermConfigFor(Class, ConfigId)->FontinfoId);
    for (UNICHAR_ID *AmbigsPointer = Ambigs;
        *AmbigsPointer >= 0; ++AmbigsPointer)
      tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
    tprintf("'.\n");
  }
}                              /* MakePermanent */
void tesseract::Classify::MasterMatcher ( INT_TEMPLATES  templates,
inT16  num_features,
const INT_FEATURE_STRUCT features,
const uinT8 norm_factors,
ADAPT_CLASS classes,
int  debug,
int  matcher_multiplier,
const TBOX blob_box,
const GenericVector< CP_RESULT_STRUCT > &  results,
ADAPT_RESULTS final_results 
)

Factored-out calls to IntegerMatcher based on class pruner results. Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.

Definition at line 1126 of file adaptmatch.cpp.

                                                           {
  int top = blob_box.top();
  int bottom = blob_box.bottom();
  UnicharRating int_result;
  for (int c = 0; c < results.size(); c++) {
    CLASS_ID class_id = results[c].Class;
    BIT_VECTOR protos = classes != NULL ? classes[class_id]->PermProtos
                                        : AllProtosOn;
    BIT_VECTOR configs = classes != NULL ? classes[class_id]->PermConfigs
                                         : AllConfigsOn;

    int_result.unichar_id = class_id;
    im_.Match(ClassForClassId(templates, class_id),
              protos, configs,
              num_features, features,
              &int_result, classify_adapt_feature_threshold, debug,
              matcher_debug_separate_windows);
    bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
    ExpandShapesAndApplyCorrections(classes, debug, class_id, bottom, top,
                                    results[c].Rating,
                                    final_results->BlobLength,
                                    matcher_multiplier, norm_factors,
                                    &int_result, final_results);
  }
}
ADAPT_TEMPLATES tesseract::Classify::NewAdaptedTemplates ( bool  InitFromUnicharset)

Allocates memory for adapted tempates. each char in unicharset to the newly created templates

Parameters:
InitFromUnicharsetif true, add an empty class for
Returns:
Ptr to new adapted templates.
Note:
Globals: none
Exceptions: none
History: Fri Mar 8 10:15:28 1991, DSJ, Created.

Definition at line 167 of file adaptive.cpp.

                                                                     {
  ADAPT_TEMPLATES Templates;
  int i;

  Templates = (ADAPT_TEMPLATES) Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT));

  Templates->Templates = NewIntTemplates ();
  Templates->NumPermClasses = 0;
  Templates->NumNonEmptyClasses = 0;

  /* Insert an empty class for each unichar id in unicharset */
  for (i = 0; i < MAX_NUM_CLASSES; i++) {
    Templates->Class[i] = NULL;
    if (InitFromUnicharset && i < unicharset.size()) {
      AddAdaptedClass(Templates, NewAdaptedClass(), i);
    }
  }

  return (Templates);

}                                /* NewAdaptedTemplates */
void tesseract::Classify::NormalizeOutlines ( LIST  Outlines,
FLOAT32 XScale,
FLOAT32 YScale 
)

This routine normalizes every outline in Outlines according to the currently selected normalization method. It also returns the scale factors that it used to do this scaling. The scale factors returned represent the x and y sizes in the normalized coordinate system that correspond to 1 pixel in the original coordinate system.

Globals:

  • classify_norm_method method being used for normalization
  • classify_char_norm_range map radius of gyration to this value
    Parameters:
    Outlineslist of outlines to be normalized
    XScalex-direction scale factor used by routine
    YScaley-direction scale factor used by routine
    Returns:
    none (Outlines are changed and XScale and YScale are updated)
    Note:
    Exceptions: none
    History: Fri Dec 14 08:14:55 1990, DSJ, Created.

Definition at line 300 of file mfoutline.cpp.

                                                  {
  MFOUTLINE Outline;

  switch (classify_norm_method) {
    case character:
      ASSERT_HOST(!"How did NormalizeOutlines get called in character mode?");
      break;

    case baseline:
      iterate(Outlines) {
        Outline = (MFOUTLINE) first_node(Outlines);
        NormalizeOutline(Outline, 0.0);
      }
      *XScale = *YScale = MF_SCALE_FACTOR;
      break;
  }
}                                /* NormalizeOutlines */
void tesseract::Classify::PrintAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES  Templates 
)

This routine prints a summary of the adapted templates in Templates to File.

Parameters:
Fileopen text file to print Templates to
Templatesadapted templates to print to File
Note:
Globals: none
Exceptions: none
History: Wed Mar 20 13:35:29 1991, DSJ, Created.

Definition at line 273 of file adaptive.cpp.

                                                                          {
  int i;
  INT_CLASS IClass;
  ADAPT_CLASS AClass;

  fprintf (File, "\n\nSUMMARY OF ADAPTED TEMPLATES:\n\n");
  fprintf (File, "Num classes = %d;  Num permanent classes = %d\n\n",
           Templates->NumNonEmptyClasses, Templates->NumPermClasses);
  fprintf (File, "   Id  NC NPC  NP NPP\n");
  fprintf (File, "------------------------\n");

  for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
    IClass = Templates->Templates->Class[i];
    AClass = Templates->Class[i];
    if (!IsEmptyAdaptedClass (AClass)) {
      fprintf (File, "%5d  %s %3d %3d %3d %3d\n",
        i, unicharset.id_to_unichar(i),
      IClass->NumConfigs, AClass->NumPermConfigs,
      IClass->NumProtos,
      IClass->NumProtos - count (AClass->TempProtos));
    }
  }
  fprintf (File, "\n");

}                                /* PrintAdaptedTemplates */
void tesseract::Classify::PrintAdaptiveMatchResults ( const ADAPT_RESULTS results)

This routine writes the matches in Results to File.

Parameters:
resultsmatch results to write to File

Globals: none

Note:
Exceptions: none
History: Mon Mar 18 09:24:53 1991, DSJ, Created.

Definition at line 2076 of file adaptmatch.cpp.

                                                                     {
  for (int i = 0; i < results.match.size(); ++i) {
    tprintf("%s  ", unicharset.debug_str(results.match[i].unichar_id).string());
    results.match[i].Print();
  }
}                              /* PrintAdaptiveMatchResults */
int tesseract::Classify::PruneClasses ( const INT_TEMPLATES_STRUCT int_templates,
int  num_features,
int  keep_this,
const INT_FEATURE_STRUCT features,
const uinT8 normalization_factors,
const uinT16 expected_num_features,
GenericVector< CP_RESULT_STRUCT > *  results 
)

Runs the class pruner from int_templates on the given features, returning the number of classes output in results.

Parameters:
int_templatesClass pruner tables
num_featuresNumber of features in blob
featuresArray of features
normalization_factorsArray of fudge factors from blob normalization process (by CLASS_INDEX)
expected_num_featuresArray of expected number of features for each class (by CLASS_INDEX)
resultsSorted Array of pruned classes. Must be an array of size at least int_templates->NumClasses.
keep_this

Definition at line 409 of file intmatcher.cpp.

                                                                     {
  ClassPruner pruner(int_templates->NumClasses);
  // Compute initial match scores for all classes.
  pruner.ComputeScores(int_templates, num_features, features);
  // Adjust match scores for number of expected features.
  pruner.AdjustForExpectedNumFeatures(expected_num_features,
                                      classify_cp_cutoff_strength);
  // Apply disabled classes in unicharset - only works without a shape_table.
  if (shape_table_ == NULL)
    pruner.DisableDisabledClasses(unicharset);
  // If fragments are disabled, remove them, also only without a shape table.
  if (disable_character_fragments && shape_table_ == NULL)
    pruner.DisableFragments(unicharset);

  // If we have good x-heights, apply the given normalization factors.
  if (normalization_factors != NULL) {
    pruner.NormalizeForXheight(classify_class_pruner_multiplier,
                               normalization_factors);
  } else {
    pruner.NoNormalization();
  }
  // Do the actual pruning and sort the short-list.
  pruner.PruneAndSort(classify_class_pruner_threshold, keep_this,
                      shape_table_ == NULL, unicharset);

  if (classify_debug_level > 2) {
    pruner.DebugMatch(*this, int_templates, features);
  }
  if (classify_debug_level > 1) {
    pruner.SummarizeResult(*this, int_templates, expected_num_features,
                           classify_class_pruner_multiplier,
                           normalization_factors);
  }
  // Convert to the expected output format.
  return pruner.SetupResults(results);
}
ADAPT_TEMPLATES tesseract::Classify::ReadAdaptedTemplates ( FILE *  File)

Read a set of adapted templates from File and return a ptr to the templates.

Parameters:
Fileopen text file to read adapted templates from
Returns:
Ptr to adapted templates read from File.
Note:
Globals: none
Exceptions: none
History: Mon Mar 18 15:18:10 1991, DSJ, Created.

Definition at line 369 of file adaptive.cpp.

                                                         {
  int i;
  ADAPT_TEMPLATES Templates;

  /* first read the high level adaptive template struct */
  Templates = (ADAPT_TEMPLATES) Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT));
  fread ((char *) Templates, sizeof (ADAPT_TEMPLATES_STRUCT), 1, File);

  /* then read in the basic integer templates */
  Templates->Templates = ReadIntTemplates (File);

  /* then read in the adaptive info for each class */
  for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
    Templates->Class[i] = ReadAdaptedClass (File);
  }
  return (Templates);

}                                /* ReadAdaptedTemplates */
INT_TEMPLATES tesseract::Classify::ReadIntTemplates ( FILE *  File)

This routine reads a set of integer templates from File. File must already be open and must be in the correct binary format.

Parameters:
Fileopen file to read templates from
Returns:
Pointer to integer templates read from File.
Note:
Globals: none
Exceptions: none
History: Wed Feb 27 11:48:46 1991, DSJ, Created.

Definition at line 770 of file intproto.cpp.

                                                   {
  int i, j, w, x, y, z;
  BOOL8 swap;
  int nread;
  int unicharset_size;
  int version_id = 0;
  INT_TEMPLATES Templates;
  CLASS_PRUNER_STRUCT* Pruner;
  INT_CLASS Class;
  uinT8 *Lengths;
  PROTO_SET ProtoSet;

  /* variables for conversion from older inttemp formats */
  int b, bit_number, last_cp_bit_number, new_b, new_i, new_w;
  CLASS_ID class_id, max_class_id;
  inT16 *IndexFor = new inT16[MAX_NUM_CLASSES];
  CLASS_ID *ClassIdFor = new CLASS_ID[MAX_NUM_CLASSES];
  CLASS_PRUNER_STRUCT **TempClassPruner =
      new CLASS_PRUNER_STRUCT*[MAX_NUM_CLASS_PRUNERS];
  uinT32 SetBitsForMask =           // word with NUM_BITS_PER_CLASS
    (1 << NUM_BITS_PER_CLASS) - 1;  // set starting at bit 0
  uinT32 Mask, NewMask, ClassBits;
  int MaxNumConfigs = MAX_NUM_CONFIGS;
  int WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;

  /* first read the high level template struct */
  Templates = NewIntTemplates();
  // Read Templates in parts for 64 bit compatibility.
  if (fread(&unicharset_size, sizeof(int), 1, File) != 1)
    cprintf("Bad read of inttemp!\n");
  if (fread(&Templates->NumClasses,
            sizeof(Templates->NumClasses), 1, File) != 1 ||
      fread(&Templates->NumClassPruners,
            sizeof(Templates->NumClassPruners), 1, File) != 1)
    cprintf("Bad read of inttemp!\n");
  // Swap status is determined automatically.
  swap = Templates->NumClassPruners < 0 ||
    Templates->NumClassPruners > MAX_NUM_CLASS_PRUNERS;
  if (swap) {
    Reverse32(&Templates->NumClassPruners);
    Reverse32(&Templates->NumClasses);
    Reverse32(&unicharset_size);
  }
  if (Templates->NumClasses < 0) {
    // This file has a version id!
    version_id = -Templates->NumClasses;
    if (fread(&Templates->NumClasses, sizeof(Templates->NumClasses),
              1, File) != 1)
      cprintf("Bad read of inttemp!\n");
    if (swap)
      Reverse32(&Templates->NumClasses);
  }

  if (version_id < 3) {
    MaxNumConfigs = OLD_MAX_NUM_CONFIGS;
    WerdsPerConfigVec = OLD_WERDS_PER_CONFIG_VEC;
  }

  if (version_id < 2) {
    for (i = 0; i < unicharset_size; ++i) {
      if (fread(&IndexFor[i], sizeof(inT16), 1, File) != 1)
        cprintf("Bad read of inttemp!\n");
    }
    for (i = 0; i < Templates->NumClasses; ++i) {
      if (fread(&ClassIdFor[i], sizeof(CLASS_ID), 1, File) != 1)
        cprintf("Bad read of inttemp!\n");
    }
    if (swap) {
      for (i = 0; i < Templates->NumClasses; i++)
        Reverse16(&IndexFor[i]);
      for (i = 0; i < Templates->NumClasses; i++)
        Reverse32(&ClassIdFor[i]);
    }
  }

  /* then read in the class pruners */
  for (i = 0; i < Templates->NumClassPruners; i++) {
    Pruner = new CLASS_PRUNER_STRUCT;
    if ((nread =
         fread(Pruner, 1, sizeof(CLASS_PRUNER_STRUCT),
                File)) != sizeof(CLASS_PRUNER_STRUCT))
      cprintf("Bad read of inttemp!\n");
    if (swap) {
      for (x = 0; x < NUM_CP_BUCKETS; x++) {
        for (y = 0; y < NUM_CP_BUCKETS; y++) {
          for (z = 0; z < NUM_CP_BUCKETS; z++) {
            for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
              Reverse32(&Pruner->p[x][y][z][w]);
            }
          }
        }
      }
    }
    if (version_id < 2) {
      TempClassPruner[i] = Pruner;
    } else {
      Templates->ClassPruners[i] = Pruner;
    }
  }

  /* fix class pruners if they came from an old version of inttemp */
  if (version_id < 2) {
    // Allocate enough class pruners to cover all the class ids.
    max_class_id = 0;
    for (i = 0; i < Templates->NumClasses; i++)
      if (ClassIdFor[i] > max_class_id)
        max_class_id = ClassIdFor[i];
    for (i = 0; i <= CPrunerIdFor(max_class_id); i++) {
      Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
      memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
    }
    // Convert class pruners from the old format (indexed by class index)
    // to the new format (indexed by class id).
    last_cp_bit_number = NUM_BITS_PER_CLASS * Templates->NumClasses - 1;
    for (i = 0; i < Templates->NumClassPruners; i++) {
      for (x = 0; x < NUM_CP_BUCKETS; x++)
        for (y = 0; y < NUM_CP_BUCKETS; y++)
          for (z = 0; z < NUM_CP_BUCKETS; z++)
            for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
              if (TempClassPruner[i]->p[x][y][z][w] == 0)
                continue;
              for (b = 0; b < BITS_PER_WERD; b += NUM_BITS_PER_CLASS) {
                bit_number = i * BITS_PER_CP_VECTOR + w * BITS_PER_WERD + b;
                if (bit_number > last_cp_bit_number)
                  break; // the rest of the bits in this word are not used
                class_id = ClassIdFor[bit_number / NUM_BITS_PER_CLASS];
                // Single out NUM_BITS_PER_CLASS bits relating to class_id.
                Mask = SetBitsForMask << b;
                ClassBits = TempClassPruner[i]->p[x][y][z][w] & Mask;
                // Move these bits to the new position in which they should
                // appear (indexed corresponding to the class_id).
                new_i = CPrunerIdFor(class_id);
                new_w = CPrunerWordIndexFor(class_id);
                new_b = CPrunerBitIndexFor(class_id) * NUM_BITS_PER_CLASS;
                if (new_b > b) {
                  ClassBits <<= (new_b - b);
                } else {
                  ClassBits >>= (b - new_b);
                }
                // Copy bits relating to class_id to the correct position
                // in Templates->ClassPruner.
                NewMask = SetBitsForMask << new_b;
                Templates->ClassPruners[new_i]->p[x][y][z][new_w] &= ~NewMask;
                Templates->ClassPruners[new_i]->p[x][y][z][new_w] |= ClassBits;
              }
            }
    }
    for (i = 0; i < Templates->NumClassPruners; i++) {
      delete TempClassPruner[i];
    }
  }

  /* then read in each class */
  for (i = 0; i < Templates->NumClasses; i++) {
    /* first read in the high level struct for the class */
    Class = (INT_CLASS) Emalloc (sizeof (INT_CLASS_STRUCT));
    if (fread(&Class->NumProtos, sizeof(Class->NumProtos), 1, File) != 1 ||
        fread(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File) != 1 ||
        fread(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File) != 1)
      cprintf ("Bad read of inttemp!\n");
    if (version_id == 0) {
      // Only version 0 writes 5 pointless pointers to the file.
      for (j = 0; j < 5; ++j) {
        int junk;
        if (fread(&junk, sizeof(junk), 1, File) != 1)
          cprintf ("Bad read of inttemp!\n");
      }
    }
    if (version_id < 4) {
      for (j = 0; j < MaxNumConfigs; ++j) {
        if (fread(&Class->ConfigLengths[j], sizeof(uinT16), 1, File) != 1)
          cprintf ("Bad read of inttemp!\n");
      }
      if (swap) {
        Reverse16(&Class->NumProtos);
        for (j = 0; j < MaxNumConfigs; j++)
          Reverse16(&Class->ConfigLengths[j]);
      }
    } else {
      ASSERT_HOST(Class->NumConfigs < MaxNumConfigs);
      for (j = 0; j < Class->NumConfigs; ++j) {
        if (fread(&Class->ConfigLengths[j], sizeof(uinT16), 1, File) != 1)
          cprintf ("Bad read of inttemp!\n");
      }
      if (swap) {
        Reverse16(&Class->NumProtos);
        for (j = 0; j < MaxNumConfigs; j++)
          Reverse16(&Class->ConfigLengths[j]);
      }
    }
    if (version_id < 2) {
      ClassForClassId (Templates, ClassIdFor[i]) = Class;
    } else {
      ClassForClassId (Templates, i) = Class;
    }

    /* then read in the proto lengths */
    Lengths = NULL;
    if (MaxNumIntProtosIn (Class) > 0) {
      Lengths = (uinT8 *)Emalloc(sizeof(uinT8) * MaxNumIntProtosIn(Class));
      if ((nread =
           fread((char *)Lengths, sizeof(uinT8),
                 MaxNumIntProtosIn(Class), File)) != MaxNumIntProtosIn (Class))
        cprintf ("Bad read of inttemp!\n");
    }
    Class->ProtoLengths = Lengths;

    /* then read in the proto sets */
    for (j = 0; j < Class->NumProtoSets; j++) {
      ProtoSet = (PROTO_SET)Emalloc(sizeof(PROTO_SET_STRUCT));
      if (version_id < 3) {
        if ((nread =
             fread((char *) &ProtoSet->ProtoPruner, 1,
                    sizeof(PROTO_PRUNER), File)) != sizeof(PROTO_PRUNER))
          cprintf("Bad read of inttemp!\n");
        for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
          if ((nread = fread((char *) &ProtoSet->Protos[x].A, 1,
                             sizeof(inT8), File)) != sizeof(inT8) ||
              (nread = fread((char *) &ProtoSet->Protos[x].B, 1,
                             sizeof(uinT8), File)) != sizeof(uinT8) ||
              (nread = fread((char *) &ProtoSet->Protos[x].C, 1,
                             sizeof(inT8), File)) != sizeof(inT8) ||
              (nread = fread((char *) &ProtoSet->Protos[x].Angle, 1,
                             sizeof(uinT8), File)) != sizeof(uinT8))
            cprintf("Bad read of inttemp!\n");
          for (y = 0; y < WerdsPerConfigVec; y++)
            if ((nread = fread((char *) &ProtoSet->Protos[x].Configs[y], 1,
                               sizeof(uinT32), File)) != sizeof(uinT32))
              cprintf("Bad read of inttemp!\n");
        }
      } else {
        if ((nread =
             fread((char *) ProtoSet, 1, sizeof(PROTO_SET_STRUCT),
                   File)) != sizeof(PROTO_SET_STRUCT))
          cprintf("Bad read of inttemp!\n");
      }
      if (swap) {
        for (x = 0; x < NUM_PP_PARAMS; x++)
          for (y = 0; y < NUM_PP_BUCKETS; y++)
            for (z = 0; z < WERDS_PER_PP_VECTOR; z++)
              Reverse32(&ProtoSet->ProtoPruner[x][y][z]);
        for (x = 0; x < PROTOS_PER_PROTO_SET; x++)
          for (y = 0; y < WerdsPerConfigVec; y++)
            Reverse32(&ProtoSet->Protos[x].Configs[y]);
      }
      Class->ProtoSets[j] = ProtoSet;
    }
    if (version_id < 4)
      Class->font_set_id = -1;
    else {
      fread(&Class->font_set_id, sizeof(int), 1, File);
      if (swap)
        Reverse32(&Class->font_set_id);
    }
  }

  if (version_id < 2) {
    /* add an empty NULL class with class id 0 */
    assert(UnusedClassIdIn (Templates, 0));
    ClassForClassId (Templates, 0) = NewIntClass (1, 1);
    ClassForClassId (Templates, 0)->font_set_id = -1;
    Templates->NumClasses++;
    /* make sure the classes are contiguous */
    for (i = 0; i < MAX_NUM_CLASSES; i++) {
      if (i < Templates->NumClasses) {
        if (ClassForClassId (Templates, i) == NULL) {
          fprintf(stderr, "Non-contiguous class ids in inttemp\n");
          exit(1);
        }
      } else {
        if (ClassForClassId (Templates, i) != NULL) {
          fprintf(stderr, "Class id %d exceeds NumClassesIn (Templates) %d\n",
                  i, Templates->NumClasses);
          exit(1);
        }
      }
    }
  }
  if (version_id >= 4) {
    this->fontinfo_table_.read(File, NewPermanentTessCallback(read_info), swap);
    if (version_id >= 5) {
      this->fontinfo_table_.read(File,
                                 NewPermanentTessCallback(read_spacing_info),
                                 swap);
    }
    this->fontset_table_.read(File, NewPermanentTessCallback(read_set), swap);
  }

  // Clean up.
  delete[] IndexFor;
  delete[] ClassIdFor;
  delete[] TempClassPruner;

  return (Templates);
}                                /* ReadIntTemplates */
void tesseract::Classify::ReadNewCutoffs ( FILE *  CutoffFile,
bool  swap,
inT64  end_offset,
CLASS_CUTOFF_ARRAY  Cutoffs 
)

Open Filename, read in all of the class-id/cutoff pairs and insert them into the Cutoffs array. Cutoffs are indexed in the array by class id. Unused entries in the array are set to an arbitrarily high cutoff value.

Parameters:
CutoffFilename of file containing cutoff definitions
Cutoffsarray to put cutoffs into
swap
end_offset
Returns:
none
Note:
Globals: none
Exceptions: none
History: Wed Feb 20 09:38:26 1991, DSJ, Created.

Definition at line 52 of file cutoffs.cpp.

                                                          {
  char Class[UNICHAR_LEN + 1];
  CLASS_ID ClassId;
  int Cutoff;
  int i;

  if (shape_table_ != NULL) {
    if (!shapetable_cutoffs_.DeSerialize(swap, CutoffFile)) {
      tprintf("Error during read of shapetable pffmtable!\n");
    }
  }
  for (i = 0; i < MAX_NUM_CLASSES; i++)
    Cutoffs[i] = MAX_CUTOFF;

  while ((end_offset < 0 || ftell(CutoffFile) < end_offset) &&
         tfscanf(CutoffFile, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d",
                Class, &Cutoff) == 2) {
    if (strcmp(Class, "NULL") == 0) {
      ClassId = unicharset.unichar_to_id(" ");
    } else {
      ClassId = unicharset.unichar_to_id(Class);
    }
    Cutoffs[ClassId] = Cutoff;
    SkipNewline(CutoffFile);
  }
}
NORM_PROTOS * tesseract::Classify::ReadNormProtos ( FILE *  File,
inT64  end_offset 
)

This routine allocates a new data structure to hold a set of character normalization protos. It then fills in the data structure by reading from the specified File.

Parameters:
Fileopen text file to read normalization protos from
end_offsetGlobals: none
Returns:
Character normalization protos.
Note:
Exceptions: none
History: Wed Dec 19 16:38:49 1990, DSJ, Created.

Definition at line 245 of file normmatch.cpp.

                                                                  {
  NORM_PROTOS *NormProtos;
  int i;
  char unichar[2 * UNICHAR_LEN + 1];
  UNICHAR_ID unichar_id;
  LIST Protos;
  int NumProtos;

  /* allocate and initialization data structure */
  NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS));
  NormProtos->NumProtos = unicharset.size();
  NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST));
  for (i = 0; i < NormProtos->NumProtos; i++)
    NormProtos->Protos[i] = NIL_LIST;

  /* read file header and save in data structure */
  NormProtos->NumParams = ReadSampleSize (File);
  NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams);

  /* read protos for each class into a separate list */
  while ((end_offset < 0 || ftell(File) < end_offset) &&
         tfscanf(File, "%s %d", unichar, &NumProtos) == 2) {
    if (unicharset.contains_unichar(unichar)) {
      unichar_id = unicharset.unichar_to_id(unichar);
      Protos = NormProtos->Protos[unichar_id];
      for (i = 0; i < NumProtos; i++)
        Protos =
            push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
      NormProtos->Protos[unichar_id] = Protos;
    } else {
      cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
              unichar);
      for (i = 0; i < NumProtos; i++)
        FreePrototype(ReadPrototype (File, NormProtos->NumParams));
    }
    SkipNewline(File);
  }
  return (NormProtos);
}                                /* ReadNormProtos */
void tesseract::Classify::RefreshDebugWindow ( ScrollView **  win,
const char *  msg,
int  y_offset,
const TBOX wbox 
)

Definition at line 220 of file adaptmatch.cpp.

                                                                  {
  #ifndef GRAPHICS_DISABLED
  const int kSampleSpaceWidth = 500;
  if (*win == NULL) {
    *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200,
                          kSampleSpaceWidth * 2, 200, true);
  }
  (*win)->Clear();
  (*win)->Pen(64, 64, 64);
  (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
               kSampleSpaceWidth, kBlnBaselineOffset);
  (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
               kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
  (*win)->ZoomToRectangle(wbox.left(), wbox.top(),
                          wbox.right(), wbox.bottom());
  #endif  // GRAPHICS_DISABLED
}
void tesseract::Classify::RemoveBadMatches ( ADAPT_RESULTS Results)

This routine steps through each matching class in Results and removes it from the match list if its rating is worse than the BestRating plus a pad. In other words, all good matches get moved to the front of the classes array.

Parameters:
Resultscontains matches to be filtered

Globals:

  • matcher_bad_match_pad defines a "bad match"
Note:
Exceptions: none
History: Tue Mar 12 13:51:03 1991, DSJ, Created.

Definition at line 2099 of file adaptmatch.cpp.

                                                      {
  int Next, NextGood;
  FLOAT32 BadMatchThreshold;
  static const char* romans = "i v x I V X";
  BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;

  if (classify_bln_numeric_mode) {
    UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
        unicharset.unichar_to_id("1") : -1;
    UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
        unicharset.unichar_to_id("0") : -1;
    float scored_one = ScoredUnichar(unichar_id_one, *Results);
    float scored_zero = ScoredUnichar(unichar_id_zero, *Results);

    for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
      const UnicharRating& match = Results->match[Next];
      if (match.rating >= BadMatchThreshold) {
        if (!unicharset.get_isalpha(match.unichar_id) ||
            strstr(romans,
                   unicharset.id_to_unichar(match.unichar_id)) != NULL) {
        } else if (unicharset.eq(match.unichar_id, "l") &&
                   scored_one < BadMatchThreshold) {
          Results->match[Next].unichar_id = unichar_id_one;
        } else if (unicharset.eq(match.unichar_id, "O") &&
                   scored_zero < BadMatchThreshold) {
          Results->match[Next].unichar_id = unichar_id_zero;
        } else {
          Results->match[Next].unichar_id = INVALID_UNICHAR_ID;  // Don't copy.
        }
        if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
          if (NextGood == Next) {
            ++NextGood;
          } else {
            Results->match[NextGood++] = Results->match[Next];
          }
        }
      }
    }
  } else {
    for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
      if (Results->match[Next].rating >= BadMatchThreshold) {
        if (NextGood == Next) {
          ++NextGood;
        } else {
          Results->match[NextGood++] = Results->match[Next];
        }
      }
    }
  }
  Results->match.truncate(NextGood);
}                              /* RemoveBadMatches */
void tesseract::Classify::RemoveExtraPuncs ( ADAPT_RESULTS Results)

This routine discards extra digits or punctuation from the results. We keep only the top 2 punctuation answers and the top 1 digit answer if present.

Parameters:
Resultscontains matches to be filtered
Note:
History: Tue Mar 12 13:51:03 1991, DSJ, Created.

Definition at line 2161 of file adaptmatch.cpp.

                                                      {
  int Next, NextGood;
  int punc_count;              /*no of garbage characters */
  int digit_count;
  /*garbage characters */
  static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
  static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";

  punc_count = 0;
  digit_count = 0;
  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
    const UnicharRating& match = Results->match[Next];
    bool keep = true;
    if (strstr(punc_chars,
               unicharset.id_to_unichar(match.unichar_id)) != NULL) {
      if (punc_count >= 2)
        keep = false;
      punc_count++;
    } else {
      if (strstr(digit_chars,
                 unicharset.id_to_unichar(match.unichar_id)) != NULL) {
        if (digit_count >= 1)
          keep = false;
        digit_count++;
      }
    }
    if (keep) {
      if (NextGood == Next) {
        ++NextGood;
      } else {
        Results->match[NextGood++] = match;
      }
    }
  }
  Results->match.truncate(NextGood);
}                              /* RemoveExtraPuncs */
void tesseract::Classify::ResetAdaptiveClassifierInternal ( )

Definition at line 613 of file adaptmatch.cpp.

                                               {
  if (classify_learning_debug_level > 0) {
    tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
            NumAdaptationsFailed);
  }
  free_adapted_templates(AdaptedTemplates);
  AdaptedTemplates = NewAdaptedTemplates(true);
  if (BackupAdaptedTemplates != NULL)
    free_adapted_templates(BackupAdaptedTemplates);
  BackupAdaptedTemplates = NULL;
  NumAdaptationsFailed = 0;
}
void tesseract::Classify::SetAdaptiveThreshold ( FLOAT32  Threshold)

This routine resets the internal thresholds inside the integer matcher to correspond to the specified threshold.

Parameters:
Thresholdthreshold for creating new templates

Globals:

  • matcher_good_threshold default good match rating
Note:
Exceptions: none
History: Tue Apr 9 08:33:13 1991, DSJ, Created.

Definition at line 2212 of file adaptmatch.cpp.

                                                     {
  Threshold = (Threshold == matcher_good_threshold) ? 0.9: (1.0 - Threshold);
  classify_adapt_proto_threshold.set_value(
      ClipToRange<int>(255 * Threshold, 0, 255));
  classify_adapt_feature_threshold.set_value(
      ClipToRange<int>(255 * Threshold, 0, 255));
}                              /* SetAdaptiveThreshold */
void tesseract::Classify::SetStaticClassifier ( ShapeClassifier static_classifier)

Definition at line 204 of file classify.cpp.

                                                                     {
  delete static_classifier_;
  static_classifier_ = static_classifier;
}
void tesseract::Classify::SettupPass1 ( )

This routine prepares the adaptive matcher for the start of the first pass. Learning is enabled (unless it is disabled for the whole program).

Note:
this is somewhat redundant, it simply says that if learning is enabled then it will remain enabled on the first pass. If it is disabled, then it will remain disabled. This is only put here to make it very clear that learning is controlled directly by the global setting of EnableLearning.

Globals:

Note:
Exceptions: none
History: Mon Apr 15 16:39:29 1991, DSJ, Created.

Definition at line 670 of file adaptmatch.cpp.

                           {
  EnableLearning = classify_enable_learning;

  getDict().SettupStopperPass1();

}                                /* SettupPass1 */
void tesseract::Classify::SettupPass2 ( )

This routine prepares the adaptive matcher for the start of the second pass. Further learning is disabled.

Globals:

Note:
Exceptions: none
History: Mon Apr 15 16:39:29 1991, DSJ, Created.

Definition at line 690 of file adaptmatch.cpp.

                           {
  EnableLearning = FALSE;
  getDict().SettupStopperPass2();

}                                /* SettupPass2 */
void tesseract::Classify::SetupBLCNDenorms ( const TBLOB blob,
bool  nonlinear_norm,
DENORM bl_denorm,
DENORM cn_denorm,
INT_FX_RESULT_STRUCT fx_info 
) [static]

Definition at line 133 of file intfx.cpp.

                                                               {
  // Compute 1st and 2nd moments of the original outline.
  FCOORD center, second_moments;
  int length = blob.ComputeMoments(&center, &second_moments);
  if (fx_info != NULL) {
    fx_info->Length = length;
    fx_info->Rx = IntCastRounded(second_moments.y());
    fx_info->Ry = IntCastRounded(second_moments.x());

    fx_info->Xmean = IntCastRounded(center.x());
    fx_info->Ymean = IntCastRounded(center.y());
  }
  // Setup the denorm for Baseline normalization.
  bl_denorm->SetupNormalization(NULL, NULL, &blob.denorm(), center.x(), 128.0f,
                                1.0f, 1.0f, 128.0f, 128.0f);
  // Setup the denorm for character normalization.
  if (nonlinear_norm) {
    GenericVector<GenericVector<int> > x_coords;
    GenericVector<GenericVector<int> > y_coords;
    TBOX box;
    blob.GetPreciseBoundingBox(&box);
    box.pad(1, 1);
    blob.GetEdgeCoords(box, &x_coords, &y_coords);
    cn_denorm->SetupNonLinear(&blob.denorm(), box, MAX_UINT8, MAX_UINT8,
                              0.0f, 0.0f, x_coords, y_coords);
  } else {
    cn_denorm->SetupNormalization(NULL, NULL, &blob.denorm(),
                                  center.x(), center.y(),
                                  51.2f / second_moments.x(),
                                  51.2f / second_moments.y(),
                                  128.0f, 128.0f);
  }
}
const ShapeTable* tesseract::Classify::shape_table ( ) const [inline]

Definition at line 69 of file classify.h.

                                        {
    return shape_table_;
  }
int tesseract::Classify::ShapeIDToClassID ( int  shape_id) const

Definition at line 2296 of file adaptmatch.cpp.

                                                 {
  for (int id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
    int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
    ASSERT_HOST(font_set_id >= 0);
    const FontSet &fs = fontset_table_.get(font_set_id);
    for (int config = 0; config < fs.size; ++config) {
      if (fs.configs[config] == shape_id)
        return id;
    }
  }
  tprintf("Shape %d not found\n", shape_id);
  return -1;
}
void tesseract::Classify::ShowBestMatchFor ( int  shape_id,
const INT_FEATURE_STRUCT features,
int  num_features 
)

This routine displays debug information for the best config of the given shape_id for the given set of features.

Parameters:
shape_idclassifier id to work with
featuresfeatures of the unknown character
num_featuresNumber of features in the features array.
Note:
Exceptions: none
History: Fri Mar 22 08:43:52 1991, DSJ, Created.

Definition at line 2233 of file adaptmatch.cpp.

                                                  {
#ifndef GRAPHICS_DISABLED
  uinT32 config_mask;
  if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
    tprintf("No built-in templates for class/shape %d\n", shape_id);
    return;
  }
  if (num_features <= 0) {
    tprintf("Illegal blob (char norm features)!\n");
    return;
  }
  UnicharRating cn_result;
  classify_norm_method.set_value(character);
  im_.Match(ClassForClassId(PreTrainedTemplates, shape_id),
            AllProtosOn, AllConfigsOn,
            num_features, features, &cn_result,
            classify_adapt_feature_threshold, NO_DEBUG,
            matcher_debug_separate_windows);
  tprintf("\n");
  config_mask = 1 << cn_result.config;

  tprintf("Static Shape ID: %d\n", shape_id);
  ShowMatchDisplay();
  im_.Match(ClassForClassId(PreTrainedTemplates, shape_id),
            AllProtosOn, reinterpret_cast<BIT_VECTOR>(&config_mask),
            num_features, features, &cn_result,
            classify_adapt_feature_threshold,
            matcher_debug_flags,
            matcher_debug_separate_windows);
  UpdateMatchDisplay();
#endif  // GRAPHICS_DISABLED
}                              /* ShowBestMatchFor */
void tesseract::Classify::ShowMatchDisplay ( )

This routine sends the shapes in the global display lists to the match debugger window.

Globals:

  • FeatureShapes display list containing feature matches
  • ProtoShapes display list containing proto matches
    Returns:
    none
    Note:
    Exceptions: none
    History: Thu Mar 21 15:47:33 1991, DSJ, Created.

Definition at line 1079 of file intproto.cpp.

void tesseract::Classify::StartBackupAdaptiveClassifier ( )
void tesseract::Classify::SwitchAdaptiveClassifier ( )

Definition at line 628 of file adaptmatch.cpp.

                                        {
  if (BackupAdaptedTemplates == NULL) {
    ResetAdaptiveClassifierInternal();
    return;
  }
  if (classify_learning_debug_level > 0) {
    tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
            NumAdaptationsFailed);
  }
  free_adapted_templates(AdaptedTemplates);
  AdaptedTemplates = BackupAdaptedTemplates;
  BackupAdaptedTemplates = NULL;
  NumAdaptationsFailed = 0;
}
bool tesseract::Classify::TempConfigReliable ( CLASS_ID  class_id,
const TEMP_CONFIG config 
)

Definition at line 2312 of file adaptmatch.cpp.

                                                             {
  if (classify_learning_debug_level >= 1) {
    tprintf("NumTimesSeen for config of %s is %d\n",
            getDict().getUnicharset().debug_str(class_id).string(),
            config->NumTimesSeen);
  }
  if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping) {
    return true;
  } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
    return false;
  } else if (use_ambigs_for_adaption) {
    // Go through the ambigs vector and see whether we have already seen
    // enough times all the characters represented by the ambigs vector.
    const UnicharIdVector *ambigs =
      getDict().getUnicharAmbigs().AmbigsForAdaption(class_id);
    int ambigs_size = (ambigs == NULL) ? 0 : ambigs->size();
    for (int ambig = 0; ambig < ambigs_size; ++ambig) {
      ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
      assert(ambig_class != NULL);
      if (ambig_class->NumPermConfigs == 0 &&
          ambig_class->MaxNumTimesSeen <
          matcher_min_examples_for_prototyping) {
        if (classify_learning_debug_level >= 1) {
          tprintf("Ambig %s has not been seen enough times,"
                  " not making config for %s permanent\n",
                  getDict().getUnicharset().debug_str(
                      (*ambigs)[ambig]).string(),
                  getDict().getUnicharset().debug_str(class_id).string());
        }
        return false;
      }
    }
  }
  return true;
}
void tesseract::Classify::UpdateAmbigsGroup ( CLASS_ID  class_id,
TBLOB Blob 
)

Definition at line 2349 of file adaptmatch.cpp.

                                                               {
  const UnicharIdVector *ambigs =
    getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id);
  int ambigs_size = (ambigs == NULL) ? 0 : ambigs->size();
  if (classify_learning_debug_level >= 1) {
    tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
            getDict().getUnicharset().debug_str(class_id).string(), class_id);
  }
  for (int ambig = 0; ambig < ambigs_size; ++ambig) {
    CLASS_ID ambig_class_id = (*ambigs)[ambig];
    const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];
    for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
      if (ConfigIsPermanent(ambigs_class, cfg)) continue;
      const TEMP_CONFIG config =
        TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
      if (config != NULL && TempConfigReliable(ambig_class_id, config)) {
        if (classify_learning_debug_level >= 1) {
          tprintf("Making config %d of %s permanent\n", cfg,
                  getDict().getUnicharset().debug_str(
                      ambig_class_id).string());
        }
        MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
      }
    }
  }
}
void tesseract::Classify::WriteAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES  Templates 
)

This routine saves Templates to File in a binary format.

Parameters:
Fileopen text file to write Templates to
Templatesset of adapted templates to write to File
Note:
Globals: none
Exceptions: none
History: Mon Mar 18 15:07:32 1991, DSJ, Created.

Definition at line 505 of file adaptive.cpp.

                                                                          {
  int i;

  /* first write the high level adaptive template struct */
  fwrite ((char *) Templates, sizeof (ADAPT_TEMPLATES_STRUCT), 1, File);

  /* then write out the basic integer templates */
  WriteIntTemplates (File, Templates->Templates, unicharset);

  /* then write out the adaptive info for each class */
  for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
    WriteAdaptedClass (File, Templates->Class[i],
      Templates->Templates->Class[i]->NumConfigs);
  }
}                                /* WriteAdaptedTemplates */
void tesseract::Classify::WriteIntTemplates ( FILE *  File,
INT_TEMPLATES  Templates,
const UNICHARSET target_unicharset 
)

This routine writes Templates to File. The format is an efficient binary format. File must already be open for writing.

Parameters:
Fileopen file to write templates to
Templatestemplates to save into File
target_unicharsetthe UNICHARSET to use
Returns:
none
Note:
Globals: none
Exceptions: none
History: Wed Feb 27 11:48:46 1991, DSJ, Created.

Definition at line 1138 of file intproto.cpp.

                                                                      {
  int i, j;
  INT_CLASS Class;
  int unicharset_size = target_unicharset.size();
  int version_id = -5;  // When negated by the reader -1 becomes +1 etc.

  if (Templates->NumClasses != unicharset_size) {
    cprintf("Warning: executing WriteIntTemplates() with %d classes in"
            " Templates, while target_unicharset size is %d\n",
            Templates->NumClasses, unicharset_size);
  }

  /* first write the high level template struct */
  fwrite(&unicharset_size, sizeof(unicharset_size), 1, File);
  fwrite(&version_id, sizeof(version_id), 1, File);
  fwrite(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners),
         1, File);
  fwrite(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, File);

  /* then write out the class pruners */
  for (i = 0; i < Templates->NumClassPruners; i++)
    fwrite(Templates->ClassPruners[i],
           sizeof(CLASS_PRUNER_STRUCT), 1, File);

  /* then write out each class */
  for (i = 0; i < Templates->NumClasses; i++) {
    Class = Templates->Class[i];

    /* first write out the high level struct for the class */
    fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
    fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
    ASSERT_HOST(Class->NumConfigs == this->fontset_table_.get(Class->font_set_id).size);
    fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
    for (j = 0; j < Class->NumConfigs; ++j) {
      fwrite(&Class->ConfigLengths[j], sizeof(uinT16), 1, File);
    }

    /* then write out the proto lengths */
    if (MaxNumIntProtosIn (Class) > 0) {
      fwrite ((char *) (Class->ProtoLengths), sizeof (uinT8),
              MaxNumIntProtosIn (Class), File);
    }

    /* then write out the proto sets */
    for (j = 0; j < Class->NumProtoSets; j++)
      fwrite ((char *) Class->ProtoSets[j],
              sizeof (PROTO_SET_STRUCT), 1, File);

    /* then write the fonts info */
    fwrite(&Class->font_set_id, sizeof(int), 1, File);
  }

  /* Write the fonts info tables */
  this->fontinfo_table_.write(File, NewPermanentTessCallback(write_info));
  this->fontinfo_table_.write(File,
                              NewPermanentTessCallback(write_spacing_info));
  this->fontset_table_.write(File, NewPermanentTessCallback(write_set));
}                                /* WriteIntTemplates */
bool tesseract::Classify::WriteTRFile ( const STRING filename)

Definition at line 97 of file blobclass.cpp.

                                                 {
  STRING tr_filename = filename + ".tr";
  FILE* fp = Efopen(tr_filename.string(), "wb");
  int len = tr_file_data_.length();
  bool result =
      fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len;
  fclose(fp);
  tr_file_data_.truncate_at(0);
  return result;
}

Member Data Documentation

"Use divisible blobs chopping"

Definition at line 382 of file classify.h.

"Certainty scaling factor"

Definition at line 437 of file classify.h.

"Threshold for good features during adaptive 0-255"

Definition at line 447 of file classify.h.

"Threshold for good protos during adaptive 0-255"

Definition at line 445 of file classify.h.

"Prune poor adapted results this much worse than best result"

Definition at line 441 of file classify.h.

"Threshold at which classify_adapted_pruning_factor starts"

Definition at line 443 of file classify.h.

"Assume the input is numbers [0-9]."

Definition at line 500 of file classify.h.

"Character Normalization Range ..."

Definition at line 396 of file classify.h.

"Exclude fragments that do not match any whole character" " with at least this certainty"

Definition at line 453 of file classify.h.

"Class Pruner Multiplier 0-255: "

Definition at line 465 of file classify.h.

"Class Pruner Threshold 0-255"

Definition at line 463 of file classify.h.

"Class Pruner CutoffStrength: "

Definition at line 467 of file classify.h.

"Bring up graphical debugging windows for fragments training"

Definition at line 455 of file classify.h.

"Classify debug level"

Definition at line 390 of file classify.h.

"Enable match debugger"

Definition at line 414 of file classify.h.

"Enable adaptive classifier"

Definition at line 409 of file classify.h.

"Enable adaptive classifier"

Definition at line 389 of file classify.h.

"Integer Matcher Multiplier 0-255: "

Definition at line 469 of file classify.h.

"Class str to debug learning"

Definition at line 459 of file classify.h.

"Learning Debug Level: "

Definition at line 419 of file classify.h.

"Veto difference between classifier certainties"

Definition at line 404 of file classify.h.

"Max char x-norm scale ..."

Definition at line 398 of file classify.h.

"Max char y-norm scale ..."

Definition at line 400 of file classify.h.

"Veto ratio between classifier ratings"

Definition at line 402 of file classify.h.

"Min char x-norm scale ..."

Definition at line 397 of file classify.h.

"Min char y-norm scale ..."

Definition at line 399 of file classify.h.

"Penalty to apply when a non-alnum is vertically out of " "its expected textline position"

Definition at line 435 of file classify.h.

"Non-linear stroke-density normalization"

Definition at line 416 of file classify.h.

"Normalization Method ..."

Definition at line 394 of file classify.h.

"Save adapted templates to a file"

Definition at line 413 of file classify.h.

"Use pre-adapted classifier templates"

Definition at line 411 of file classify.h.

"Do not include character fragments in the" " results of the classifier"

Definition at line 450 of file classify.h.

Definition at line 484 of file classify.h.

Definition at line 507 of file classify.h.

"Don't adapt to i/I at beginning of word"

Definition at line 498 of file classify.h.

Definition at line 503 of file classify.h.

"Avg. noise blob length: "

Definition at line 425 of file classify.h.

"Bad Match Pad (0-1)"

Definition at line 423 of file classify.h.

"Maximum angle delta for prototype clustering"

Definition at line 432 of file classify.h.

"Matcher Debug Flags"

Definition at line 418 of file classify.h.

"Matcher Debug Level"

Definition at line 417 of file classify.h.

"Use two different windows for debugging the matching: " "One for the protos and one for the features."

Definition at line 458 of file classify.h.

"Good Match (0-1)"

Definition at line 420 of file classify.h.

"Reliable Config Threshold"

Definition at line 428 of file classify.h.

"Perfect Match (0-1)"

Definition at line 422 of file classify.h.

"Min # of permanent classes"

Definition at line 426 of file classify.h.

"New template margin (0-1)"

Definition at line 424 of file classify.h.

"Great Match (0-1)"

Definition at line 421 of file classify.h.

"Enable adaption even if the ambiguities have not been seen"

Definition at line 430 of file classify.h.

"Prioritize blob division over chopping"

Definition at line 387 of file classify.h.

"Rating scaling factor"

Definition at line 436 of file classify.h.

Definition at line 512 of file classify.h.

"Max large speckle size"

Definition at line 501 of file classify.h.

"Penalty to add to worst rating for noise"

Definition at line 503 of file classify.h.

"Baseline Normalized Matching"

Definition at line 408 of file classify.h.

"Character Normalized Matching"

Definition at line 407 of file classify.h.

"Scale factor for features not used"

Definition at line 439 of file classify.h.

"Top choice only from CP"

Definition at line 388 of file classify.h.


The documentation for this class was generated from the following files:
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines