|
tesseract 3.04.01
|
#include "cluster.h"
#include "commandlineflags.h"
#include "featdefs.h"
#include "intproto.h"
#include "oldlist.h"
Go to the source code of this file.
Classes | |
| struct | LABELEDLISTNODE |
| struct | MERGE_CLASS_NODE |
Namespaces | |
| namespace | tesseract |
Typedefs | |
| typedef struct LABELEDLISTNODE * | LABELEDLIST |
| typedef MERGE_CLASS_NODE * | MERGE_CLASS |
Functions | |
| void | ParseArguments (int *argc, char ***argv) |
| ShapeTable * | tesseract::LoadShapeTable (const STRING &file_prefix) |
| void | tesseract::WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table) |
| MasterTrainer * | tesseract::LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix) |
| const char * | GetNextFilename (int argc, const char *const *argv) |
| LABELEDLIST | FindList (LIST List, char *Label) |
| LABELEDLIST | NewLabeledList (const char *Label) |
| void | ReadTrainingSamples (const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples) |
| void | WriteTrainingSamples (const FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory, LIST CharList, const char *program_feature_type) |
| void | FreeTrainingSamples (LIST CharList) |
| void | FreeLabeledList (LABELEDLIST LabeledList) |
| void | FreeLabeledClassList (LIST ClassListList) |
| CLUSTERER * | SetUpForClustering (const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST CharSample, const char *program_feature_type) |
| LIST | RemoveInsignificantProtos (LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N) |
| void | CleanUpUnusedData (LIST ProtoList) |
| void | MergeInsignificantProtos (LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config) |
| MERGE_CLASS | FindClass (LIST List, const char *Label) |
| MERGE_CLASS | NewLabeledClass (const char *Label) |
| CLASS_STRUCT * | SetUpForFloat2Int (const UNICHARSET &unicharset, LIST LabeledClassList) |
| void | Normalize (float *Values) |
| void | FreeNormProtoList (LIST CharList) |
| void | AddToNormProtosList (LIST *NormProtoList, LIST ProtoList, char *CharName) |
| int | NumberOfProtos (LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos) |
| void | allocNormProtos () |
Variables | |
| FEATURE_DEFS_STRUCT | feature_defs |
| CLUSTERCONFIG | Config |
| typedef struct LABELEDLISTNODE * LABELEDLIST |
| typedef MERGE_CLASS_NODE* MERGE_CLASS |
Definition at line 56 of file commontraining.h.
Definition at line 854 of file commontraining.cpp.
{
  // Collect the prototypes under a new labeled list named after CharName,
  // then push that labeled list onto the caller's list of norm protos.
  LABELEDLIST labeled_protos = NewLabeledList(CharName);
  iterate(ProtoList) {
    PROTOTYPE* proto = (PROTOTYPE *) first_node(ProtoList);
    labeled_protos->List = push(labeled_protos->List, proto);
  }
  *NormProtoList = push(*NormProtoList, labeled_protos);
}
| void allocNormProtos | ( | ) |
| void CleanUpUnusedData | ( | LIST | ProtoList | ) |
Definition at line 606 of file commontraining.cpp.
{
  // For every prototype in the list, release the Variance, Magnitude and
  // Weight arrays (when present) and null the pointers afterwards.
  iterate(ProtoList) {
    PROTOTYPE* proto = (PROTOTYPE *) first_node(ProtoList);

    if (proto->Variance.Elliptical != NULL) {
      memfree(proto->Variance.Elliptical);
      proto->Variance.Elliptical = NULL;
    }

    if (proto->Magnitude.Elliptical != NULL) {
      memfree(proto->Magnitude.Elliptical);
      proto->Magnitude.Elliptical = NULL;
    }

    if (proto->Weight.Elliptical != NULL) {
      memfree(proto->Weight.Elliptical);
      proto->Weight.Elliptical = NULL;
    }
  }
}
| MERGE_CLASS FindClass | ( | LIST | List, |
| const char * | Label | ||
| ) |
Definition at line 701 of file commontraining.cpp.
{
  // Linear scan: return the first merge class whose label equals Label,
  // or NULL when the list holds no such class.
  iterate (List) {
    MERGE_CLASS candidate = (MERGE_CLASS) first_node(List);
    if (strcmp(candidate->Label, Label) != 0)
      continue;
    return candidate;
  }
  return NULL;
} /* FindClass */
| LABELEDLIST FindList | ( | LIST | List, |
| char * | Label | ||
| ) |
This routine searches through a list of labeled lists to find a list with the specified label. If a matching labeled list cannot be found, NULL is returned.
| List | list to search |
| Label | label to search for |
Definition at line 331 of file commontraining.cpp.
{
  // Linear scan: return the first labeled list whose label equals Label,
  // or NULL when no entry matches.
  iterate (List) {
    LABELEDLIST candidate = (LABELEDLIST) first_node(List);
    if (strcmp(candidate->Label, Label) != 0)
      continue;
    return candidate;
  }
  return NULL;
} /* FindList */
| void FreeLabeledClassList | ( | LIST | ClassList | ) |
This routine deallocates all of the space allocated to the specified list of training samples.
| ClassList | list of all fonts in document |
Definition at line 741 of file commontraining.cpp.
{
  MERGE_CLASS MergeClass;

  // iterate() walks the list by advancing ClassList itself, so once the loop
  // finishes ClassList no longer refers to the head. Keep the head so that
  // destroy() releases the list cells instead of receiving the end marker.
  LIST nodes = ClassList;
  iterate (ClassList) /* iterate through all of the fonts */
  {
    MergeClass = (MERGE_CLASS) first_node (ClassList);
    free (MergeClass->Label);       // label buffer owned by the node
    FreeClass(MergeClass->Class);   // class structure owned by the node
    delete MergeClass;              // node itself was created with new
  }
  destroy (nodes);
} /* FreeLabeledClassList */
| void FreeLabeledList | ( | LABELEDLIST | LabeledList | ) |
This routine deallocates all of the memory consumed by a labeled list. It does not free any memory which may be consumed by the items in the list.
| LabeledList | labeled list to be freed |
Definition at line 487 of file commontraining.cpp.
| void FreeNormProtoList | ( | LIST | CharList | ) |
Definition at line 838 of file commontraining.cpp.
{
  LABELEDLIST char_sample;

  // iterate() advances CharList as it walks, so remember the head first;
  // otherwise destroy() is handed the end of the list and the cells leak.
  LIST nodes = CharList;
  iterate (CharList) /* iterate through all of the fonts */
  {
    char_sample = (LABELEDLIST) first_node (CharList);
    FreeLabeledList (char_sample);
  }
  destroy (nodes);
} // FreeNormProtoList
| void FreeTrainingSamples | ( | LIST | CharList | ) |
This routine deallocates all of the space allocated to the specified list of training samples.
| CharList | list of all fonts in document |
Definition at line 458 of file commontraining.cpp.
{
  LABELEDLIST char_sample;
  FEATURE_SET FeatureSet;
  LIST FeatureList;

  // iterate() advances CharList as it walks, so remember the head first;
  // otherwise destroy() is handed the end of the list and the cells leak.
  LIST nodes = CharList;
  iterate(CharList) { /* iterate through all of the fonts */
    char_sample = (LABELEDLIST) first_node(CharList);
    // Walk a copy of the inner list so char_sample->List still refers to its
    // head when FreeLabeledList() is called below.
    FeatureList = char_sample->List;
    iterate(FeatureList) { /* iterate through all of the classes */
      FeatureSet = (FEATURE_SET) first_node(FeatureList);
      FreeFeatureSet(FeatureSet);
    }
    FreeLabeledList(char_sample);
  }
  destroy(nodes);
} /* FreeTrainingSamples */
| const char* GetNextFilename | ( | int | argc, |
| const char *const * | argv | ||
| ) |
This routine returns the next command line argument. If there are no remaining command line arguments, it returns NULL. This routine should only be called after all option arguments have been parsed and removed with ParseArguments.
Globals:
Definition at line 310 of file commontraining.cpp.
{
  // tessoptind indexes the next unread argument; NULL signals exhaustion.
  if (tessoptind >= argc)
    return NULL;
  return argv[tessoptind++];
} /* GetNextFilename */
| void MergeInsignificantProtos | ( | LIST | ProtoList, |
| const char * | label, | ||
| CLUSTERER * | Clusterer, | ||
| CLUSTERCONFIG * | Config | ||
| ) |
Definition at line 541 of file commontraining.cpp.
{
// Two passes over ProtoList:
//  1. each insignificant, not-yet-merged prototype looks for its nearest
//     unmerged neighbour closer than 0.125 and is merged into / absorbed by it;
//  2. insignificant, unmerged prototypes that now have at least
//     Config->MinSamples * Clusterer->NumChar samples are marked significant.
// In the debug messages "red" appears to mean insignificant and "green"
// significant (see the comment in the second pass).
PROTOTYPE *Prototype;
// Debug output is enabled only when label equals the FLAGS_test_ch flag.
bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
// Walk a copy so ProtoList still refers to the head for the inner scan and
// for the second pass.
LIST pProtoList = ProtoList;
iterate(pProtoList) {
Prototype = (PROTOTYPE *) first_node (pProtoList);
// Only insignificant prototypes that have not already been merged are
// candidates for merging.
if (Prototype->Significant || Prototype->Merged)
continue;
// Initial value doubles as the maximum distance accepted for a match:
// only candidates with dist < best_dist replace it.
FLOAT32 best_dist = 0.125;
PROTOTYPE* best_match = NULL;
// Find the nearest alive prototype.
LIST list_it = ProtoList;
iterate(list_it) {
PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
// Skip the prototype itself and anything already merged away.
if (test_p != Prototype && !test_p->Merged) {
FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
Clusterer->ParamDesc,
Prototype->Mean, test_p->Mean);
if (dist < best_dist) {
best_match = test_p;
best_dist = dist;
}
}
}
// Nearest neighbour is itself insignificant: fold this prototype's samples
// into it.
if (best_match != NULL && !best_match->Significant) {
if (debug)
tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
best_match->NumSamples, Prototype->NumSamples,
best_match->Mean[0], best_match->Mean[1],
Prototype->Mean[0], Prototype->Mean[1]);
// best_match->Mean is passed twice; presumably the first occurrence is the
// output mean and the second one of the two inputs -- confirm against
// MergeClusters. The return value becomes the merged sample count.
best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
Clusterer->ParamDesc,
best_match->NumSamples,
Prototype->NumSamples,
best_match->Mean,
best_match->Mean, Prototype->Mean);
// The absorbed prototype keeps no samples and is excluded from later scans.
Prototype->NumSamples = 0;
Prototype->Merged = 1;
} else if (best_match != NULL) {
// Nearest neighbour is significant: this prototype is only flagged as
// merged; no sample counts or means are changed.
if (debug)
tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
Prototype->Mean[0], Prototype->Mean[1],
best_match->Mean[0], best_match->Mean[1]);
Prototype->Merged = 1;
}
}
// Mark significant those that now have enough samples.
// The product is truncated to an integer by the inT32 cast.
int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
pProtoList = ProtoList;
iterate(pProtoList) {
Prototype = (PROTOTYPE *) first_node (pProtoList);
// Process insignificant protos that do not match a green one
if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
!Prototype->Merged) {
if (debug)
tprintf("Red proto at %g,%g becoming green\n",
Prototype->Mean[0], Prototype->Mean[1]);
Prototype->Significant = true;
}
}
} /* MergeInsignificantProtos */
| MERGE_CLASS NewLabeledClass | ( | const char * | Label | ) |
Definition at line 718 of file commontraining.cpp.
{
  // Allocate the node, give it a private copy of Label, and attach a new
  // class sized for MAX_NUM_PROTOS prototypes and MAX_NUM_CONFIGS configs.
  MERGE_CLASS merge_class = new MERGE_CLASS_NODE;
  merge_class->Label = (char*) Emalloc(strlen(Label) + 1);
  strcpy(merge_class->Label, Label);
  merge_class->Class = NewClass(MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
  return merge_class;
} /* NewLabeledClass */
| LABELEDLIST NewLabeledList | ( | const char * | Label | ) |
This routine allocates a new, empty labeled list and gives it the specified label.
| Label | label for new list |
Definition at line 357 of file commontraining.cpp.
{
  // Allocate the node and give it a private copy of Label.
  LABELEDLIST labeled = (LABELEDLIST) Emalloc(sizeof(LABELEDLISTNODE));
  labeled->Label = (char*) Emalloc(strlen(Label) + 1);
  strcpy(labeled->Label, Label);

  // A new labeled list starts empty with both counters at zero.
  labeled->SampleCount = 0;
  labeled->font_sample_count = 0;
  labeled->List = NIL_LIST;
  return labeled;
} /* NewLabeledList */
| void Normalize | ( | float * | Values | ) |
Definition at line 821 of file commontraining.cpp.
{
  // On entry Values holds {x, y, angle}; the angle is multiplied by 2 * PI
  // before tan() is applied. On exit Values holds the coefficients
  // {A, B, C} of the line A*x + B*y + C = 0 through (x, y) with that slope,
  // scaled so that A*A + B*B == 1.
  //
  // The `register` storage class was dropped from these locals: it was
  // removed from the language in C++17 and never affected the results.
  float Slope;
  float Intercept;
  float Normalizer;
  Slope = tan (Values [2] * 2 * PI);
  Intercept = Values [1] - Slope * Values [0];   // y = Slope * x + Intercept
  Normalizer = 1 / sqrt (Slope * Slope + 1.0);   // 1 / |(Slope, -1)|
  Values [0] = Slope * Normalizer;
  Values [1] = - Normalizer;
  Values [2] = Intercept * Normalizer;
} // Normalize
Definition at line 872 of file commontraining.cpp.
{
  // Count the prototypes selected by the two flags: significant ones are
  // counted when CountSigProtos is set, the rest when CountInsigProtos is.
  int count = 0;
  iterate(ProtoList) {
    PROTOTYPE *proto = (PROTOTYPE *) first_node(ProtoList);
    if (proto->Significant ? CountSigProtos : CountInsigProtos)
      ++count;
  }
  return count;
}
| void ParseArguments | ( | int * | argc, |
| char *** | argv | ||
| ) |
This routine parses the command line arguments that were passed to the program and uses them to set relevant training-related global parameters.
Globals:
| argc | number of command line arguments to parse |
| argv | command line arguments |
Definition at line 88 of file commontraining.cpp.
{
// Parses the command line flags, resets tessoptind to the first remaining
// argument, and copies the clustering flags into the global Config.
// Build the usage text: program name (when argv is non-empty) followed by
// the positional-argument hint.
STRING usage;
if (*argc) {
usage += (*argv)[0];
}
usage += " [.tr files ...]";
// argc/argv are passed by pointer so the parser can strip the flags it
// consumes (last argument: remove_flags == true, see the comment below).
tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
// Record the index of the first non-flag argument to 1, since we set
// remove_flags to true when parsing the flags.
tessoptind = 1;
// Set some global values based on the flags.
// Each value is passed through MAX(0.0, MIN(1.0, x)), i.e. limited to the
// range [0, 1] (assuming MAX/MIN are the usual two-argument macros).
Config.MinSamples =
MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
Config.MaxIllegal =
MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_max_illegal)));
Config.Independence =
MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_independence)));
Config.Confidence =
MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_confidence)));
// Set additional parameters from config file if specified.
if (!FLAGS_configfile.empty()) {
tesseract::ParamUtils::ReadParamsFile(
FLAGS_configfile.c_str(),
tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
ccutil.params());
}
}
| void ReadTrainingSamples | ( | const FEATURE_DEFS_STRUCT & | feature_defs, |
| const char * | feature_name, | ||
| int | max_samples, | ||
| UNICHARSET * | unicharset, | ||
| FILE * | file, | ||
| LIST * | training_samples | ||
| ) |
This routine reads training samples from a file and places them into a data structure which organizes the samples by FontName and CharName. The samples are added to the list pointed to by training_samples; the function itself returns nothing.
| file | open text file to read samples from |
| feature_defs | |
| feature_name | |
| max_samples | |
| unicharset | |
| training_samples |
Definition at line 394 of file commontraining.cpp.
{
  // Reads sample records from file until EOF. Each record is a header line
  // whose second whitespace-separated token is the character label, followed
  // by a character description read with ReadCharDescription(). The feature
  // set of type feature_name is appended to the labeled list for that
  // character (created on demand); all other feature sets are freed.
  char buffer[2048];
  char unichar[UNICHAR_LEN + 1];
  LABELEDLIST char_sample;
  FEATURE_SET feature_samples;
  CHAR_DESC char_desc;
  int i;
  int feature_type = ShortNameToFeatureType(feature_defs, feature_name);

  // Build "%*s %<UNICHAR_LEN>s" so sscanf can never write more than
  // UNICHAR_LEN characters plus the terminator into unichar.
  char label_format[32];
  snprintf(label_format, sizeof(label_format), "%%*s %%%ds",
           static_cast<int>(UNICHAR_LEN));

  // Zero out the font_sample_count for all the classes.
  LIST it = *training_samples;
  iterate(it) {
    char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
    char_sample->font_sample_count = 0;
  }

  while (fgets(buffer, sizeof(buffer), file) != NULL) {
    if (buffer[0] == '\n')
      continue;

    // A header without a label would otherwise leave unichar stale or
    // uninitialized; skip such lines the same way blank lines are skipped.
    if (sscanf(buffer, label_format, unichar) != 1) {
      tprintf("Error: no character label found in training sample header\n");
      continue;
    }

    if (unicharset != NULL && !unicharset->contains_unichar(unichar)) {
      unicharset->unichar_insert(unichar);
      if (unicharset->size() > MAX_NUM_CLASSES) {
        tprintf("Error: Size of unicharset in training is "
                "greater than MAX_NUM_CLASSES\n");
        exit(1);
      }
    }

    char_sample = FindList(*training_samples, unichar);
    if (char_sample == NULL) {
      char_sample = NewLabeledList(unichar);
      *training_samples = push(*training_samples, char_sample);
    }

    char_desc = ReadCharDescription(feature_defs, file);
    feature_samples = char_desc->FeatureSets[feature_type];

    // max_samples <= 0 means "no limit"; otherwise keep at most max_samples
    // feature sets per character for this call (font_sample_count was reset
    // above).
    if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
      char_sample->List = push(char_sample->List, feature_samples);
      char_sample->SampleCount++;
      char_sample->font_sample_count++;
    } else {
      FreeFeatureSet(feature_samples);
    }

    // The selected feature set was either stored or freed above; release the
    // remaining ones and then the descriptor shell itself.
    for (i = 0; i < char_desc->NumFeatureSets; i++) {
      if (feature_type != i)
        FreeFeatureSet(char_desc->FeatureSets[i]);
    }
    free(char_desc);
  }
} // ReadTrainingSamples
| LIST RemoveInsignificantProtos | ( | LIST | ProtoList, |
| BOOL8 | KeepSigProtos, | ||
| BOOL8 | KeepInsigProtos, | ||
| int | N | ||
| ) |
Definition at line 633 of file commontraining.cpp.
{
  // Builds a new list containing deep copies of the prototypes selected by
  // KeepSigProtos / KeepInsigProtos (N is the number of elements copied from
  // each per-dimension array), hands the original list to FreeProtoList(),
  // and returns the new list.
  LIST NewProtoList = NIL_LIST;
  LIST pProtoList;
  PROTOTYPE* Proto;
  PROTOTYPE* NewProto;
  int i;

  // Walk a copy so ProtoList still refers to the head for FreeProtoList().
  pProtoList = ProtoList;
  iterate(pProtoList)
  {
    Proto = (PROTOTYPE *) first_node (pProtoList);
    if ((Proto->Significant && KeepSigProtos) ||
        (!Proto->Significant && KeepInsigProtos))
    {
      NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));

      NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
      NewProto->Significant = Proto->Significant;
      // Every field of the new PROTOTYPE is assigned explicitly; Merged was
      // previously skipped, which left it with an indeterminate value.
      NewProto->Merged = Proto->Merged;
      NewProto->Style = Proto->Style;
      NewProto->NumSamples = Proto->NumSamples;
      NewProto->Cluster = NULL;
      NewProto->Distrib = NULL;

      for (i=0; i < N; i++)
        NewProto->Mean[i] = Proto->Mean[i];

      // Variance, Magnitude and Weight are copied only when present.
      if (Proto->Variance.Elliptical != NULL)
      {
        NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
        for (i=0; i < N; i++)
          NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
      }
      else
        NewProto->Variance.Elliptical = NULL;
      //---------------------------------------------
      if (Proto->Magnitude.Elliptical != NULL)
      {
        NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
        for (i=0; i < N; i++)
          NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
      }
      else
        NewProto->Magnitude.Elliptical = NULL;
      //------------------------------------------------
      if (Proto->Weight.Elliptical != NULL)
      {
        NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
        for (i=0; i < N; i++)
          NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
      }
      else
        NewProto->Weight.Elliptical = NULL;

      NewProto->TotalMagnitude = Proto->TotalMagnitude;
      NewProto->LogMagnitude = Proto->LogMagnitude;
      NewProtoList = push_last(NewProtoList, NewProto);
    }
  }
  FreeProtoList(&ProtoList);
  return (NewProtoList);
} /* RemoveInsignificantProtos */
| CLUSTERER* SetUpForClustering | ( | const FEATURE_DEFS_STRUCT & | FeatureDefs, |
| LABELEDLIST | char_sample, | ||
| const char * | program_feature_type | ||
| ) |
This routine reads samples from a LABELEDLIST and enters those samples into a clusterer data structure. This data structure is then returned to the caller.
| char_sample | LABELEDLIST that holds all the feature information for a given character. |
| FeatureDefs | |
| program_feature_type | |
Definition at line 507 of file commontraining.cpp.
{
  // Look up the feature descriptor named by program_feature_type and create
  // a clusterer with one dimension per descriptor parameter.
  const int desc_index =
      ShortNameToFeatureType(FeatureDefs, program_feature_type);
  const uinT16 num_params = FeatureDefs.FeatureDesc[desc_index]->NumParams;
  CLUSTERER *clusterer =
      MakeClusterer(num_params, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);

  // Scratch buffer, allocated on first use and reused for every feature.
  FLOAT32 *sample = NULL;
  // Each feature set in the list gets its own id, counting up from zero.
  inT32 char_id = 0;

  LIST feature_list = char_sample->List;
  iterate(feature_list) {
    FEATURE_SET feature_set = (FEATURE_SET) first_node(feature_list);
    for (int i = 0; i < feature_set->MaxNumFeatures; i++) {
      if (sample == NULL)
        sample = (FLOAT32 *)Emalloc(num_params * sizeof(FLOAT32));
      for (int j = 0; j < num_params; j++)
        sample[j] = feature_set->Features[i]->Params[j];
      MakeSample(clusterer, sample, char_id);
    }
    char_id++;
  }

  if (sample != NULL)
    free(sample);
  return clusterer;
} /* SetUpForClustering */
| CLASS_STRUCT* SetUpForFloat2Int | ( | const UNICHARSET & | unicharset, |
| LIST | LabeledClassList | ||
| ) |
Definition at line 758 of file commontraining.cpp.
{
// Allocates an array of unicharset.size() CLASS_STRUCTs and, for every merge
// class in LabeledClassList, fills the entry indexed by the unichar id of its
// label with copies of the prototypes and configurations. Returns the array,
// which was created with new[].
MERGE_CLASS MergeClass;
CLASS_TYPE Class;
int NumProtos;
int NumConfigs;
int NumWords;
int i, j;
float Values[3];
PROTO NewProto;
PROTO OldProto;
BIT_VECTOR NewConfig;
BIT_VECTOR OldConfig;
// printf("Float2Int ...\n");
CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
iterate(LabeledClassList)
{
UnicityTableEqEq<int> font_set;
MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
// Destination slot is selected by the unichar id of the class label.
Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
NumProtos = MergeClass->Class->NumProtos;
NumConfigs = MergeClass->Class->NumConfigs;
// The font set goes through the local font_set and is move()d on into the
// destination class further down.
font_set.move(&MergeClass->Class->font_set);
Class->NumProtos = NumProtos;
Class->MaxNumProtos = NumProtos;
Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
for(i=0; i < NumProtos; i++)
{
NewProto = ProtoIn(Class, i);
OldProto = ProtoIn(MergeClass->Class, i);
// Normalize() rewrites {X, Y, Angle} in place; the results are stored as
// the A, B, C fields below.
Values[0] = OldProto->X;
Values[1] = OldProto->Y;
Values[2] = OldProto->Angle;
Normalize(Values);
NewProto->X = OldProto->X;
NewProto->Y = OldProto->Y;
NewProto->Length = OldProto->Length;
NewProto->Angle = OldProto->Angle;
NewProto->A = Values[0];
NewProto->B = Values[1];
NewProto->C = Values[2];
}
Class->NumConfigs = NumConfigs;
Class->MaxNumConfigs = NumConfigs;
Class->font_set.move(&font_set);
Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
// Number of words copied per configuration bit vector of NumProtos bits.
NumWords = WordsInVectorOfSize(NumProtos);
for(i=0; i < NumConfigs; i++)
{
// Each configuration gets a new bit vector with the old words copied over.
NewConfig = NewBitVector(NumProtos);
OldConfig = MergeClass->Class->Configurations[i];
for(j=0; j < NumWords; j++)
NewConfig[j] = OldConfig[j];
Class->Configurations[i] = NewConfig;
}
}
return float_classes;
} // SetUpForFloat2Int
| void WriteTrainingSamples | ( | const FEATURE_DEFS_STRUCT & | FeatureDefs, |
| char * | Directory, | ||
| LIST | CharList, | ||
| const char * | program_feature_type | ||
| ) |
Definition at line 51 of file commontraining.cpp.
Definition at line 52 of file commontraining.cpp.