tesseract 3.04.01

classify/normmatch.cpp

Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:    normmatch.cpp
00003  **     Purpose:     Simple matcher based on character normalization features.
00004  **     Author:      Dan Johnson
00005  **     History:     Wed Dec 19 16:18:06 1990, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 /*----------------------------------------------------------------------------
00019           Include Files and Type Defines
00020 ----------------------------------------------------------------------------*/
00021 #include "normmatch.h"
00022 
00023 #include <stdio.h>
00024 #include <math.h>
00025 
00026 #include "classify.h"
00027 #include "clusttool.h"
00028 #include "const.h"
00029 #include "efio.h"
00030 #include "emalloc.h"
00031 #include "globals.h"
00032 #include "helpers.h"
00033 #include "normfeat.h"
00034 #include "scanutils.h"
00035 #include "unicharset.h"
00036 #include "params.h"
00037 
00038 struct NORM_PROTOS
00039 {
00040   int NumParams;
00041   PARAM_DESC *ParamDesc;
00042   LIST* Protos;
00043   int NumProtos;
00044 };
00045 
00046 /*----------------------------------------------------------------------------
00047           Private Function Prototypes
00048 ----------------------------------------------------------------------------*/
00049 double NormEvidenceOf(register double NormAdj);
00050 
00051 void PrintNormMatch(FILE *File,
00052                     int NumParams,
00053                     PROTOTYPE *Proto,
00054                     FEATURE Feature);
00055 
00056 NORM_PROTOS *ReadNormProtos(FILE *File);
00057 
00058 /*----------------------------------------------------------------------------
00059         Variables
00060 ----------------------------------------------------------------------------*/
00061 
00063 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
00064 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
00066 const double kWidthErrorWeighting = 0.125;
00067 
00068 /*----------------------------------------------------------------------------
00069               Public Code
00070 ----------------------------------------------------------------------------*/
00071 /*---------------------------------------------------------------------------*/
00072 namespace tesseract {
00088 FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId,
00089                                    const FEATURE_STRUCT& feature,
00090                                    BOOL8 DebugMatch) {
00091   LIST Protos;
00092   FLOAT32 BestMatch;
00093   FLOAT32 Match;
00094   FLOAT32 Delta;
00095   PROTOTYPE *Proto;
00096   int ProtoId;
00097 
00098   if (ClassId >= NormProtos->NumProtos) {
00099     ClassId = NO_CLASS;
00100   }
00101 
00102   /* handle requests for classification as noise */
00103   if (ClassId == NO_CLASS) {
00104     /* kludge - clean up constants and make into control knobs later */
00105     Match = (feature.Params[CharNormLength] *
00106       feature.Params[CharNormLength] * 500.0 +
00107       feature.Params[CharNormRx] *
00108       feature.Params[CharNormRx] * 8000.0 +
00109       feature.Params[CharNormRy] *
00110       feature.Params[CharNormRy] * 8000.0);
00111     return (1.0 - NormEvidenceOf (Match));
00112   }
00113 
00114   BestMatch = MAX_FLOAT32;
00115   Protos = NormProtos->Protos[ClassId];
00116 
00117   if (DebugMatch) {
00118     tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
00119   }
00120 
00121   ProtoId = 0;
00122   iterate(Protos) {
00123     Proto = (PROTOTYPE *) first_node (Protos);
00124     Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
00125     Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
00126     if (DebugMatch) {
00127       tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
00128               Proto->Mean[CharNormY], Delta,
00129               Proto->Weight.Elliptical[CharNormY], Match);
00130     }
00131     Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
00132     Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
00133     if (DebugMatch) {
00134       tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
00135               Proto->Mean[CharNormRx], Delta,
00136               Proto->Weight.Elliptical[CharNormRx], Match);
00137     }
00138     // Ry is width! See intfx.cpp.
00139     Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
00140     if (DebugMatch) {
00141       tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
00142               Proto->Mean[CharNormRy], Delta,
00143               Proto->Weight.Elliptical[CharNormRy]);
00144     }
00145     Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
00146     Delta *= kWidthErrorWeighting;
00147     Match += Delta;
00148     if (DebugMatch) {
00149       tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
00150               Match, Match / classify_norm_adj_midpoint,
00151               NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
00152     }
00153 
00154     if (Match < BestMatch)
00155       BestMatch = Match;
00156 
00157     ProtoId++;
00158   }
00159   return 1.0 - NormEvidenceOf(BestMatch);
00160 }                                /* ComputeNormMatch */
00161 
00162 void Classify::FreeNormProtos() {
00163   if (NormProtos != NULL) {
00164     for (int i = 0; i < NormProtos->NumProtos; i++)
00165       FreeProtoList(&NormProtos->Protos[i]);
00166     Efree(NormProtos->Protos);
00167     Efree(NormProtos->ParamDesc);
00168     Efree(NormProtos);
00169     NormProtos = NULL;
00170   }
00171 }
00172 }  // namespace tesseract
00173 
00174 /*----------------------------------------------------------------------------
00175               Private Code
00176 ----------------------------------------------------------------------------*/
00184 double NormEvidenceOf(register double NormAdj) {
00185   NormAdj /= classify_norm_adj_midpoint;
00186 
00187   if (classify_norm_adj_curl == 3)
00188     NormAdj = NormAdj * NormAdj * NormAdj;
00189   else if (classify_norm_adj_curl == 2)
00190     NormAdj = NormAdj * NormAdj;
00191   else
00192     NormAdj = pow (NormAdj, classify_norm_adj_curl);
00193   return (1.0 / (1.0 + NormAdj));
00194 }
00195 
00196 
00197 /*---------------------------------------------------------------------------*/
00209 void PrintNormMatch(FILE *File,
00210                     int NumParams,
00211                     PROTOTYPE *Proto,
00212                     FEATURE Feature) {
00213   int i;
00214   FLOAT32 ParamMatch;
00215   FLOAT32 TotalMatch;
00216 
00217   for (i = 0, TotalMatch = 0.0; i < NumParams; i++) {
00218     ParamMatch = (Feature->Params[i] - Mean(Proto, i)) /
00219       StandardDeviation(Proto, i);
00220 
00221     fprintf (File, " %6.1f", ParamMatch);
00222 
00223     if (i == CharNormY || i == CharNormRx)
00224       TotalMatch += ParamMatch * ParamMatch;
00225   }
00226   fprintf (File, " --> %6.1f (%4.2f)\n",
00227     TotalMatch, NormEvidenceOf (TotalMatch));
00228 
00229 }                                /* PrintNormMatch */
00230 
00231 
00232 /*---------------------------------------------------------------------------*/
00233 namespace tesseract {
00245 NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
00246   NORM_PROTOS *NormProtos;
00247   int i;
00248   char unichar[2 * UNICHAR_LEN + 1];
00249   UNICHAR_ID unichar_id;
00250   LIST Protos;
00251   int NumProtos;
00252 
00253   /* allocate and initialization data structure */
00254   NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS));
00255   NormProtos->NumProtos = unicharset.size();
00256   NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST));
00257   for (i = 0; i < NormProtos->NumProtos; i++)
00258     NormProtos->Protos[i] = NIL_LIST;
00259 
00260   /* read file header and save in data structure */
00261   NormProtos->NumParams = ReadSampleSize (File);
00262   NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams);
00263 
00264   /* read protos for each class into a separate list */
00265   while ((end_offset < 0 || ftell(File) < end_offset) &&
00266          tfscanf(File, "%s %d", unichar, &NumProtos) == 2) {
00267     if (unicharset.contains_unichar(unichar)) {
00268       unichar_id = unicharset.unichar_to_id(unichar);
00269       Protos = NormProtos->Protos[unichar_id];
00270       for (i = 0; i < NumProtos; i++)
00271         Protos =
00272             push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
00273       NormProtos->Protos[unichar_id] = Protos;
00274     } else {
00275       cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
00276               unichar);
00277       for (i = 0; i < NumProtos; i++)
00278         FreePrototype(ReadPrototype (File, NormProtos->NumParams));
00279     }
00280     SkipNewline(File);
00281   }
00282   return (NormProtos);
00283 }                                /* ReadNormProtos */
00284 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines