|
tesseract 3.04.01
|
00001 /****************************************************************************** 00002 ** Filename: normmatch.c 00003 ** Purpose: Simple matcher based on character normalization features. 00004 ** Author: Dan Johnson 00005 ** History: Wed Dec 19 16:18:06 1990, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00018 /*---------------------------------------------------------------------------- 00019 Include Files and Type Defines 00020 ----------------------------------------------------------------------------*/ 00021 #include "normmatch.h" 00022 00023 #include <stdio.h> 00024 #include <math.h> 00025 00026 #include "classify.h" 00027 #include "clusttool.h" 00028 #include "const.h" 00029 #include "efio.h" 00030 #include "emalloc.h" 00031 #include "globals.h" 00032 #include "helpers.h" 00033 #include "normfeat.h" 00034 #include "scanutils.h" 00035 #include "unicharset.h" 00036 #include "params.h" 00037 00038 struct NORM_PROTOS 00039 { 00040 int NumParams; 00041 PARAM_DESC *ParamDesc; 00042 LIST* Protos; 00043 int NumProtos; 00044 }; 00045 00046 /*---------------------------------------------------------------------------- 00047 Private Function Prototypes 00048 ----------------------------------------------------------------------------*/ 00049 double NormEvidenceOf(register double NormAdj); 00050 00051 void PrintNormMatch(FILE *File, 00052 int NumParams, 00053 PROTOTYPE *Proto, 00054 FEATURE Feature); 00055 00056 NORM_PROTOS *ReadNormProtos(FILE *File); 00057 00058 /*---------------------------------------------------------------------------- 00059 Variables 00060 ----------------------------------------------------------------------------*/ 00061 00063 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ..."); 00064 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ..."); 00066 const double kWidthErrorWeighting = 0.125; 00067 00068 /*---------------------------------------------------------------------------- 00069 Public Code 00070 ----------------------------------------------------------------------------*/ 00071 /*---------------------------------------------------------------------------*/ 00072 namespace tesseract { 00088 FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId, 00089 const FEATURE_STRUCT& feature, 00090 BOOL8 DebugMatch) { 00091 LIST Protos; 00092 FLOAT32 BestMatch; 00093 FLOAT32 Match; 00094 FLOAT32 Delta; 00095 PROTOTYPE *Proto; 00096 int ProtoId; 00097 00098 if (ClassId >= NormProtos->NumProtos) { 00099 ClassId = NO_CLASS; 00100 } 00101 00102 /* handle requests for classification as noise */ 00103 if (ClassId == NO_CLASS) { 00104 /* kludge - clean up constants and make into control knobs later */ 00105 Match = (feature.Params[CharNormLength] * 00106 feature.Params[CharNormLength] * 500.0 + 00107 feature.Params[CharNormRx] * 00108 feature.Params[CharNormRx] * 8000.0 + 00109 feature.Params[CharNormRy] * 00110 feature.Params[CharNormRy] * 8000.0); 00111 return (1.0 - NormEvidenceOf (Match)); 00112 } 00113 00114 BestMatch = MAX_FLOAT32; 00115 Protos = NormProtos->Protos[ClassId]; 00116 00117 if (DebugMatch) { 00118 tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId)); 00119 } 00120 00121 ProtoId = 0; 00122 iterate(Protos) { 00123 Proto = (PROTOTYPE *) first_node (Protos); 00124 Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY]; 00125 Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY]; 00126 if (DebugMatch) { 00127 tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", 00128 Proto->Mean[CharNormY], Delta, 00129 Proto->Weight.Elliptical[CharNormY], Match); 00130 } 00131 Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx]; 00132 Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx]; 00133 if (DebugMatch) { 00134 tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", 00135 Proto->Mean[CharNormRx], Delta, 00136 Proto->Weight.Elliptical[CharNormRx], Match); 00137 } 00138 // Ry is width! See intfx.cpp. 00139 Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy]; 00140 if (DebugMatch) { 00141 tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", 00142 Proto->Mean[CharNormRy], Delta, 00143 Proto->Weight.Elliptical[CharNormRy]); 00144 } 00145 Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy]; 00146 Delta *= kWidthErrorWeighting; 00147 Match += Delta; 00148 if (DebugMatch) { 00149 tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", 00150 Match, Match / classify_norm_adj_midpoint, 00151 NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match))); 00152 } 00153 00154 if (Match < BestMatch) 00155 BestMatch = Match; 00156 00157 ProtoId++; 00158 } 00159 return 1.0 - NormEvidenceOf(BestMatch); 00160 } /* ComputeNormMatch */ 00161 00162 void Classify::FreeNormProtos() { 00163 if (NormProtos != NULL) { 00164 for (int i = 0; i < NormProtos->NumProtos; i++) 00165 FreeProtoList(&NormProtos->Protos[i]); 00166 Efree(NormProtos->Protos); 00167 Efree(NormProtos->ParamDesc); 00168 Efree(NormProtos); 00169 NormProtos = NULL; 00170 } 00171 } 00172 } // namespace tesseract 00173 00174 /*---------------------------------------------------------------------------- 00175 Private Code 00176 ----------------------------------------------------------------------------*/ 00184 double NormEvidenceOf(register double NormAdj) { 00185 NormAdj /= classify_norm_adj_midpoint; 00186 00187 if (classify_norm_adj_curl == 3) 00188 NormAdj = NormAdj * NormAdj * NormAdj; 00189 else if (classify_norm_adj_curl == 2) 00190 NormAdj = NormAdj * NormAdj; 00191 else 00192 NormAdj = pow (NormAdj, classify_norm_adj_curl); 00193 return (1.0 / (1.0 + NormAdj)); 00194 } 00195 00196 00197 /*---------------------------------------------------------------------------*/ 00209 void PrintNormMatch(FILE *File, 00210 int NumParams, 00211 PROTOTYPE *Proto, 00212 FEATURE Feature) { 00213 int i; 00214 FLOAT32 ParamMatch; 00215 FLOAT32 TotalMatch; 00216 00217 for (i = 0, TotalMatch = 0.0; i < NumParams; i++) { 00218 ParamMatch = (Feature->Params[i] - Mean(Proto, i)) / 00219 StandardDeviation(Proto, i); 00220 00221 fprintf (File, " %6.1f", ParamMatch); 00222 00223 if (i == CharNormY || i == CharNormRx) 00224 TotalMatch += ParamMatch * ParamMatch; 00225 } 00226 fprintf (File, " --> %6.1f (%4.2f)\n", 00227 TotalMatch, NormEvidenceOf (TotalMatch)); 00228 00229 } /* PrintNormMatch */ 00230 00231 00232 /*---------------------------------------------------------------------------*/ 00233 namespace tesseract { 00245 NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) { 00246 NORM_PROTOS *NormProtos; 00247 int i; 00248 char unichar[2 * UNICHAR_LEN + 1]; 00249 UNICHAR_ID unichar_id; 00250 LIST Protos; 00251 int NumProtos; 00252 00253 /* allocate and initialization data structure */ 00254 NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS)); 00255 NormProtos->NumProtos = unicharset.size(); 00256 NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST)); 00257 for (i = 0; i < NormProtos->NumProtos; i++) 00258 NormProtos->Protos[i] = NIL_LIST; 00259 00260 /* read file header and save in data structure */ 00261 NormProtos->NumParams = ReadSampleSize (File); 00262 NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams); 00263 00264 /* read protos for each class into a separate list */ 00265 while ((end_offset < 0 || ftell(File) < end_offset) && 00266 tfscanf(File, "%s %d", unichar, &NumProtos) == 2) { 00267 if (unicharset.contains_unichar(unichar)) { 00268 unichar_id = unicharset.unichar_to_id(unichar); 00269 Protos = NormProtos->Protos[unichar_id]; 00270 for (i = 0; i < NumProtos; i++) 00271 Protos = 00272 push_last (Protos, ReadPrototype (File, NormProtos->NumParams)); 00273 NormProtos->Protos[unichar_id] = Protos; 00274 } else { 00275 cprintf("Error: unichar %s in normproto file is not in unichar set.\n", 00276 unichar); 00277 for (i = 0; i < NumProtos; i++) 00278 FreePrototype(ReadPrototype (File, NormProtos->NumParams)); 00279 } 00280 SkipNewline(File); 00281 } 00282 return (NormProtos); 00283 } /* ReadNormProtos */ 00284 } // namespace tesseract