|
tesseract 3.04.01
|
00001 /****************************************************************************** 00002 ** Filename: cntraining.cpp 00003 ** Purpose: Generates a normproto and pffmtable. 00004 ** Author: Dan Johnson 00005 ** Revisment: Christy Russon 00006 ** History: Fri Aug 18 08:53:50 1989, DSJ, Created. 00007 ** 5/25/90, DSJ, Adapted to multiple feature types. 00008 ** Tuesday, May 17, 1998 Changes made to make feature specific and 00009 ** simplify structures. First step in simplifying training process. 00010 ** 00011 ** (c) Copyright Hewlett-Packard Company, 1988. 00012 ** Licensed under the Apache License, Version 2.0 (the "License"); 00013 ** you may not use this file except in compliance with the License. 00014 ** You may obtain a copy of the License at 00015 ** http://www.apache.org/licenses/LICENSE-2.0 00016 ** Unless required by applicable law or agreed to in writing, software 00017 ** distributed under the License is distributed on an "AS IS" BASIS, 00018 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00019 ** See the License for the specific language governing permissions and 00020 ** limitations under the License. 00021 ******************************************************************************/ 00022 00023 00024 /*---------------------------------------------------------------------------- 00025 Include Files and Type Defines 00026 ----------------------------------------------------------------------------*/ 00027 #include "oldlist.h" 00028 #include "efio.h" 00029 #include "emalloc.h" 00030 #include "featdefs.h" 00031 #include "tessopt.h" 00032 #include "ocrfeatures.h" 00033 #include "clusttool.h" 00034 #include "cluster.h" 00035 #include <string.h> 00036 #include <stdio.h> 00037 #include <math.h> 00038 #include "unichar.h" 00039 #include "commontraining.h" 00040 00041 #define PROGRAM_FEATURE_TYPE "cn" 00042 00043 DECLARE_STRING_PARAM_FLAG(D); 00044 00045 /*---------------------------------------------------------------------------- 00046 Public Function Prototypes 00047 ----------------------------------------------------------------------------*/ 00048 int main ( 00049 int argc, 00050 char **argv); 00051 00052 /*---------------------------------------------------------------------------- 00053 Private Function Prototypes 00054 ----------------------------------------------------------------------------*/ 00055 00056 void WriteNormProtos ( 00057 const char *Directory, 00058 LIST LabeledProtoList, 00059 CLUSTERER *Clusterer); 00060 00061 /* 00062 PARAMDESC *ConvertToPARAMDESC( 00063 PARAM_DESC* Param_Desc, 00064 int N); 00065 */ 00066 00067 void WriteProtos( 00068 FILE *File, 00069 uinT16 N, 00070 LIST ProtoList, 00071 BOOL8 WriteSigProtos, 00072 BOOL8 WriteInsigProtos); 00073 00074 /*---------------------------------------------------------------------------- 00075 Global Data Definitions and Declarations 00076 ----------------------------------------------------------------------------*/ 00077 /* global variable to hold configuration parameters to control clustering */ 00078 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3 00079 CLUSTERCONFIG CNConfig = 00080 { 00081 elliptical, 0.025, 0.05, 0.8, 1e-3, 0 00082 }; 00083 00084 00085 /*---------------------------------------------------------------------------- 00086 Public Code 00087 ----------------------------------------------------------------------------*/ 00088 /*---------------------------------------------------------------------------*/ 00137 int main(int argc, char* argv[]) 00138 { 00139 // Set the global Config parameters before parsing the command line. 00140 Config = CNConfig; 00141 00142 const char *PageName; 00143 FILE *TrainingPage; 00144 LIST CharList = NIL_LIST; 00145 CLUSTERER *Clusterer = NULL; 00146 LIST ProtoList = NIL_LIST; 00147 LIST NormProtoList = NIL_LIST; 00148 LIST pCharList; 00149 LABELEDLIST CharSample; 00150 FEATURE_DEFS_STRUCT FeatureDefs; 00151 InitFeatureDefs(&FeatureDefs); 00152 00153 ParseArguments(&argc, &argv); 00154 int num_fonts = 0; 00155 while ((PageName = GetNextFilename(argc, argv)) != NULL) { 00156 printf("Reading %s ...\n", PageName); 00157 TrainingPage = Efopen(PageName, "rb"); 00158 ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 00159 100, NULL, TrainingPage, &CharList); 00160 fclose(TrainingPage); 00161 ++num_fonts; 00162 } 00163 printf("Clustering ...\n"); 00164 // To allow an individual font to form a separate cluster, 00165 // reduce the min samples: 00166 // Config.MinSamples = 0.5 / num_fonts; 00167 pCharList = CharList; 00168 iterate(pCharList) { 00169 //Cluster 00170 if (Clusterer) 00171 FreeClusterer(Clusterer); 00172 CharSample = (LABELEDLIST)first_node(pCharList); 00173 Clusterer = 00174 SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE); 00175 float SavedMinSamples = Config.MinSamples; 00176 // To disable the tendency to produce a single cluster for all fonts, 00177 // make MagicSamples an impossible to achieve number: 00178 // Config.MagicSamples = CharSample->SampleCount * 10; 00179 Config.MagicSamples = CharSample->SampleCount; 00180 while (Config.MinSamples > 0.001) { 00181 ProtoList = ClusterSamples(Clusterer, &Config); 00182 if (NumberOfProtos(ProtoList, 1, 0) > 0) { 00183 break; 00184 } else { 00185 Config.MinSamples *= 0.95; 00186 printf("0 significant protos for %s." 00187 " Retrying clustering with MinSamples = %f%%\n", 00188 CharSample->Label, Config.MinSamples); 00189 } 00190 } 00191 Config.MinSamples = SavedMinSamples; 00192 AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); 00193 } 00194 FreeTrainingSamples(CharList); 00195 if (Clusterer == NULL) { // To avoid a SIGSEGV 00196 fprintf(stderr, "Error: NULL clusterer!\n"); 00197 return 1; 00198 } 00199 WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer); 00200 FreeNormProtoList(NormProtoList); 00201 FreeProtoList(&ProtoList); 00202 FreeClusterer(Clusterer); 00203 printf ("\n"); 00204 return 0; 00205 } // main 00206 00207 00208 /*---------------------------------------------------------------------------- 00209 Private Code 00210 ----------------------------------------------------------------------------*/ 00211 00212 /*----------------------------------------------------------------------------*/ 00224 void WriteNormProtos ( 00225 const char *Directory, 00226 LIST LabeledProtoList, 00227 CLUSTERER *Clusterer) 00228 { 00229 FILE *File; 00230 STRING Filename; 00231 LABELEDLIST LabeledProto; 00232 int N; 00233 00234 Filename = ""; 00235 if (Directory != NULL && Directory[0] != '\0') 00236 { 00237 Filename += Directory; 00238 Filename += "/"; 00239 } 00240 Filename += "normproto"; 00241 printf ("\nWriting %s ...", Filename.string()); 00242 File = Efopen (Filename.string(), "wb"); 00243 fprintf(File,"%0d\n",Clusterer->SampleSize); 00244 WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc); 00245 iterate(LabeledProtoList) 00246 { 00247 LabeledProto = (LABELEDLIST) first_node (LabeledProtoList); 00248 N = NumberOfProtos(LabeledProto->List, true, false); 00249 if (N < 1) { 00250 printf ("\nError! Not enough protos for %s: %d protos" 00251 " (%d significant protos" 00252 ", %d insignificant protos)\n", 00253 LabeledProto->Label, N, 00254 NumberOfProtos(LabeledProto->List, 1, 0), 00255 NumberOfProtos(LabeledProto->List, 0, 1)); 00256 exit(1); 00257 } 00258 fprintf(File, "\n%s %d\n", LabeledProto->Label, N); 00259 WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false); 00260 } 00261 fclose (File); 00262 00263 } // WriteNormProtos 00264 00265 /*-------------------------------------------------------------------------*/ 00266 void WriteProtos( 00267 FILE *File, 00268 uinT16 N, 00269 LIST ProtoList, 00270 BOOL8 WriteSigProtos, 00271 BOOL8 WriteInsigProtos) 00272 { 00273 PROTOTYPE *Proto; 00274 00275 // write prototypes 00276 iterate(ProtoList) 00277 { 00278 Proto = (PROTOTYPE *) first_node ( ProtoList ); 00279 if (( Proto->Significant && WriteSigProtos ) || 00280 ( ! Proto->Significant && WriteInsigProtos ) ) 00281 WritePrototype( File, N, Proto ); 00282 } 00283 } // WriteProtos