tesseract 3.04.01

training/cntraining.cpp

Go to the documentation of this file.
00001 /******************************************************************************
00002 **  Filename:  cntraining.cpp
00003 **  Purpose:  Generates a normproto and pffmtable.
00004 **  Author:    Dan Johnson
00005 **  Revised by:  Christy Russon
00006 **  History:     Fri Aug 18 08:53:50 1989, DSJ, Created.
00007 **         5/25/90, DSJ, Adapted to multiple feature types.
00008 **        Tuesday, May 17, 1998 Changes made to make feature specific and
00009 **        simplify structures. First step in simplifying training process.
00010 **
00011  **  (c) Copyright Hewlett-Packard Company, 1988.
00012  ** Licensed under the Apache License, Version 2.0 (the "License");
00013  ** you may not use this file except in compliance with the License.
00014  ** You may obtain a copy of the License at
00015  ** http://www.apache.org/licenses/LICENSE-2.0
00016  ** Unless required by applicable law or agreed to in writing, software
00017  ** distributed under the License is distributed on an "AS IS" BASIS,
00018  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00019  ** See the License for the specific language governing permissions and
00020  ** limitations under the License.
00021 ******************************************************************************/
00022 
00023 
00024 /*----------------------------------------------------------------------------
00025           Include Files and Type Defines
00026 ----------------------------------------------------------------------------*/
00027 #include "oldlist.h"
00028 #include "efio.h"
00029 #include "emalloc.h"
00030 #include "featdefs.h"
00031 #include "tessopt.h"
00032 #include "ocrfeatures.h"
00033 #include "clusttool.h"
00034 #include "cluster.h"
00035 #include <string.h>
00036 #include <stdio.h>
00037 #include <math.h>
00038 #include "unichar.h"
00039 #include "commontraining.h"
00040 
00041 #define PROGRAM_FEATURE_TYPE "cn"
00042 
00043 DECLARE_STRING_PARAM_FLAG(D);
00044 
00045 /*----------------------------------------------------------------------------
00046           Public Function Prototypes
00047 ----------------------------------------------------------------------------*/
/* Program entry point. */
int main(int argc, char **argv);
00051 
00052 /*----------------------------------------------------------------------------
00053           Private Function Prototypes
00054 ----------------------------------------------------------------------------*/
00055 
00056 void WriteNormProtos (
00057      const char  *Directory,
00058      LIST  LabeledProtoList,
00059    CLUSTERER *Clusterer);
00060 
00061 /*
00062 PARAMDESC *ConvertToPARAMDESC(
00063   PARAM_DESC* Param_Desc,
00064   int N);
00065 */
00066 
00067 void WriteProtos(
00068      FILE  *File,
00069      uinT16  N,
00070      LIST  ProtoList,
00071      BOOL8  WriteSigProtos,
00072      BOOL8  WriteInsigProtos);
00073 
00074 /*----------------------------------------------------------------------------
00075           Global Data Definitions and Declarations
00076 ----------------------------------------------------------------------------*/
00077 /* global variable to hold configuration parameters to control clustering */
00078 //-M 0.025   -B 0.05   -I 0.8   -C 1e-3
00079 CLUSTERCONFIG  CNConfig =
00080 {
00081   elliptical, 0.025, 0.05, 0.8, 1e-3, 0
00082 };
00083 
00084 
00085 /*----------------------------------------------------------------------------
00086               Public Code
00087 ----------------------------------------------------------------------------*/
00088 /*---------------------------------------------------------------------------*/
00137 int main(int  argc, char* argv[])
00138 {
00139   // Set the global Config parameters before parsing the command line.
00140   Config = CNConfig;
00141 
00142   const char  *PageName;
00143   FILE  *TrainingPage;
00144   LIST  CharList = NIL_LIST;
00145   CLUSTERER  *Clusterer = NULL;
00146   LIST    ProtoList = NIL_LIST;
00147   LIST    NormProtoList = NIL_LIST;
00148   LIST pCharList;
00149   LABELEDLIST CharSample;
00150   FEATURE_DEFS_STRUCT FeatureDefs;
00151   InitFeatureDefs(&FeatureDefs);
00152 
00153   ParseArguments(&argc, &argv);
00154   int num_fonts = 0;
00155   while ((PageName = GetNextFilename(argc, argv)) != NULL) {
00156     printf("Reading %s ...\n", PageName);
00157     TrainingPage = Efopen(PageName, "rb");
00158     ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE,
00159                         100, NULL, TrainingPage, &CharList);
00160     fclose(TrainingPage);
00161     ++num_fonts;
00162   }
00163   printf("Clustering ...\n");
00164   // To allow an individual font to form a separate cluster,
00165   // reduce the min samples:
00166   // Config.MinSamples = 0.5 / num_fonts;
00167   pCharList = CharList;
00168   iterate(pCharList) {
00169     //Cluster
00170     if (Clusterer)
00171        FreeClusterer(Clusterer);
00172     CharSample = (LABELEDLIST)first_node(pCharList);
00173     Clusterer =
00174       SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
00175     float SavedMinSamples = Config.MinSamples;
00176     // To disable the tendency to produce a single cluster for all fonts,
00177     // make MagicSamples an impossible to achieve number:
00178     // Config.MagicSamples = CharSample->SampleCount * 10;
00179     Config.MagicSamples = CharSample->SampleCount;
00180     while (Config.MinSamples > 0.001) {
00181       ProtoList = ClusterSamples(Clusterer, &Config);
00182       if (NumberOfProtos(ProtoList, 1, 0) > 0) {
00183         break;
00184       } else {
00185         Config.MinSamples *= 0.95;
00186         printf("0 significant protos for %s."
00187                " Retrying clustering with MinSamples = %f%%\n",
00188                CharSample->Label, Config.MinSamples);
00189       }
00190     }
00191     Config.MinSamples = SavedMinSamples;
00192     AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
00193   }
00194   FreeTrainingSamples(CharList);
00195   if (Clusterer == NULL) { // To avoid a SIGSEGV
00196     fprintf(stderr, "Error: NULL clusterer!\n");
00197     return 1;
00198   }
00199   WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
00200   FreeNormProtoList(NormProtoList);
00201   FreeProtoList(&ProtoList);
00202   FreeClusterer(Clusterer);
00203   printf ("\n");
00204   return 0;
00205 }  // main
00206 
00207 
00208 /*----------------------------------------------------------------------------
00209               Private Code
00210 ----------------------------------------------------------------------------*/
00211 
00212 /*----------------------------------------------------------------------------*/
00224 void WriteNormProtos (
00225      const char  *Directory,
00226      LIST  LabeledProtoList,
00227      CLUSTERER *Clusterer)
00228 {
00229   FILE    *File;
00230   STRING Filename;
00231   LABELEDLIST LabeledProto;
00232   int N;
00233 
00234   Filename = "";
00235   if (Directory != NULL && Directory[0] != '\0')
00236   {
00237     Filename += Directory;
00238     Filename += "/";
00239   }
00240   Filename += "normproto";
00241   printf ("\nWriting %s ...", Filename.string());
00242   File = Efopen (Filename.string(), "wb");
00243   fprintf(File,"%0d\n",Clusterer->SampleSize);
00244   WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
00245   iterate(LabeledProtoList)
00246   {
00247     LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
00248     N = NumberOfProtos(LabeledProto->List, true, false);
00249     if (N < 1) {
00250       printf ("\nError! Not enough protos for %s: %d protos"
00251               " (%d significant protos"
00252               ", %d insignificant protos)\n",
00253               LabeledProto->Label, N,
00254               NumberOfProtos(LabeledProto->List, 1, 0),
00255               NumberOfProtos(LabeledProto->List, 0, 1));
00256       exit(1);
00257     }
00258     fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
00259     WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
00260   }
00261   fclose (File);
00262 
00263 }  // WriteNormProtos
00264 
00265 /*-------------------------------------------------------------------------*/
00266 void WriteProtos(
00267      FILE  *File,
00268      uinT16  N,
00269      LIST  ProtoList,
00270      BOOL8  WriteSigProtos,
00271      BOOL8  WriteInsigProtos)
00272 {
00273   PROTOTYPE  *Proto;
00274 
00275   // write prototypes
00276   iterate(ProtoList)
00277   {
00278     Proto = (PROTOTYPE *) first_node ( ProtoList );
00279     if (( Proto->Significant && WriteSigProtos )  ||
00280       ( ! Proto->Significant && WriteInsigProtos ) )
00281       WritePrototype( File, N, Proto );
00282   }
00283 }  // WriteProtos
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines