tesseract  3.04.01
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cntraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2 ** Filename: cntraining.cpp
3 ** Purpose: Generates a normproto and pffmtable.
4 ** Author: Dan Johnson
5 ** Revisment: Christy Russon
6 ** History: Fri Aug 18 08:53:50 1989, DSJ, Created.
7 ** 5/25/90, DSJ, Adapted to multiple feature types.
8 ** Tuesday, May 17, 1998 Changes made to make feature specific and
9 ** simplify structures. First step in simplifying training process.
10 **
11  ** (c) Copyright Hewlett-Packard Company, 1988.
12  ** Licensed under the Apache License, Version 2.0 (the "License");
13  ** you may not use this file except in compliance with the License.
14  ** You may obtain a copy of the License at
15  ** http://www.apache.org/licenses/LICENSE-2.0
16  ** Unless required by applicable law or agreed to in writing, software
17  ** distributed under the License is distributed on an "AS IS" BASIS,
18  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  ** See the License for the specific language governing permissions and
20  ** limitations under the License.
21 ******************************************************************************/
22 
23 
24 /*----------------------------------------------------------------------------
25  Include Files and Type Defines
26 ----------------------------------------------------------------------------*/
27 #include "oldlist.h"
28 #include "efio.h"
29 #include "emalloc.h"
30 #include "featdefs.h"
31 #include "tessopt.h"
32 #include "ocrfeatures.h"
33 #include "clusttool.h"
34 #include "cluster.h"
35 #include <string.h>
36 #include <stdio.h>
37 #include <math.h>
38 #include "unichar.h"
39 #include "commontraining.h"
40 
41 #define PROGRAM_FEATURE_TYPE "cn"
42 
44 
45 /*----------------------------------------------------------------------------
46  Public Function Prototypes
47 ----------------------------------------------------------------------------*/
48 int main (
49  int argc,
50  char **argv);
51 
52 /*----------------------------------------------------------------------------
53  Private Function Prototypes
54 ----------------------------------------------------------------------------*/
55 
56 void WriteNormProtos (
57  const char *Directory,
58  LIST LabeledProtoList,
59  CLUSTERER *Clusterer);
60 
61 /*
62 PARAMDESC *ConvertToPARAMDESC(
63  PARAM_DESC* Param_Desc,
64  int N);
65 */
66 
67 void WriteProtos(
68  FILE *File,
69  uinT16 N,
70  LIST ProtoList,
71  BOOL8 WriteSigProtos,
72  BOOL8 WriteInsigProtos);
73 
74 /*----------------------------------------------------------------------------
75  Global Data Definitions and Declarations
76 ----------------------------------------------------------------------------*/
77 /* global variable to hold configuration parameters to control clustering */
78 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3
80 {
81  elliptical, 0.025, 0.05, 0.8, 1e-3, 0
82 };
83 
84 
85 /*----------------------------------------------------------------------------
86  Public Code
87 ----------------------------------------------------------------------------*/
88 /*---------------------------------------------------------------------------*/
137 int main(int argc, char* argv[])
138 {
139  // Set the global Config parameters before parsing the command line.
140  Config = CNConfig;
141 
142  const char *PageName;
143  FILE *TrainingPage;
144  LIST CharList = NIL_LIST;
145  CLUSTERER *Clusterer = NULL;
146  LIST ProtoList = NIL_LIST;
147  LIST NormProtoList = NIL_LIST;
148  LIST pCharList;
149  LABELEDLIST CharSample;
150  FEATURE_DEFS_STRUCT FeatureDefs;
151  InitFeatureDefs(&FeatureDefs);
152 
153  ParseArguments(&argc, &argv);
154  int num_fonts = 0;
155  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
156  printf("Reading %s ...\n", PageName);
157  TrainingPage = Efopen(PageName, "rb");
159  100, NULL, TrainingPage, &CharList);
160  fclose(TrainingPage);
161  ++num_fonts;
162  }
163  printf("Clustering ...\n");
164  // To allow an individual font to form a separate cluster,
165  // reduce the min samples:
166  // Config.MinSamples = 0.5 / num_fonts;
167  pCharList = CharList;
168  iterate(pCharList) {
169  //Cluster
170  if (Clusterer)
171  FreeClusterer(Clusterer);
172  CharSample = (LABELEDLIST)first_node(pCharList);
173  Clusterer =
174  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
175  float SavedMinSamples = Config.MinSamples;
176  // To disable the tendency to produce a single cluster for all fonts,
177  // make MagicSamples an impossible to achieve number:
178  // Config.MagicSamples = CharSample->SampleCount * 10;
179  Config.MagicSamples = CharSample->SampleCount;
180  while (Config.MinSamples > 0.001) {
181  ProtoList = ClusterSamples(Clusterer, &Config);
182  if (NumberOfProtos(ProtoList, 1, 0) > 0) {
183  break;
184  } else {
185  Config.MinSamples *= 0.95;
186  printf("0 significant protos for %s."
187  " Retrying clustering with MinSamples = %f%%\n",
188  CharSample->Label, Config.MinSamples);
189  }
190  }
191  Config.MinSamples = SavedMinSamples;
192  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
193  }
194  FreeTrainingSamples(CharList);
195  if (Clusterer == NULL) { // To avoid a SIGSEGV
196  fprintf(stderr, "Error: NULL clusterer!\n");
197  return 1;
198  }
199  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
200  FreeNormProtoList(NormProtoList);
201  FreeProtoList(&ProtoList);
202  FreeClusterer(Clusterer);
203  printf ("\n");
204  return 0;
205 } // main
206 
207 
208 /*----------------------------------------------------------------------------
209  Private Code
210 ----------------------------------------------------------------------------*/
211 
212 /*----------------------------------------------------------------------------*/
225  const char *Directory,
226  LIST LabeledProtoList,
227  CLUSTERER *Clusterer)
228 {
229  FILE *File;
230  STRING Filename;
231  LABELEDLIST LabeledProto;
232  int N;
233 
234  Filename = "";
235  if (Directory != NULL && Directory[0] != '\0')
236  {
237  Filename += Directory;
238  Filename += "/";
239  }
240  Filename += "normproto";
241  printf ("\nWriting %s ...", Filename.string());
242  File = Efopen (Filename.string(), "wb");
243  fprintf(File,"%0d\n",Clusterer->SampleSize);
244  WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
245  iterate(LabeledProtoList)
246  {
247  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
248  N = NumberOfProtos(LabeledProto->List, true, false);
249  if (N < 1) {
250  printf ("\nError! Not enough protos for %s: %d protos"
251  " (%d significant protos"
252  ", %d insignificant protos)\n",
253  LabeledProto->Label, N,
254  NumberOfProtos(LabeledProto->List, 1, 0),
255  NumberOfProtos(LabeledProto->List, 0, 1));
256  exit(1);
257  }
258  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
259  WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
260  }
261  fclose (File);
262 
263 } // WriteNormProtos
264 
265 /*-------------------------------------------------------------------------*/
267  FILE *File,
268  uinT16 N,
269  LIST ProtoList,
270  BOOL8 WriteSigProtos,
271  BOOL8 WriteInsigProtos)
272 {
273  PROTOTYPE *Proto;
274 
275  // write prototypes
276  iterate(ProtoList)
277  {
278  Proto = (PROTOTYPE *) first_node ( ProtoList );
279  if (( Proto->Significant && WriteSigProtos ) ||
280  ( ! Proto->Significant && WriteInsigProtos ) )
281  WritePrototype( File, N, Proto );
282  }
283 } // WriteProtos
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:348
void WriteNormProtos(const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer)
Definition: cntraining.cpp:224
#define first_node(l)
Definition: oldlist.h:139
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
int main(int argc, char **argv)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
struct LABELEDLISTNODE * LABELEDLIST
void ParseArguments(int *argc, char ***argv)
DECLARE_STRING_PARAM_FLAG(D)
#define iterate(l)
Definition: oldlist.h:159
CLUSTERCONFIG Config
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void FreeNormProtoList(LIST CharList)
void FreeTrainingSamples(LIST CharList)
unsigned Significant
Definition: cluster.h:68
void WriteProtos(FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: cntraining.cpp:266
int MagicSamples
Definition: cluster.h:55
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:41
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:515
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:571
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
FLOAT32 MinSamples
Definition: cluster.h:50
unsigned short uinT16
Definition: host.h:101
#define NIL_LIST
Definition: oldlist.h:126
unsigned char BOOL8
Definition: host.h:113
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:543
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:79
Definition: strngs.h:44
const char * string() const
Definition: strngs.cpp:193
inT16 SampleSize
Definition: cluster.h:87
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
void WriteParamDesc(FILE *File, uinT16 N, PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:319
const char * GetNextFilename(int argc, const char *const *argv)