tesseract  3.04.01
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cntraining.cpp File Reference
#include "oldlist.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "clusttool.h"
#include "cluster.h"
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "unichar.h"
#include "commontraining.h"

Go to the source code of this file.

Macros

#define PROGRAM_FEATURE_TYPE   "cn"
 

Functions

 DECLARE_STRING_PARAM_FLAG (D)
 
int main (int argc, char **argv)
 
void WriteNormProtos (const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer)
 
void WriteProtos (FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
 
int main (int argc, char *argv[])
 

Variables

CLUSTERCONFIG CNConfig
 

Macro Definition Documentation

#define PROGRAM_FEATURE_TYPE   "cn"

Definition at line 41 of file cntraining.cpp.

Function Documentation

DECLARE_STRING_PARAM_FLAG ( D )
int main ( int  argc,
char **  argv 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

The result of this program is a binary inttemp file used by the OCR engine.

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
none
Note
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
History: Mon May 18 1998, Christy Russson, Revision started.

Definition at line 338 of file tesseractmain.cpp.

338  {
339  const char* lang = "eng";
340  const char* image = NULL;
341  const char* outputbase = NULL;
342  const char* datapath = NULL;
343  bool list_langs = false;
344  bool print_parameters = false;
345  GenericVector<STRING> vars_vec, vars_values;
346  int arg_i = 1;
348 
349  ParseArgs(argc, argv,
350  &lang, &image, &outputbase, &datapath,
351  &list_langs, &print_parameters,
352  &vars_vec, &vars_values, &arg_i, &pagesegmode);
353 
354  PERF_COUNT_START("Tesseract:main")
356 
357  api.SetOutputName(outputbase);
358 
359  int init_failed = api.Init(datapath, lang, tesseract::OEM_DEFAULT,
360  &(argv[arg_i]), argc - arg_i, &vars_vec, &vars_values, false);
361  if (init_failed) {
362  fprintf(stderr, "Could not initialize tesseract.\n");
363  exit(1);
364  }
365 
366  SetVariablesFromCLArgs(&api, argc, argv);
367 
368  if (list_langs) {
370  exit(0);
371  }
372 
373  if (print_parameters) {
374  FILE* fout = stdout;
375  fprintf(stdout, "Tesseract parameters:\n");
376  api.PrintVariables(fout);
377  api.End();
378  exit(0);
379  }
380 
381  FixPageSegMode(&api, pagesegmode);
382 
383  if (pagesegmode == tesseract::PSM_AUTO_ONLY) {
384  int ret_val = 0;
385 
386  Pix* pixs = pixRead(image);
387  if (!pixs) {
388  fprintf(stderr, "Cannot open input file: %s\n", image);
389  exit(2);
390  }
391 
392  api.SetImage(pixs);
393 
394  tesseract::Orientation orientation;
397  float deskew_angle;
398 
399  tesseract::PageIterator* it = api.AnalyseLayout();
400  if (it) {
401  it->Orientation(&orientation, &direction, &order, &deskew_angle);
402  tprintf("Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n" \
403  "Deskew angle: %.4f\n",
404  orientation, direction, order, deskew_angle);
405  } else {
406  ret_val = 1;
407  }
408 
409  delete it;
410 
411  pixDestroy(&pixs);
412  exit(ret_val);
413  }
414 
415  // set in_training_mode to true when using one of these configs:
416  // ambigs.train, box.train, box.train.stderr, linebox, rebox
417  bool b = false;
418  bool in_training_mode =
419  (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
420  (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
421  (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);
422 
424 
425  if (in_training_mode) {
426  renderers.push_back(NULL);
427  } else {
428  PreloadRenderers(&api, &renderers, pagesegmode, outputbase);
429  }
430 
431  if (!renderers.empty()) {
432  bool succeed = api.ProcessPages(image, NULL, 0, renderers[0]);
433  if (!succeed) {
434  fprintf(stderr, "Error during processing.\n");
435  exit(1);
436  }
437  }
438 
440  return 0; // Normal exit
441 }
int push_back(T *object)
void PreloadRenderers(tesseract::TessBaseAPI *api, tesseract::PointerVector< tesseract::TessResultRenderer > *renderers, tesseract::PageSegMode pagesegmode, const char *outputbase)
#define tprintf(...)
Definition: tprintf.h:31
void FixPageSegMode(tesseract::TessBaseAPI *api, tesseract::PageSegMode pagesegmode)
#define PERF_COUNT_END
Automatic page segmentation, but no OSD, or OCR.
Definition: publictypes.h:155
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
bool empty() const
Definition: genericvector.h:84
#define PERF_COUNT_START(FUNCT_NAME)
Fully automatic page segmentation, but no OSD.
Definition: publictypes.h:156
void Orientation(tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
void PrintLangsList(tesseract::TessBaseAPI *api)
void ParseArgs(const int argc, char **argv, const char **lang, const char **image, const char **outputbase, const char **datapath, bool *list_langs, bool *print_parameters, GenericVector< STRING > *vars_vec, GenericVector< STRING > *vars_values, int *arg_i, tesseract::PageSegMode *pagesegmode)
struct TessBaseAPI TessBaseAPI
Definition: capi.h:69
void SetVariablesFromCLArgs(tesseract::TessBaseAPI *api, int argc, char **argv)
int main ( int  argc,
char *  argv[] 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName CharName NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

It then appends these samples into a separate file for each character. The name of the file is

DirectoryName/FontName/CharName.FeatureTypeName

The DirectoryName can be specified via a command line argument. If not specified, it defaults to the current directory. The format of the resulting files is:

   NumberOfFeatures(M)
      Feature1
      ...
      FeatureM
   NumberOfFeatures(M)
   ...

The output files each have a header which describes the type of feature which the file contains. This header is in the format required by the clusterer. A command line argument can also be used to specify that only the first N samples of each class should be used.

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
none
Note
Globals: none
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.

Definition at line 137 of file cntraining.cpp.

138 {
139  // Set the global Config parameters before parsing the command line.
140  Config = CNConfig;
141 
142  const char *PageName;
143  FILE *TrainingPage;
144  LIST CharList = NIL_LIST;
145  CLUSTERER *Clusterer = NULL;
146  LIST ProtoList = NIL_LIST;
147  LIST NormProtoList = NIL_LIST;
148  LIST pCharList;
149  LABELEDLIST CharSample;
150  FEATURE_DEFS_STRUCT FeatureDefs;
151  InitFeatureDefs(&FeatureDefs);
152 
153  ParseArguments(&argc, &argv);
154  int num_fonts = 0;
155  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
156  printf("Reading %s ...\n", PageName);
157  TrainingPage = Efopen(PageName, "rb");
159  100, NULL, TrainingPage, &CharList);
160  fclose(TrainingPage);
161  ++num_fonts;
162  }
163  printf("Clustering ...\n");
164  // To allow an individual font to form a separate cluster,
165  // reduce the min samples:
166  // Config.MinSamples = 0.5 / num_fonts;
167  pCharList = CharList;
168  iterate(pCharList) {
169  //Cluster
170  if (Clusterer)
171  FreeClusterer(Clusterer);
172  CharSample = (LABELEDLIST)first_node(pCharList);
173  Clusterer =
174  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
175  float SavedMinSamples = Config.MinSamples;
176  // To disable the tendency to produce a single cluster for all fonts,
177  // make MagicSamples an impossible to achieve number:
178  // Config.MagicSamples = CharSample->SampleCount * 10;
179  Config.MagicSamples = CharSample->SampleCount;
180  while (Config.MinSamples > 0.001) {
181  ProtoList = ClusterSamples(Clusterer, &Config);
182  if (NumberOfProtos(ProtoList, 1, 0) > 0) {
183  break;
184  } else {
185  Config.MinSamples *= 0.95;
186  printf("0 significant protos for %s."
187  " Retrying clustering with MinSamples = %f%%\n",
188  CharSample->Label, Config.MinSamples);
189  }
190  }
191  Config.MinSamples = SavedMinSamples;
192  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
193  }
194  FreeTrainingSamples(CharList);
195  if (Clusterer == NULL) { // To avoid a SIGSEGV
196  fprintf(stderr, "Error: NULL clusterer!\n");
197  return 1;
198  }
199  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
200  FreeNormProtoList(NormProtoList);
201  FreeProtoList(&ProtoList);
202  FreeClusterer(Clusterer);
203  printf ("\n");
204  return 0;
205 } // main
void WriteNormProtos(const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer)
Definition: cntraining.cpp:224
#define first_node(l)
Definition: oldlist.h:139
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
struct LABELEDLISTNODE * LABELEDLIST
void ParseArguments(int *argc, char ***argv)
#define iterate(l)
Definition: oldlist.h:159
CLUSTERCONFIG Config
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void FreeNormProtoList(LIST CharList)
void FreeTrainingSamples(LIST CharList)
int MagicSamples
Definition: cluster.h:55
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:41
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:515
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:571
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
FLOAT32 MinSamples
Definition: cluster.h:50
#define NIL_LIST
Definition: oldlist.h:126
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:543
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:79
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
const char * GetNextFilename(int argc, const char *const *argv)
void WriteNormProtos ( const char *  Directory,
LIST  LabeledProtoList,
CLUSTERER *  Clusterer 
)

This routine writes the specified samples into files which are organized according to the font name and character name of the samples.

Parameters
Directory: directory to place sample files into
LabeledProtoList: List of labeled protos
Clusterer: The CLUSTERER to use
Returns
none
Note
Exceptions: none
History: Fri Aug 18 16:17:06 1989, DSJ, Created.

Definition at line 224 of file cntraining.cpp.

228 {
229  FILE *File;
230  STRING Filename;
231  LABELEDLIST LabeledProto;
232  int N;
233 
234  Filename = "";
235  if (Directory != NULL && Directory[0] != '\0')
236  {
237  Filename += Directory;
238  Filename += "/";
239  }
240  Filename += "normproto";
241  printf ("\nWriting %s ...", Filename.string());
242  File = Efopen (Filename.string(), "wb");
243  fprintf(File,"%0d\n",Clusterer->SampleSize);
244  WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
245  iterate(LabeledProtoList)
246  {
247  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
248  N = NumberOfProtos(LabeledProto->List, true, false);
249  if (N < 1) {
250  printf ("\nError! Not enough protos for %s: %d protos"
251  " (%d significant protos"
252  ", %d insignificant protos)\n",
253  LabeledProto->Label, N,
254  NumberOfProtos(LabeledProto->List, 1, 0),
255  NumberOfProtos(LabeledProto->List, 0, 1));
256  exit(1);
257  }
258  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
259  WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
260  }
261  fclose (File);
262 
263 } // WriteNormProtos
#define first_node(l)
Definition: oldlist.h:139
struct LABELEDLISTNODE * LABELEDLIST
#define iterate(l)
Definition: oldlist.h:159
void WriteProtos(FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: cntraining.cpp:266
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
PARAM_DESC * ParamDesc
Definition: cluster.h:88
Definition: strngs.h:44
const char * string() const
Definition: strngs.cpp:193
inT16 SampleSize
Definition: cluster.h:87
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
void WriteParamDesc(FILE *File, uinT16 N, PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:319
void WriteProtos ( FILE *  File,
uinT16  N,
LIST  ProtoList,
BOOL8  WriteSigProtos,
BOOL8  WriteInsigProtos 
)

Definition at line 266 of file cntraining.cpp.

272 {
273  PROTOTYPE *Proto;
274 
275  // write prototypes
276  iterate(ProtoList)
277  {
278  Proto = (PROTOTYPE *) first_node ( ProtoList );
279  if (( Proto->Significant && WriteSigProtos ) ||
280  ( ! Proto->Significant && WriteInsigProtos ) )
281  WritePrototype( File, N, Proto );
282  }
283 } // WriteProtos
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:348
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
unsigned Significant
Definition: cluster.h:68

Variable Documentation

CLUSTERCONFIG CNConfig
Initial value:
=
{
elliptical, 0.025, 0.05, 0.8, 1e-3, 0
}

Definition at line 79 of file cntraining.cpp.