tesseract 3.04.01

training/cntraining.cpp File Reference

#include "oldlist.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "clusttool.h"
#include "cluster.h"
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "unichar.h"
#include "commontraining.h"

Go to the source code of this file.

Defines

#define PROGRAM_FEATURE_TYPE   "cn"

Functions

 DECLARE_STRING_PARAM_FLAG (D)
int main (int argc, char **argv)
void WriteNormProtos (const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer)
void WriteProtos (FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
int main (int argc, char *argv[])

Variables

CLUSTERCONFIG CNConfig

Define Documentation

#define PROGRAM_FEATURE_TYPE   "cn"

Definition at line 41 of file cntraining.cpp.


Function Documentation

DECLARE_STRING_PARAM_FLAG ( )
int main ( int  argc,
char *  argv[] 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName CharName NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

It then appends these samples into a separate file for each character. The name of the file is

DirectoryName/FontName/CharName.FeatureTypeName

The DirectoryName can be specified via a command line argument. If not specified, it defaults to the current directory. The format of the resulting files is:

   NumberOfFeatures(M)
      Feature1
      ...
      FeatureM
   NumberOfFeatures(M)
   ...

The output files each have a header which describes the type of feature which the file contains. This header is in the format required by the clusterer. A command line argument can also be used to specify that only the first N samples of each class should be used.

Parameters:
argcnumber of command line arguments
argvarray of command line arguments
Returns:
none
Note:
Globals: none
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.

Definition at line 137 of file cntraining.cpp.

{
  // Set the global Config parameters before parsing the command line.
  Config = CNConfig;

  const char  *PageName;
  FILE  *TrainingPage;
  LIST  CharList = NIL_LIST;
  CLUSTERER  *Clusterer = NULL;
  LIST    ProtoList = NIL_LIST;
  LIST    NormProtoList = NIL_LIST;
  LIST pCharList;
  LABELEDLIST CharSample;
  FEATURE_DEFS_STRUCT FeatureDefs;
  InitFeatureDefs(&FeatureDefs);

  ParseArguments(&argc, &argv);
  int num_fonts = 0;
  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
    printf("Reading %s ...\n", PageName);
    TrainingPage = Efopen(PageName, "rb");
    ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE,
                        100, NULL, TrainingPage, &CharList);
    fclose(TrainingPage);
    ++num_fonts;
  }
  printf("Clustering ...\n");
  // To allow an individual font to form a separate cluster,
  // reduce the min samples:
  // Config.MinSamples = 0.5 / num_fonts;
  pCharList = CharList;
  iterate(pCharList) {
    //Cluster
    if (Clusterer)
       FreeClusterer(Clusterer);
    CharSample = (LABELEDLIST)first_node(pCharList);
    Clusterer =
      SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
    float SavedMinSamples = Config.MinSamples;
    // To disable the tendency to produce a single cluster for all fonts,
    // make MagicSamples an impossible to achieve number:
    // Config.MagicSamples = CharSample->SampleCount * 10;
    Config.MagicSamples = CharSample->SampleCount;
    while (Config.MinSamples > 0.001) {
      ProtoList = ClusterSamples(Clusterer, &Config);
      if (NumberOfProtos(ProtoList, 1, 0) > 0) {
        break;
      } else {
        Config.MinSamples *= 0.95;
        printf("0 significant protos for %s."
               " Retrying clustering with MinSamples = %f%%\n",
               CharSample->Label, Config.MinSamples);
      }
    }
    Config.MinSamples = SavedMinSamples;
    AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
  }
  FreeTrainingSamples(CharList);
  if (Clusterer == NULL) { // To avoid a SIGSEGV
    fprintf(stderr, "Error: NULL clusterer!\n");
    return 1;
  }
  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
  FreeNormProtoList(NormProtoList);
  FreeProtoList(&ProtoList);
  FreeClusterer(Clusterer);
  printf ("\n");
  return 0;
}  // main
int main ( int  argc,
char **  argv 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

      FontName UTF8-char-str xmin ymin xmax ymax page-number
       NumberOfFeatureTypes(N)
         FeatureTypeName1 NumberOfFeatures(M)
            Feature1
            ...
            FeatureM
         FeatureTypeName2 NumberOfFeatures(M)
            Feature1
            ...
            FeatureM
         ...
         FeatureTypeNameN NumberOfFeatures(M)
            Feature1
            ...
            FeatureM
      FontName CharName ...
    

The result of this program is a binary inttemp file used by the OCR engine.

Parameters:
argcnumber of command line arguments
argvarray of command line arguments
Returns:
none
Note:
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
History: Mon May 18 1998, Christy Russson, Revistion started.

Definition at line 338 of file tesseractmain.cpp.

                                {
  const char* lang = "eng";
  const char* image = NULL;
  const char* outputbase = NULL;
  const char* datapath = NULL;
  bool list_langs = false;
  bool print_parameters = false;
  GenericVector<STRING> vars_vec, vars_values;
  int arg_i = 1;
  tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;

  ParseArgs(argc, argv,
          &lang, &image, &outputbase, &datapath,
          &list_langs, &print_parameters,
          &vars_vec, &vars_values, &arg_i, &pagesegmode);

  PERF_COUNT_START("Tesseract:main")
  tesseract::TessBaseAPI api;

  api.SetOutputName(outputbase);

  int init_failed = api.Init(datapath, lang, tesseract::OEM_DEFAULT,
                &(argv[arg_i]), argc - arg_i, &vars_vec, &vars_values, false);
  if (init_failed) {
    fprintf(stderr, "Could not initialize tesseract.\n");
    exit(1);
  }

  SetVariablesFromCLArgs(&api, argc, argv);

  if (list_langs) {
     PrintLangsList(&api);
     exit(0);
  }

  if (print_parameters) {
     FILE* fout = stdout;
     fprintf(stdout, "Tesseract parameters:\n");
     api.PrintVariables(fout);
     api.End();
     exit(0);
  }

  FixPageSegMode(&api, pagesegmode);

  if (pagesegmode == tesseract::PSM_AUTO_ONLY) {
    int ret_val = 0;

    Pix* pixs = pixRead(image);
    if (!pixs) {
      fprintf(stderr, "Cannot open input file: %s\n", image);
      exit(2);
    }

    api.SetImage(pixs);

    tesseract::Orientation orientation;
    tesseract::WritingDirection direction;
    tesseract::TextlineOrder order;
    float deskew_angle;

    tesseract::PageIterator* it =  api.AnalyseLayout();
    if (it) {
      it->Orientation(&orientation, &direction, &order, &deskew_angle);
      tprintf("Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n" \
             "Deskew angle: %.4f\n",
              orientation, direction, order, deskew_angle);
    } else {
      ret_val = 1;
    }

    delete it;

    pixDestroy(&pixs);
    exit(ret_val);
  }

  // set in_training_mode to true when using one of these configs:
  // ambigs.train, box.train, box.train.stderr, linebox, rebox
  bool b = false;
  bool in_training_mode =
        (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
        (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
        (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);

  tesseract::PointerVector<tesseract::TessResultRenderer> renderers;

  if (in_training_mode) {
    renderers.push_back(NULL);
  } else {
    PreloadRenderers(&api, &renderers, pagesegmode, outputbase);
  }

  if (!renderers.empty()) {
    bool succeed = api.ProcessPages(image, NULL, 0, renderers[0]);
    if (!succeed) {
      fprintf(stderr, "Error during processing.\n");
      exit(1);
    }
  }

  PERF_COUNT_END
  return 0;                      // Normal exit
}
void WriteNormProtos ( const char *  Directory,
LIST  LabeledProtoList,
CLUSTERER Clusterer 
)

This routine writes the specified samples into files which are organized according to the font name and character name of the samples.

Parameters:
Directorydirectory to place sample files into
LabeledProtoListList of labeled protos
ClustererThe CLUSTERER to use
Returns:
none
Note:
Exceptions: none
History: Fri Aug 18 16:17:06 1989, DSJ, Created.

Definition at line 224 of file cntraining.cpp.

{
  FILE    *File;
  STRING Filename;
  LABELEDLIST LabeledProto;
  int N;

  Filename = "";
  if (Directory != NULL && Directory[0] != '\0')
  {
    Filename += Directory;
    Filename += "/";
  }
  Filename += "normproto";
  printf ("\nWriting %s ...", Filename.string());
  File = Efopen (Filename.string(), "wb");
  fprintf(File,"%0d\n",Clusterer->SampleSize);
  WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
  iterate(LabeledProtoList)
  {
    LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
    N = NumberOfProtos(LabeledProto->List, true, false);
    if (N < 1) {
      printf ("\nError! Not enough protos for %s: %d protos"
              " (%d significant protos"
              ", %d insignificant protos)\n",
              LabeledProto->Label, N,
              NumberOfProtos(LabeledProto->List, 1, 0),
              NumberOfProtos(LabeledProto->List, 0, 1));
      exit(1);
    }
    fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
    WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
  }
  fclose (File);

}  // WriteNormProtos
void WriteProtos ( FILE *  File,
uinT16  N,
LIST  ProtoList,
BOOL8  WriteSigProtos,
BOOL8  WriteInsigProtos 
)

Definition at line 266 of file cntraining.cpp.

{
  PROTOTYPE  *Proto;

  // write prototypes
  iterate(ProtoList)
  {
    Proto = (PROTOTYPE *) first_node ( ProtoList );
    if (( Proto->Significant && WriteSigProtos )  ||
      ( ! Proto->Significant && WriteInsigProtos ) )
      WritePrototype( File, N, Proto );
  }
}  // WriteProtos

Variable Documentation

Initial value:
{
  elliptical, 0.025, 0.05, 0.8, 1e-3, 0
}

Definition at line 79 of file cntraining.cpp.

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines