tesseract 3.04.01

training/ambiguous_words.cpp File Reference

#include <stdio.h>
#include "baseapi.h"
#include "helpers.h"
#include "strngs.h"
#include "dict.h"
#include "tesseractclass.h"

Go to the source code of this file.

Functions

int main (int argc, char **argv)

Function Documentation

int main ( int  argc,
char **  argv 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

      FontName UTF8-char-str xmin ymin xmax ymax page-number
       NumberOfFeatureTypes(N)
         FeatureTypeName1 NumberOfFeatures(M)
            Feature1
            ...
            FeatureM
         FeatureTypeName2 NumberOfFeatures(M)
            Feature1
            ...
            FeatureM
         ...
         FeatureTypeNameN NumberOfFeatures(M)
            Feature1
            ...
            FeatureM
      FontName CharName ...
    

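The result of this program is a binary inttemp file used by the OCR engine.

(Note: the description above appears to have been carried over from a different training tool. The code listed below takes a tessdata directory, a word list file and an output file name, reads the word list line by line, and calls Dict::NoDangerousAmbig() on each word so that ambiguities are recorded in the output file.)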

Parameters:
argc: number of command line arguments
argv: array of command line arguments
Returns:
none
Note:
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
History: Mon May 18 1998, Christy Russson, Revision started.

Definition at line 32 of file ambiguous_words.cpp.

                                {
  // Reads a word list and records the ambiguities found for each word.
  //
  // Command line: [-l lang] tessdata_dir wordlist_file output_file
  //   argc  number of command line arguments (4, or 6 when "-l lang" is given)
  //   argv  array of command line arguments
  // Returns 0 on success, 1 on a usage error, when Tesseract cannot be
  // initialized, or when the word list file cannot be opened.

  // Parse input arguments.
  if (argc != 4 && (argc != 6 || strcmp(argv[1], "-l") != 0)) {
    printf("Usage: %s [-l lang] tessdata_dir wordlist_file"
           " output_ambiguious_wordlist_file\n", argv[0]);
    return 1;
  }
  // When "-l lang" is present the positional arguments are shifted by two.
  int argv_offset = 0;
  STRING lang;
  if (argc == 6) {
    lang = argv[2];
    argv_offset = 2;
  } else {
    lang = "eng";
  }
  const char *tessdata_dir = argv[++argv_offset];
  const char *input_file_str = argv[++argv_offset];
  const char *output_file_str = argv[++argv_offset];

  // Initialize Tesseract, passing the output path through the
  // output_ambig_words_file variable.
  tesseract::TessBaseAPI api;
  GenericVector<STRING> vars_vec;
  GenericVector<STRING> vars_values;
  vars_vec.push_back("output_ambig_words_file");
  vars_values.push_back(output_file_str);
  // Init() reports failure through a non-zero return value; without this
  // check a failed initialization would go on to use the dictionary of an
  // engine that was never set up.
  if (api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY,
               NULL, 0, &vars_vec, &vars_values, false) != 0) {
    tprintf("Failed to initialize Tesseract from %s for language %s\n",
            tessdata_dir, lang.string());
    return 1;
  }
  tesseract::Dict &dict = api.tesseract()->getDict();
  FILE *input_file = fopen(input_file_str, "rb");
  if (input_file == NULL) {
    tprintf("Failed to open input wordlist file %s\n", input_file_str);
    // Return instead of exit() so that the TessBaseAPI destructor runs.
    return 1;
  }
  char str[CHARS_PER_LINE];

  // Read word list and call Dict::NoDangerousAmbig() for each word
  // to record ambiguities in the output file.
  while (fgets(str, CHARS_PER_LINE, input_file) != NULL) {
    chomp_string(str);  // remove newline
    WERD_CHOICE word(str, dict.getUnicharset());
    dict.NoDangerousAmbig(&word, NULL, false, NULL);
  }
  // Clean up.
  fclose(input_file);
  return 0;
}
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines