tesseract 3.04.01

training/classifier_tester.cpp

Go to the documentation of this file.
00001 // Copyright 2011 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00003 
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 
00014 //  Filename: classifier_tester.cpp
00015 //  Purpose:  Tests a character classifier on data as formatted for training,
00016 //            but doesn't have to be the same as the training data.
00017 //  Author:   Ray Smith
00018 
00019 #include <stdio.h>
00020 #ifndef USE_STD_NAMESPACE
00021 #include "base/commandlineflags.h"
00022 #endif  // USE_STD_NAMESPACE
00023 #include "baseapi.h"
00024 #include "commontraining.h"
00025 #ifndef NO_CUBE_BUILD
00026 #include "cubeclassifier.h"
00027 #endif  // NO_CUBE_BUILD
00028 #include "mastertrainer.h"
00029 #include "params.h"
00030 #include "strngs.h"
00031 #include "tessclassifier.h"
00032 
00033 STRING_PARAM_FLAG(classifier, "", "Classifier to test");
00034 STRING_PARAM_FLAG(lang, "eng", "Language to test");
00035 STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
00036 DECLARE_INT_PARAM_FLAG(debug_level);
00037 DECLARE_STRING_PARAM_FLAG(T);
00038 
// Selects which classifier implementation is put under test.
// The enumerators are used as indices into the names table, so the two must
// be kept in the same order.
enum ClassifierName {
  CN_PRUNER = 0,  // Tesseract class pruner only.
  CN_FULL,        // Tesseract full classifier.
#ifndef NO_CUBE_BUILD
  CN_CUBE,        // Cube classifier.
  CN_CUBETESS,    // Tesseract class pruner with rescoring by Cube.
#endif  // NO_CUBE_BUILD
  CN_COUNT        // Number of classifiers; doubles as the "no match" value.
};
00048 
// Command-line spelling of each classifier, indexed by ClassifierName.
// The table is closed by a NULL entry that sits at index CN_COUNT.
const char* names[] = {
  "pruner",
  "full",
#ifndef NO_CUBE_BUILD
  "cube",
  "cubetess",
#endif  // NO_CUBE_BUILD
  NULL
};
00054 
00055 static tesseract::ShapeClassifier* InitializeClassifier(
00056     const char* classifer_name, const UNICHARSET& unicharset,
00057     int argc, char **argv,
00058     tesseract::TessBaseAPI** api) {
00059   // Decode the classifier string.
00060   ClassifierName classifier = CN_COUNT;
00061   for (int c = 0; c < CN_COUNT; ++c) {
00062     if (strcmp(classifer_name, names[c]) == 0) {
00063       classifier = static_cast<ClassifierName>(c);
00064       break;
00065     }
00066   }
00067   if (classifier == CN_COUNT) {
00068     fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
00069     return NULL;
00070   }
00071 
00072   // We need to initialize tesseract to test.
00073   *api = new tesseract::TessBaseAPI;
00074   tesseract::OcrEngineMode engine_mode = tesseract::OEM_TESSERACT_ONLY;
00075 #ifndef NO_CUBE_BUILD
00076   if (classifier == CN_CUBE || classifier == CN_CUBETESS)
00077     engine_mode = tesseract::OEM_TESSERACT_CUBE_COMBINED;
00078 #endif  // NO_CUBE_BUILD
00079   tesseract::Tesseract* tesseract = NULL;
00080   tesseract::Classify* classify = NULL;
00081   if (
00082 #ifndef NO_CUBE_BUILD
00083     classifier == CN_CUBE || classifier == CN_CUBETESS ||
00084 #endif  // NO_CUBE_BUILD
00085       classifier == CN_PRUNER || classifier == CN_FULL) {
00086 #ifndef NO_CUBE_BUILD
00087     (*api)->SetVariable("cube_debug_level", "2");
00088 #endif  // NO_CUBE_BUILD
00089     if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
00090                  engine_mode) < 0) {
00091       fprintf(stderr, "Tesseract initialization failed!\n");
00092       return NULL;
00093     }
00094     tesseract = const_cast<tesseract::Tesseract*>((*api)->tesseract());
00095     classify = reinterpret_cast<tesseract::Classify*>(tesseract);
00096     if (classify->shape_table() == NULL) {
00097       fprintf(stderr, "Tesseract must contain a ShapeTable!\n");
00098       return NULL;
00099     }
00100   }
00101   tesseract::ShapeClassifier* shape_classifier = NULL;
00102 
00103   if (!FLAGS_T.empty()) {
00104     const char* config_name;
00105     while ((config_name = GetNextFilename(argc, argv)) != NULL) {
00106       tprintf("Reading config file %s ...\n", config_name);
00107       (*api)->ReadConfigFile(config_name);
00108     }
00109   }
00110   if (classifier == CN_PRUNER) {
00111     shape_classifier = new tesseract::TessClassifier(true, classify);
00112   } else if (classifier == CN_FULL) {
00113     shape_classifier = new tesseract::TessClassifier(false, classify);
00114 #ifndef NO_CUBE_BUILD
00115   } else if (classifier == CN_CUBE) {
00116     shape_classifier = new tesseract::CubeClassifier(tesseract);
00117   } else if (classifier == CN_CUBETESS) {
00118     shape_classifier = new tesseract::CubeTessClassifier(tesseract);
00119 #endif  // NO_CUBE_BUILD
00120   } else {
00121     fprintf(stderr, "%s tester not yet implemented\n", classifer_name);
00122     return NULL;
00123   }
00124   tprintf("Testing classifier %s:\n", classifer_name);
00125   return shape_classifier;
00126 }
00127 
00128 // This program has complex setup requirements, so here is some help:
00129 // Two different modes, tr files and serialized mastertrainer.
00130 // From tr files:
00131 //   classifier_tester -U unicharset -F font_properties -X xheights
00132 //     -classifier x -lang lang [-output_trainer trainer] *.tr
00133 // From a serialized trainer:
00134 //  classifier_tester -input_trainer trainer [-lang lang] -classifier x
00135 //
00136 // In the first case, the unicharset must be the unicharset from within
00137 // the classifier under test, and the font_properties and xheights files must
00138 // match the files used during training.
00139 // In the second case, the trainer file must have been prepared from
00140 // some previous run of shapeclustering, mftraining, or classifier_tester
00140 // using the same conditions as above, i.e. matching unicharset/font_properties.
00142 //
00143 // Available values of classifier (x above) are:
00144 // pruner   : Tesseract class pruner only.
00145 // full     : Tesseract full classifier.
00146 // cube     : Cube classifier. (Not possible with an input trainer.)
00147 // cubetess : Tesseract class pruner with rescoring by Cube.  (Not possible
00148 //            with an input trainer.)
00149 int main(int argc, char **argv) {
00150   ParseArguments(&argc, &argv);
00151   STRING file_prefix;
00152   tesseract::MasterTrainer* trainer = tesseract::LoadTrainingData(
00153       argc, argv, false, NULL, &file_prefix);
00154   tesseract::TessBaseAPI* api;
00155   // Decode the classifier string.
00156   tesseract::ShapeClassifier* shape_classifier = InitializeClassifier(
00157       FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api);
00158   if (shape_classifier == NULL) {
00159     fprintf(stderr, "Classifier init failed!:%s\n", FLAGS_classifier.c_str());
00160     return 1;
00161   }
00162 
00163   // We want to test junk as well if it is available.
00164   // trainer->IncludeJunk();
00165   // We want to test with replicated samples too.
00166   trainer->ReplicateAndRandomizeSamplesIfRequired();
00167 
00168   trainer->TestClassifierOnSamples(tesseract:: CT_UNICHAR_TOP1_ERR,
00169                                    MAX(3, FLAGS_debug_level), false,
00170                                    shape_classifier, NULL);
00171   delete shape_classifier;
00172   delete api;
00173   delete trainer;
00174 
00175   return 0;
00176 } /* main */
00177 
00178 
00179 
00180 
00181 
00182 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines