tesseract 3.04.01

classify/blobclass.cpp

Go to the documentation of this file.
00001 /******************************************************************************
00002  **      Filename:       blobclass.c
00003  **      Purpose:        High level blob classification and training routines.
00004  **      Author:         Dan Johnson
00005  **      History:        7/21/89, DSJ, Created.
00006  **
00007  **      (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 
00022 #include "blobclass.h"
00023 
00024 #include <stdio.h>
00025 
00026 #include "classify.h"
00027 #include "efio.h"
00028 #include "featdefs.h"
00029 #include "mf.h"
00030 #include "normfeat.h"
00031 
00032 static const char kUnknownFontName[] = "UnknownFont";
00033 
00034 STRING_VAR(classify_font_name, kUnknownFontName,
00035            "Default font name to be used in training");
00036 
00037 namespace tesseract {
00041 // Finds the name of the training font and returns it in fontname, by cutting
00042 // it out based on the expectation that the filename is of the form:
00043 // /path/to/dir/[lang].[fontname].exp[num]
00044 // The [lang], [fontname] and [num] fields should not have '.' characters.
00045 // If the global parameter classify_font_name is set, its value is used instead.
00046 void ExtractFontName(const STRING& filename, STRING* fontname) {
00047   *fontname = classify_font_name;
00048   if (*fontname == kUnknownFontName) {
00049     // filename is expected to be of the form [lang].[fontname].exp[num]
00050     // The [lang], [fontname] and [num] fields should not have '.' characters.
00051     const char *basename = strrchr(filename.string(), '/');
00052     const char *firstdot = strchr(basename ? basename : filename.string(), '.');
00053     const char *lastdot  = strrchr(filename.string(), '.');
00054     if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
00055       ++firstdot;
00056       *fontname = firstdot;
00057       fontname->truncate_at(lastdot - firstdot);
00058     }
00059   }
00060 }
00061 
00062 /*---------------------------------------------------------------------------*/
00063 // Extracts features from the given blob and saves them in the tr_file_data_
00064 // member variable.
00065 // fontname:  Name of font that this blob was printed in.
00066 // cn_denorm: Character normalization transformation to apply to the blob.
00067 // fx_info:   Character normalization parameters computed with cn_denorm.
00068 // blob_text: Ground truth text for the blob.
00069 void Classify::LearnBlob(const STRING& fontname, TBLOB* blob,
00070                          const DENORM& cn_denorm,
00071                          const INT_FX_RESULT_STRUCT& fx_info,
00072                          const char* blob_text) {
00073   CHAR_DESC CharDesc = NewCharDescription(feature_defs_);
00074   CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
00075   CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
00076   CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
00077   CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
00078 
00079   if (ValidCharDescription(feature_defs_, CharDesc)) {
00080     // Label the features with a class name and font name.
00081     tr_file_data_ += "\n";
00082     tr_file_data_ += fontname;
00083     tr_file_data_ += " ";
00084     tr_file_data_ += blob_text;
00085     tr_file_data_ += "\n";
00086 
00087     // write micro-features to file and clean up
00088     WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
00089   } else {
00090     tprintf("Blob learned was invalid!\n");
00091   }
00092   FreeCharDescription(CharDesc);
00093 }                                // LearnBlob
00094 
00095 // Writes stored training data to a .tr file based on the given filename.
00096 // Returns false on error.
00097 bool Classify::WriteTRFile(const STRING& filename) {
00098   STRING tr_filename = filename + ".tr";
00099   FILE* fp = Efopen(tr_filename.string(), "wb");
00100   int len = tr_file_data_.length();
00101   bool result =
00102       fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len;
00103   fclose(fp);
00104   tr_file_data_.truncate_at(0);
00105   return result;
00106 }
00107 
00108 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines