|
tesseract 3.04.01
|
00001 /****************************************************************************** 00002 ** Filename: blobclass.c 00003 ** Purpose: High level blob classification and training routines. 00004 ** Author: Dan Johnson 00005 ** History: 7/21/89, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00018 00022 #include "blobclass.h" 00023 00024 #include <stdio.h> 00025 00026 #include "classify.h" 00027 #include "efio.h" 00028 #include "featdefs.h" 00029 #include "mf.h" 00030 #include "normfeat.h" 00031 00032 static const char kUnknownFontName[] = "UnknownFont"; 00033 00034 STRING_VAR(classify_font_name, kUnknownFontName, 00035 "Default font name to be used in training"); 00036 00037 namespace tesseract { 00041 // Finds the name of the training font and returns it in fontname, by cutting 00042 // it out based on the expectation that the filename is of the form: 00043 // /path/to/dir/[lang].[fontname].exp[num] 00044 // The [lang], [fontname] and [num] fields should not have '.' characters. 00045 // If the global parameter classify_font_name is set, its value is used instead. 00046 void ExtractFontName(const STRING& filename, STRING* fontname) { 00047 *fontname = classify_font_name; 00048 if (*fontname == kUnknownFontName) { 00049 // filename is expected to be of the form [lang].[fontname].exp[num] 00050 // The [lang], [fontname] and [num] fields should not have '.' characters. 00051 const char *basename = strrchr(filename.string(), '/'); 00052 const char *firstdot = strchr(basename ? basename : filename.string(), '.'); 00053 const char *lastdot = strrchr(filename.string(), '.'); 00054 if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) { 00055 ++firstdot; 00056 *fontname = firstdot; 00057 fontname->truncate_at(lastdot - firstdot); 00058 } 00059 } 00060 } 00061 00062 /*---------------------------------------------------------------------------*/ 00063 // Extracts features from the given blob and saves them in the tr_file_data_ 00064 // member variable. 00065 // fontname: Name of font that this blob was printed in. 00066 // cn_denorm: Character normalization transformation to apply to the blob. 00067 // fx_info: Character normalization parameters computed with cn_denorm. 00068 // blob_text: Ground truth text for the blob. 00069 void Classify::LearnBlob(const STRING& fontname, TBLOB* blob, 00070 const DENORM& cn_denorm, 00071 const INT_FX_RESULT_STRUCT& fx_info, 00072 const char* blob_text) { 00073 CHAR_DESC CharDesc = NewCharDescription(feature_defs_); 00074 CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm); 00075 CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info); 00076 CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info); 00077 CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info); 00078 00079 if (ValidCharDescription(feature_defs_, CharDesc)) { 00080 // Label the features with a class name and font name. 00081 tr_file_data_ += "\n"; 00082 tr_file_data_ += fontname; 00083 tr_file_data_ += " "; 00084 tr_file_data_ += blob_text; 00085 tr_file_data_ += "\n"; 00086 00087 // write micro-features to file and clean up 00088 WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_); 00089 } else { 00090 tprintf("Blob learned was invalid!\n"); 00091 } 00092 FreeCharDescription(CharDesc); 00093 } // LearnBlob 00094 00095 // Writes stored training data to a .tr file based on the given filename. 00096 // Returns false on error. 00097 bool Classify::WriteTRFile(const STRING& filename) { 00098 STRING tr_filename = filename + ".tr"; 00099 FILE* fp = Efopen(tr_filename.string(), "wb"); 00100 int len = tr_file_data_.length(); 00101 bool result = 00102 fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len; 00103 fclose(fp); 00104 tr_file_data_.truncate_at(0); 00105 return result; 00106 } 00107 00108 } // namespace tesseract.