|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: tessedit.cpp (Formerly tessedit.c) 00003 * Description: (Previously) Main program for merge of tess and editor. 00004 * Now just code to load the language model and various 00005 * engine-specific data files. 00006 * Author: Ray Smith 00007 * Created: Tue Jan 07 15:21:46 GMT 1992 00008 * 00009 * (C) Copyright 1992, Hewlett-Packard Ltd. 00010 ** Licensed under the Apache License, Version 2.0 (the "License"); 00011 ** you may not use this file except in compliance with the License. 00012 ** You may obtain a copy of the License at 00013 ** http://www.apache.org/licenses/LICENSE-2.0 00014 ** Unless required by applicable law or agreed to in writing, software 00015 ** distributed under the License is distributed on an "AS IS" BASIS, 00016 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00017 ** See the License for the specific language governing permissions and 00018 ** limitations under the License. 00019 * 00020 **********************************************************************/ 00021 00022 // Include automatically generated configuration file if running autoconf. 00023 #ifdef HAVE_CONFIG_H 00024 #include "config_auto.h" 00025 #endif 00026 00027 #include "stderr.h" 00028 #include "basedir.h" 00029 #include "tessvars.h" 00030 #include "control.h" 00031 #include "reject.h" 00032 #include "pageres.h" 00033 #include "nwmain.h" 00034 #include "pgedit.h" 00035 #include "tprintf.h" 00036 #include "tessedit.h" 00037 #include "stopper.h" 00038 #include "intmatcher.h" 00039 #include "chop.h" 00040 #include "efio.h" 00041 #include "danerror.h" 00042 #include "globals.h" 00043 #include "tesseractclass.h" 00044 #include "params.h" 00045 00046 #define VARDIR "configs/" /*variables files */ 00047 //config under api 00048 #define API_CONFIG "configs/api_config" 00049 00050 ETEXT_DESC *global_monitor = NULL; // progress monitor 00051 00052 namespace tesseract { 00053 00054 // Read a "config" file containing a set of variable, value pairs. 00055 // Searches the standard places: tessdata/configs, tessdata/tessconfigs 00056 // and also accepts a relative or absolute path name. 00057 void Tesseract::read_config_file(const char *filename, 00058 SetParamConstraint constraint) { 00059 STRING path = datadir; 00060 path += "configs/"; 00061 path += filename; 00062 FILE* fp; 00063 if ((fp = fopen(path.string(), "rb")) != NULL) { 00064 fclose(fp); 00065 } else { 00066 path = datadir; 00067 path += "tessconfigs/"; 00068 path += filename; 00069 if ((fp = fopen(path.string(), "rb")) != NULL) { 00070 fclose(fp); 00071 } else { 00072 path = filename; 00073 } 00074 } 00075 ParamUtils::ReadParamsFile(path.string(), constraint, this->params()); 00076 } 00077 00078 // Returns false if a unicharset file for the specified language was not found 00079 // or was invalid. 00080 // This function initializes TessdataManager. After TessdataManager is 00081 // no longer needed, TessdataManager::End() should be called. 00082 // 00083 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless 00084 // it is OEM_DEFAULT, in which case the value of the variable will be obtained 00085 // from the language-specific config file (stored in [lang].traineddata), from 00086 // the config files specified on the command line or left as the default 00087 // OEM_TESSERACT_ONLY if none of the configs specify this variable. 00088 bool Tesseract::init_tesseract_lang_data( 00089 const char *arg0, const char *textbase, const char *language, 00090 OcrEngineMode oem, char **configs, int configs_size, 00091 const GenericVector<STRING> *vars_vec, 00092 const GenericVector<STRING> *vars_values, 00093 bool set_only_non_debug_params) { 00094 // Set the basename, compute the data directory. 00095 main_setup(arg0, textbase); 00096 00097 // Set the language data path prefix 00098 lang = language != NULL ? language : "eng"; 00099 language_data_path_prefix = datadir; 00100 language_data_path_prefix += lang; 00101 language_data_path_prefix += "."; 00102 00103 // Initialize TessdataManager. 00104 STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix; 00105 if (!tessdata_manager.Init(tessdata_path.string(), 00106 tessdata_manager_debug_level)) { 00107 return false; 00108 } 00109 00110 // If a language specific config file (lang.config) exists, load it in. 00111 if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) { 00112 ParamUtils::ReadParamsFromFp( 00113 tessdata_manager.GetDataFilePtr(), 00114 tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG), 00115 SET_PARAM_CONSTRAINT_NONE, this->params()); 00116 if (tessdata_manager_debug_level) { 00117 tprintf("Loaded language config file\n"); 00118 } 00119 } 00120 00121 SetParamConstraint set_params_constraint = set_only_non_debug_params ? 00122 SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE; 00123 // Load tesseract variables from config files. This is done after loading 00124 // language-specific variables from [lang].traineddata file, so that custom 00125 // config files can override values in [lang].traineddata file. 00126 for (int i = 0; i < configs_size; ++i) { 00127 read_config_file(configs[i], set_params_constraint); 00128 } 00129 00130 // Set params specified in vars_vec (done after setting params from config 00131 // files, so that params in vars_vec can override those from files). 00132 if (vars_vec != NULL && vars_values != NULL) { 00133 for (int i = 0; i < vars_vec->size(); ++i) { 00134 if (!ParamUtils::SetParam((*vars_vec)[i].string(), 00135 (*vars_values)[i].string(), 00136 set_params_constraint, this->params())) { 00137 tprintf("Error setting param %s\n", (*vars_vec)[i].string()); 00138 exit(1); 00139 } 00140 } 00141 } 00142 00143 if (((STRING &)tessedit_write_params_to_file).length() > 0) { 00144 FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb"); 00145 if (params_file != NULL) { 00146 ParamUtils::PrintParams(params_file, this->params()); 00147 fclose(params_file); 00148 if (tessdata_manager_debug_level > 0) { 00149 tprintf("Wrote parameters to %s\n", 00150 tessedit_write_params_to_file.string()); 00151 } 00152 } else { 00153 tprintf("Failed to open %s for writing params.\n", 00154 tessedit_write_params_to_file.string()); 00155 } 00156 } 00157 00158 // Determine which ocr engine(s) should be loaded and used for recognition. 00159 if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem); 00160 if (tessdata_manager_debug_level) { 00161 tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n", 00162 static_cast<int>(tessedit_ocr_engine_mode)); 00163 } 00164 00165 // If we are only loading the config file (and so not planning on doing any 00166 // recognition) then there's nothing else do here. 00167 if (tessedit_init_config_only) { 00168 if (tessdata_manager_debug_level) { 00169 tprintf("Returning after loading config file\n"); 00170 } 00171 return true; 00172 } 00173 00174 // Load the unicharset 00175 if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) || 00176 !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) { 00177 return false; 00178 } 00179 if (unicharset.size() > MAX_NUM_CLASSES) { 00180 tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); 00181 return false; 00182 } 00183 if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n"); 00184 right_to_left_ = unicharset.major_right_to_left(); 00185 00186 // Setup initial unichar ambigs table and read universal ambigs. 00187 UNICHARSET encoder_unicharset; 00188 encoder_unicharset.CopyFrom(unicharset); 00189 unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption); 00190 unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset); 00191 00192 if (!tessedit_ambigs_training && 00193 tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) { 00194 TFile ambigs_file; 00195 ambigs_file.Open(tessdata_manager.GetDataFilePtr(), 00196 tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1); 00197 unichar_ambigs.LoadUnicharAmbigs( 00198 encoder_unicharset, 00199 &ambigs_file, 00200 ambigs_debug_level, use_ambigs_for_adaption, &unicharset); 00201 if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n"); 00202 } 00203 00204 // The various OcrEngineMode settings (see publictypes.h) determine which 00205 // engine-specific data files need to be loaded. Currently everything needs 00206 // the base tesseract data, which supplies other useful information, but 00207 // alternative engines, such as cube and LSTM are optional. 00208 #ifndef NO_CUBE_BUILD 00209 if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { 00210 ASSERT_HOST(init_cube_objects(false, &tessdata_manager)); 00211 if (tessdata_manager_debug_level) 00212 tprintf("Loaded Cube w/out combiner\n"); 00213 } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { 00214 ASSERT_HOST(init_cube_objects(true, &tessdata_manager)); 00215 if (tessdata_manager_debug_level) 00216 tprintf("Loaded Cube with combiner\n"); 00217 } 00218 #endif 00219 // Init ParamsModel. 00220 // Load pass1 and pass2 weights (for now these two sets are the same, but in 00221 // the future separate sets of weights can be generated). 00222 for (int p = ParamsModel::PTRAIN_PASS1; 00223 p < ParamsModel::PTRAIN_NUM_PASSES; ++p) { 00224 language_model_->getParamsModel().SetPass( 00225 static_cast<ParamsModel::PassEnum>(p)); 00226 if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) { 00227 if (!language_model_->getParamsModel().LoadFromFp( 00228 lang.string(), tessdata_manager.GetDataFilePtr(), 00229 tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) { 00230 return false; 00231 } 00232 } 00233 } 00234 if (tessdata_manager_debug_level) language_model_->getParamsModel().Print(); 00235 00236 return true; 00237 } 00238 00239 // Helper returns true if the given string is in the vector of strings. 00240 static bool IsStrInList(const STRING& str, 00241 const GenericVector<STRING>& str_list) { 00242 for (int i = 0; i < str_list.size(); ++i) { 00243 if (str_list[i] == str) 00244 return true; 00245 } 00246 return false; 00247 } 00248 00249 // Parse a string of the form [~]<lang>[+[~]<lang>]*. 00250 // Langs with no prefix get appended to to_load, provided they 00251 // are not in there already. 00252 // Langs with ~ prefix get appended to not_to_load, provided they are not in 00253 // there already. 00254 void Tesseract::ParseLanguageString(const char* lang_str, 00255 GenericVector<STRING>* to_load, 00256 GenericVector<STRING>* not_to_load) { 00257 STRING remains(lang_str); 00258 while (remains.length() > 0) { 00259 // Find the start of the lang code and which vector to add to. 00260 const char* start = remains.string(); 00261 while (*start == '+') 00262 ++start; 00263 GenericVector<STRING>* target = to_load; 00264 if (*start == '~') { 00265 target = not_to_load; 00266 ++start; 00267 } 00268 // Find the index of the end of the lang code in string start. 00269 int end = strlen(start); 00270 const char* plus = strchr(start, '+'); 00271 if (plus != NULL && plus - start < end) 00272 end = plus - start; 00273 STRING lang_code(start); 00274 lang_code.truncate_at(end); 00275 STRING next(start + end); 00276 remains = next; 00277 // Check whether lang_code is already in the target vector and add. 00278 if (!IsStrInList(lang_code, *target)) { 00279 if (tessdata_manager_debug_level) 00280 tprintf("Adding language '%s' to list\n", lang_code.string()); 00281 target->push_back(lang_code); 00282 } 00283 } 00284 } 00285 00286 // Initialize for potentially a set of languages defined by the language 00287 // string and recursively any additional languages required by any language 00288 // traineddata file (via tessedit_load_sublangs in its config) that is loaded. 00289 // See init_tesseract_internal for args. 00290 int Tesseract::init_tesseract( 00291 const char *arg0, const char *textbase, const char *language, 00292 OcrEngineMode oem, char **configs, int configs_size, 00293 const GenericVector<STRING> *vars_vec, 00294 const GenericVector<STRING> *vars_values, 00295 bool set_only_non_debug_params) { 00296 GenericVector<STRING> langs_to_load; 00297 GenericVector<STRING> langs_not_to_load; 00298 ParseLanguageString(language, &langs_to_load, &langs_not_to_load); 00299 00300 sub_langs_.delete_data_pointers(); 00301 sub_langs_.clear(); 00302 // Find the first loadable lang and load into this. 00303 // Add any languages that this language requires 00304 bool loaded_primary = false; 00305 // Load the rest into sub_langs_. 00306 for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) { 00307 if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) { 00308 const char *lang_str = langs_to_load[lang_index].string(); 00309 Tesseract *tess_to_init; 00310 if (!loaded_primary) { 00311 tess_to_init = this; 00312 } else { 00313 tess_to_init = new Tesseract; 00314 } 00315 00316 int result = tess_to_init->init_tesseract_internal( 00317 arg0, textbase, lang_str, oem, configs, configs_size, 00318 vars_vec, vars_values, set_only_non_debug_params); 00319 00320 if (!loaded_primary) { 00321 if (result < 0) { 00322 tprintf("Failed loading language '%s'\n", lang_str); 00323 } else { 00324 if (tessdata_manager_debug_level) 00325 tprintf("Loaded language '%s' as main language\n", lang_str); 00326 ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), 00327 &langs_to_load, &langs_not_to_load); 00328 loaded_primary = true; 00329 } 00330 } else { 00331 if (result < 0) { 00332 tprintf("Failed loading language '%s'\n", lang_str); 00333 delete tess_to_init; 00334 } else { 00335 if (tessdata_manager_debug_level) 00336 tprintf("Loaded language '%s' as secondary language\n", lang_str); 00337 sub_langs_.push_back(tess_to_init); 00338 // Add any languages that this language requires 00339 ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), 00340 &langs_to_load, &langs_not_to_load); 00341 } 00342 } 00343 } 00344 } 00345 if (!loaded_primary) { 00346 tprintf("Tesseract couldn't load any languages!\n"); 00347 return -1; // Couldn't load any language! 00348 } 00349 if (!sub_langs_.empty()) { 00350 // In multilingual mode word ratings have to be directly comparable, 00351 // so use the same language model weights for all languages: 00352 // use the primary language's params model if 00353 // tessedit_use_primary_params_model is set, 00354 // otherwise use default language model weights. 00355 if (tessedit_use_primary_params_model) { 00356 for (int s = 0; s < sub_langs_.size(); ++s) { 00357 sub_langs_[s]->language_model_->getParamsModel().Copy( 00358 this->language_model_->getParamsModel()); 00359 } 00360 tprintf("Using params model of the primary language\n"); 00361 if (tessdata_manager_debug_level) { 00362 this->language_model_->getParamsModel().Print(); 00363 } 00364 } else { 00365 this->language_model_->getParamsModel().Clear(); 00366 for (int s = 0; s < sub_langs_.size(); ++s) { 00367 sub_langs_[s]->language_model_->getParamsModel().Clear(); 00368 } 00369 if (tessdata_manager_debug_level) 00370 tprintf("Using default language params\n"); 00371 } 00372 } 00373 00374 SetupUniversalFontIds(); 00375 return 0; 00376 } 00377 00378 // Common initialization for a single language. 00379 // arg0 is the datapath for the tessdata directory, which could be the 00380 // path of the tessdata directory with no trailing /, or (if tessdata 00381 // lives in the same directory as the executable, the path of the executable, 00382 // hence the name arg0. 00383 // textbase is an optional output file basename (used only for training) 00384 // language is the language code to load. 00385 // oem controls which engine(s) will operate on the image 00386 // configs (argv) is an array of config filenames to load variables from. 00387 // May be NULL. 00388 // configs_size (argc) is the number of elements in configs. 00389 // vars_vec is an optional vector of variables to set. 00390 // vars_values is an optional corresponding vector of values for the variables 00391 // in vars_vec. 00392 // If set_only_init_params is true, then only the initialization variables 00393 // will be set. 00394 int Tesseract::init_tesseract_internal( 00395 const char *arg0, const char *textbase, const char *language, 00396 OcrEngineMode oem, char **configs, int configs_size, 00397 const GenericVector<STRING> *vars_vec, 00398 const GenericVector<STRING> *vars_values, 00399 bool set_only_non_debug_params) { 00400 if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, 00401 configs_size, vars_vec, vars_values, 00402 set_only_non_debug_params)) { 00403 return -1; 00404 } 00405 if (tessedit_init_config_only) { 00406 tessdata_manager.End(); 00407 return 0; 00408 } 00409 // If only Cube will be used, skip loading Tesseract classifier's 00410 // pre-trained templates. 00411 bool init_tesseract_classifier = 00412 (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY || 00413 tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED); 00414 // If only Cube will be used and if it has its own Unicharset, 00415 // skip initializing permuter and loading Tesseract Dawgs. 00416 bool init_dict = 00417 !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY && 00418 tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET)); 00419 program_editup(textbase, init_tesseract_classifier, init_dict); 00420 tessdata_manager.End(); 00421 return 0; //Normal exit 00422 } 00423 00424 // Helper builds the all_fonts table by adding new fonts from new_fonts. 00425 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts, 00426 UnicityTable<FontInfo>* all_fonts) { 00427 for (int i = 0; i < new_fonts.size(); ++i) { 00428 // UnicityTable uniques as we go. 00429 all_fonts->push_back(new_fonts.get(i)); 00430 } 00431 } 00432 00433 // Helper assigns an id to lang_fonts using the index in all_fonts table. 00434 static void AssignIds(const UnicityTable<FontInfo>& all_fonts, 00435 UnicityTable<FontInfo>* lang_fonts) { 00436 for (int i = 0; i < lang_fonts->size(); ++i) { 00437 int index = all_fonts.get_id(lang_fonts->get(i)); 00438 lang_fonts->get_mutable(i)->universal_id = index; 00439 } 00440 } 00441 00442 // Set the universal_id member of each font to be unique among all 00443 // instances of the same font loaded. 00444 void Tesseract::SetupUniversalFontIds() { 00445 // Note that we can get away with bitwise copying FontInfo in 00446 // all_fonts, as it is a temporary structure and we avoid setting the 00447 // delete callback. 00448 UnicityTable<FontInfo> all_fonts; 00449 all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo)); 00450 00451 // Create the universal ID table. 00452 CollectFonts(get_fontinfo_table(), &all_fonts); 00453 for (int i = 0; i < sub_langs_.size(); ++i) { 00454 CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts); 00455 } 00456 // Assign ids from the table to each font table. 00457 AssignIds(all_fonts, &get_fontinfo_table()); 00458 for (int i = 0; i < sub_langs_.size(); ++i) { 00459 AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table()); 00460 } 00461 font_table_size_ = all_fonts.size(); 00462 } 00463 00464 // init the LM component 00465 int Tesseract::init_tesseract_lm(const char *arg0, 00466 const char *textbase, 00467 const char *language) { 00468 if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY, 00469 NULL, 0, NULL, NULL, false)) 00470 return -1; 00471 getDict().Load(Dict::GlobalDawgCache()); 00472 tessdata_manager.End(); 00473 return 0; 00474 } 00475 00476 void Tesseract::end_tesseract() { 00477 end_recog(); 00478 } 00479 00480 /* Define command type identifiers */ 00481 00482 enum CMD_EVENTS 00483 { 00484 ACTION_1_CMD_EVENT, 00485 RECOG_WERDS, 00486 RECOG_PSEUDO, 00487 ACTION_2_CMD_EVENT 00488 }; 00489 } // namespace tesseract