tesseract 3.04.01

ccmain/tessedit.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        tessedit.cpp  (Formerly tessedit.c)
00003  * Description: (Previously) Main program for merge of tess and editor.
00004  *              Now just code to load the language model and various
00005  *              engine-specific data files.
00006  * Author:      Ray Smith
00007  * Created:     Tue Jan 07 15:21:46 GMT 1992
00008  *
00009  * (C) Copyright 1992, Hewlett-Packard Ltd.
00010  ** Licensed under the Apache License, Version 2.0 (the "License");
00011  ** you may not use this file except in compliance with the License.
00012  ** You may obtain a copy of the License at
00013  ** http://www.apache.org/licenses/LICENSE-2.0
00014  ** Unless required by applicable law or agreed to in writing, software
00015  ** distributed under the License is distributed on an "AS IS" BASIS,
00016  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017  ** See the License for the specific language governing permissions and
00018  ** limitations under the License.
00019  *
00020  **********************************************************************/
00021 
00022 // Include automatically generated configuration file if running autoconf.
00023 #ifdef HAVE_CONFIG_H
00024 #include "config_auto.h"
00025 #endif
00026 
00027 #include          "stderr.h"
00028 #include          "basedir.h"
00029 #include          "tessvars.h"
00030 #include          "control.h"
00031 #include          "reject.h"
00032 #include          "pageres.h"
00033 #include          "nwmain.h"
00034 #include          "pgedit.h"
00035 #include          "tprintf.h"
00036 #include          "tessedit.h"
00037 #include "stopper.h"
00038 #include "intmatcher.h"
00039 #include "chop.h"
00040 #include "efio.h"
00041 #include "danerror.h"
00042 #include "globals.h"
00043 #include "tesseractclass.h"
00044 #include "params.h"
00045 
// Name of the directory that holds variables (config) files.
#define VARDIR        "configs/" /*variables files */
// Config file used under the API.
#define API_CONFIG      "configs/api_config"

// Global progress monitor; defined here and initially unset (NULL).
ETEXT_DESC *global_monitor = NULL;  // progress monitor
00051 
00052 namespace tesseract {
00053 
00054 // Read a "config" file containing a set of variable, value pairs.
00055 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
00056 // and also accepts a relative or absolute path name.
00057 void Tesseract::read_config_file(const char *filename,
00058                                  SetParamConstraint constraint) {
00059   STRING path = datadir;
00060   path += "configs/";
00061   path += filename;
00062   FILE* fp;
00063   if ((fp = fopen(path.string(), "rb")) != NULL) {
00064     fclose(fp);
00065   } else {
00066     path = datadir;
00067     path += "tessconfigs/";
00068     path += filename;
00069     if ((fp = fopen(path.string(), "rb")) != NULL) {
00070       fclose(fp);
00071     } else {
00072       path = filename;
00073     }
00074   }
00075   ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
00076 }
00077 
00078 // Returns false if a unicharset file for the specified language was not found
00079 // or was invalid.
00080 // This function initializes TessdataManager. After TessdataManager is
00081 // no longer needed, TessdataManager::End() should be called.
00082 //
00083 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
00084 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
00085 // from the language-specific config file (stored in [lang].traineddata), from
00086 // the config files specified on the command line or left as the default
00087 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
00088 bool Tesseract::init_tesseract_lang_data(
00089     const char *arg0, const char *textbase, const char *language,
00090     OcrEngineMode oem, char **configs, int configs_size,
00091     const GenericVector<STRING> *vars_vec,
00092     const GenericVector<STRING> *vars_values,
00093     bool set_only_non_debug_params) {
00094   // Set the basename, compute the data directory.
00095   main_setup(arg0, textbase);
00096 
00097   // Set the language data path prefix
00098   lang = language != NULL ? language : "eng";
00099   language_data_path_prefix = datadir;
00100   language_data_path_prefix += lang;
00101   language_data_path_prefix += ".";
00102 
00103   // Initialize TessdataManager.
00104   STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
00105   if (!tessdata_manager.Init(tessdata_path.string(),
00106                              tessdata_manager_debug_level)) {
00107     return false;
00108   }
00109 
00110   // If a language specific config file (lang.config) exists, load it in.
00111   if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
00112     ParamUtils::ReadParamsFromFp(
00113         tessdata_manager.GetDataFilePtr(),
00114         tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
00115         SET_PARAM_CONSTRAINT_NONE, this->params());
00116     if (tessdata_manager_debug_level) {
00117       tprintf("Loaded language config file\n");
00118     }
00119   }
00120 
00121   SetParamConstraint set_params_constraint = set_only_non_debug_params ?
00122       SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
00123   // Load tesseract variables from config files. This is done after loading
00124   // language-specific variables from [lang].traineddata file, so that custom
00125   // config files can override values in [lang].traineddata file.
00126   for (int i = 0; i < configs_size; ++i) {
00127     read_config_file(configs[i], set_params_constraint);
00128   }
00129 
00130   // Set params specified in vars_vec (done after setting params from config
00131   // files, so that params in vars_vec can override those from files).
00132   if (vars_vec != NULL && vars_values != NULL) {
00133     for (int i = 0; i < vars_vec->size(); ++i) {
00134       if (!ParamUtils::SetParam((*vars_vec)[i].string(),
00135                                 (*vars_values)[i].string(),
00136                                 set_params_constraint, this->params())) {
00137         tprintf("Error setting param %s\n", (*vars_vec)[i].string());
00138         exit(1);
00139       }
00140     }
00141   }
00142 
00143   if (((STRING &)tessedit_write_params_to_file).length() > 0) {
00144     FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
00145     if (params_file != NULL) {
00146       ParamUtils::PrintParams(params_file, this->params());
00147       fclose(params_file);
00148       if (tessdata_manager_debug_level > 0) {
00149         tprintf("Wrote parameters to %s\n",
00150                 tessedit_write_params_to_file.string());
00151       }
00152     } else {
00153       tprintf("Failed to open %s for writing params.\n",
00154               tessedit_write_params_to_file.string());
00155     }
00156   }
00157 
00158   // Determine which ocr engine(s) should be loaded and used for recognition.
00159   if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
00160   if (tessdata_manager_debug_level) {
00161     tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
00162             static_cast<int>(tessedit_ocr_engine_mode));
00163   }
00164 
00165   // If we are only loading the config file (and so not planning on doing any
00166   // recognition) then there's nothing else do here.
00167   if (tessedit_init_config_only) {
00168     if (tessdata_manager_debug_level) {
00169       tprintf("Returning after loading config file\n");
00170     }
00171     return true;
00172   }
00173 
00174   // Load the unicharset
00175   if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
00176       !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
00177     return false;
00178   }
00179   if (unicharset.size() > MAX_NUM_CLASSES) {
00180     tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
00181     return false;
00182   }
00183   if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
00184   right_to_left_ = unicharset.major_right_to_left();
00185 
00186   // Setup initial unichar ambigs table and read universal ambigs.
00187   UNICHARSET encoder_unicharset;
00188   encoder_unicharset.CopyFrom(unicharset);
00189   unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
00190   unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
00191 
00192   if (!tessedit_ambigs_training &&
00193       tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
00194     TFile ambigs_file;
00195     ambigs_file.Open(tessdata_manager.GetDataFilePtr(),
00196                      tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1);
00197     unichar_ambigs.LoadUnicharAmbigs(
00198         encoder_unicharset,
00199         &ambigs_file,
00200         ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
00201     if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
00202   }
00203 
00204   // The various OcrEngineMode settings (see publictypes.h) determine which
00205   // engine-specific data files need to be loaded. Currently everything needs
00206   // the base tesseract data, which supplies other useful information, but
00207   // alternative engines, such as cube and LSTM are optional.
00208 #ifndef NO_CUBE_BUILD
00209   if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
00210     ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
00211     if (tessdata_manager_debug_level)
00212       tprintf("Loaded Cube w/out combiner\n");
00213   } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
00214     ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
00215     if (tessdata_manager_debug_level)
00216       tprintf("Loaded Cube with combiner\n");
00217   }
00218 #endif
00219   // Init ParamsModel.
00220   // Load pass1 and pass2 weights (for now these two sets are the same, but in
00221   // the future separate sets of weights can be generated).
00222   for (int p = ParamsModel::PTRAIN_PASS1;
00223       p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
00224     language_model_->getParamsModel().SetPass(
00225         static_cast<ParamsModel::PassEnum>(p));
00226     if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) {
00227       if (!language_model_->getParamsModel().LoadFromFp(
00228           lang.string(), tessdata_manager.GetDataFilePtr(),
00229           tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) {
00230         return false;
00231       }
00232     }
00233   }
00234   if (tessdata_manager_debug_level) language_model_->getParamsModel().Print();
00235 
00236   return true;
00237 }
00238 
00239 // Helper returns true if the given string is in the vector of strings.
00240 static bool IsStrInList(const STRING& str,
00241                         const GenericVector<STRING>& str_list) {
00242   for (int i = 0; i < str_list.size(); ++i) {
00243     if (str_list[i] == str)
00244       return true;
00245   }
00246   return false;
00247 }
00248 
00249 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
00250 // Langs with no prefix get appended to to_load, provided they
00251 // are not in there already.
00252 // Langs with ~ prefix get appended to not_to_load, provided they are not in
00253 // there already.
00254 void Tesseract::ParseLanguageString(const char* lang_str,
00255                                     GenericVector<STRING>* to_load,
00256                                     GenericVector<STRING>* not_to_load) {
00257   STRING remains(lang_str);
00258   while (remains.length() > 0) {
00259     // Find the start of the lang code and which vector to add to.
00260     const char* start = remains.string();
00261     while (*start == '+')
00262       ++start;
00263     GenericVector<STRING>* target = to_load;
00264     if (*start == '~') {
00265       target = not_to_load;
00266       ++start;
00267     }
00268     // Find the index of the end of the lang code in string start.
00269     int end = strlen(start);
00270     const char* plus = strchr(start, '+');
00271     if (plus != NULL && plus - start < end)
00272       end = plus - start;
00273     STRING lang_code(start);
00274     lang_code.truncate_at(end);
00275     STRING next(start + end);
00276     remains = next;
00277     // Check whether lang_code is already in the target vector and add.
00278     if (!IsStrInList(lang_code, *target)) {
00279       if (tessdata_manager_debug_level)
00280         tprintf("Adding language '%s' to list\n", lang_code.string());
00281       target->push_back(lang_code);
00282     }
00283   }
00284 }
00285 
00286 // Initialize for potentially a set of languages defined by the language
00287 // string and recursively any additional languages required by any language
00288 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
00289 // See init_tesseract_internal for args.
00290 int Tesseract::init_tesseract(
00291     const char *arg0, const char *textbase, const char *language,
00292     OcrEngineMode oem, char **configs, int configs_size,
00293     const GenericVector<STRING> *vars_vec,
00294     const GenericVector<STRING> *vars_values,
00295     bool set_only_non_debug_params) {
00296   GenericVector<STRING> langs_to_load;
00297   GenericVector<STRING> langs_not_to_load;
00298   ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
00299 
00300   sub_langs_.delete_data_pointers();
00301   sub_langs_.clear();
00302   // Find the first loadable lang and load into this.
00303   // Add any languages that this language requires
00304   bool loaded_primary = false;
00305   // Load the rest into sub_langs_.
00306   for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
00307     if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
00308       const char *lang_str = langs_to_load[lang_index].string();
00309       Tesseract *tess_to_init;
00310       if (!loaded_primary) {
00311         tess_to_init = this;
00312       } else {
00313         tess_to_init = new Tesseract;
00314       }
00315 
00316       int result = tess_to_init->init_tesseract_internal(
00317           arg0, textbase, lang_str, oem, configs, configs_size,
00318           vars_vec, vars_values, set_only_non_debug_params);
00319 
00320       if (!loaded_primary) {
00321         if (result < 0) {
00322           tprintf("Failed loading language '%s'\n", lang_str);
00323         } else {
00324           if (tessdata_manager_debug_level)
00325             tprintf("Loaded language '%s' as main language\n", lang_str);
00326           ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
00327                               &langs_to_load, &langs_not_to_load);
00328           loaded_primary = true;
00329         }
00330       } else {
00331         if (result < 0) {
00332           tprintf("Failed loading language '%s'\n", lang_str);
00333           delete tess_to_init;
00334         } else {
00335           if (tessdata_manager_debug_level)
00336             tprintf("Loaded language '%s' as secondary language\n", lang_str);
00337           sub_langs_.push_back(tess_to_init);
00338           // Add any languages that this language requires
00339           ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
00340                               &langs_to_load, &langs_not_to_load);
00341         }
00342       }
00343     }
00344   }
00345   if (!loaded_primary) {
00346     tprintf("Tesseract couldn't load any languages!\n");
00347     return -1;  // Couldn't load any language!
00348   }
00349   if (!sub_langs_.empty()) {
00350     // In multilingual mode word ratings have to be directly comparable,
00351     // so use the same language model weights for all languages:
00352     // use the primary language's params model if
00353     // tessedit_use_primary_params_model is set,
00354     // otherwise use default language model weights.
00355     if (tessedit_use_primary_params_model) {
00356       for (int s = 0; s < sub_langs_.size(); ++s) {
00357         sub_langs_[s]->language_model_->getParamsModel().Copy(
00358             this->language_model_->getParamsModel());
00359       }
00360       tprintf("Using params model of the primary language\n");
00361       if (tessdata_manager_debug_level)  {
00362         this->language_model_->getParamsModel().Print();
00363       }
00364     } else {
00365       this->language_model_->getParamsModel().Clear();
00366       for (int s = 0; s < sub_langs_.size(); ++s) {
00367         sub_langs_[s]->language_model_->getParamsModel().Clear();
00368       }
00369       if (tessdata_manager_debug_level)
00370         tprintf("Using default language params\n");
00371     }
00372   }
00373 
00374   SetupUniversalFontIds();
00375   return 0;
00376 }
00377 
00378 // Common initialization for a single language.
00379 // arg0 is the datapath for the tessdata directory, which could be the
00380 // path of the tessdata directory with no trailing /, or (if tessdata
00381 // lives in the same directory as the executable, the path of the executable,
00382 // hence the name arg0.
00383 // textbase is an optional output file basename (used only for training)
00384 // language is the language code to load.
00385 // oem controls which engine(s) will operate on the image
00386 // configs (argv) is an array of config filenames to load variables from.
00387 // May be NULL.
00388 // configs_size (argc) is the number of elements in configs.
00389 // vars_vec is an optional vector of variables to set.
00390 // vars_values is an optional corresponding vector of values for the variables
00391 // in vars_vec.
00392 // If set_only_init_params is true, then only the initialization variables
00393 // will be set.
00394 int Tesseract::init_tesseract_internal(
00395     const char *arg0, const char *textbase, const char *language,
00396     OcrEngineMode oem, char **configs, int configs_size,
00397     const GenericVector<STRING> *vars_vec,
00398     const GenericVector<STRING> *vars_values,
00399     bool set_only_non_debug_params) {
00400   if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
00401                                 configs_size, vars_vec, vars_values,
00402                                 set_only_non_debug_params)) {
00403     return -1;
00404   }
00405   if (tessedit_init_config_only) {
00406     tessdata_manager.End();
00407     return 0;
00408   }
00409   // If only Cube will be used, skip loading Tesseract classifier's
00410   // pre-trained templates.
00411   bool init_tesseract_classifier =
00412     (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
00413      tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED);
00414   // If only Cube will be used and if it has its own Unicharset,
00415   // skip initializing permuter and loading Tesseract Dawgs.
00416   bool init_dict =
00417     !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY &&
00418       tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET));
00419   program_editup(textbase, init_tesseract_classifier, init_dict);
00420   tessdata_manager.End();
00421   return 0;                      //Normal exit
00422 }
00423 
00424 // Helper builds the all_fonts table by adding new fonts from new_fonts.
00425 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
00426                          UnicityTable<FontInfo>* all_fonts) {
00427   for (int i = 0; i < new_fonts.size(); ++i) {
00428     // UnicityTable uniques as we go.
00429     all_fonts->push_back(new_fonts.get(i));
00430   }
00431 }
00432 
00433 // Helper assigns an id to lang_fonts using the index in all_fonts table.
00434 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
00435                       UnicityTable<FontInfo>* lang_fonts) {
00436   for (int i = 0; i < lang_fonts->size(); ++i) {
00437     int index = all_fonts.get_id(lang_fonts->get(i));
00438     lang_fonts->get_mutable(i)->universal_id = index;
00439   }
00440 }
00441 
00442 // Set the universal_id member of each font to be unique among all
00443 // instances of the same font loaded.
00444 void Tesseract::SetupUniversalFontIds() {
00445   // Note that we can get away with bitwise copying FontInfo in
00446   // all_fonts, as it is a temporary structure and we avoid setting the
00447   // delete callback.
00448   UnicityTable<FontInfo> all_fonts;
00449   all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
00450 
00451   // Create the universal ID table.
00452   CollectFonts(get_fontinfo_table(), &all_fonts);
00453   for (int i = 0; i < sub_langs_.size(); ++i) {
00454     CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
00455   }
00456   // Assign ids from the table to each font table.
00457   AssignIds(all_fonts, &get_fontinfo_table());
00458   for (int i = 0; i < sub_langs_.size(); ++i) {
00459     AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
00460   }
00461   font_table_size_ = all_fonts.size();
00462 }
00463 
00464 // init the LM component
00465 int Tesseract::init_tesseract_lm(const char *arg0,
00466                    const char *textbase,
00467                    const char *language) {
00468   if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
00469                                 NULL, 0, NULL, NULL, false))
00470     return -1;
00471   getDict().Load(Dict::GlobalDawgCache());
00472   tessdata_manager.End();
00473   return 0;
00474 }
00475 
00476 void Tesseract::end_tesseract() {
00477   end_recog();
00478 }
00479 
// Command type identifiers. Values are consecutive, starting at 0.
enum CMD_EVENTS {
  ACTION_1_CMD_EVENT = 0,
  RECOG_WERDS = 1,
  RECOG_PSEUDO = 2,
  ACTION_2_CMD_EVENT = 3
};
00489 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines