tesseract  4.1.0
tessedit.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tessedit.cpp (Formerly tessedit.c)
3  * Description: (Previously) Main program for merge of tess and editor.
4  * Now just code to load the language model and various
5  * engine-specific data files.
6  * Author: Ray Smith
7  *
8  * (C) Copyright 1992, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 # include "config_auto.h"
24 #endif
25 
26 #include "basedir.h"
27 #include "control.h"
28 # include "matchdefs.h"
29 #include "pageres.h"
30 #include "params.h"
31 #include "stopper.h"
32 #include "tesseractclass.h"
33 #include "tessvars.h"
34 #include "tprintf.h"
35 #ifndef DISABLED_LEGACY_ENGINE
36 # include "chop.h"
37 # include "intmatcher.h"
38 # include "reject.h"
39 #endif
40 #ifndef ANDROID_BUILD
41 # include "lstmrecognizer.h"
42 #endif
43 
44 namespace tesseract {
45 
46 // Read a "config" file containing a set of variable, value pairs.
47 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
48 // and also accepts a relative or absolute path name.
49 void Tesseract::read_config_file(const char* filename,
50  SetParamConstraint constraint) {
51  STRING path = datadir;
52  path += "configs/";
53  path += filename;
54  FILE* fp;
55  if ((fp = fopen(path.string(), "rb")) != nullptr) {
56  fclose(fp);
57  } else {
58  path = datadir;
59  path += "tessconfigs/";
60  path += filename;
61  if ((fp = fopen(path.string(), "rb")) != nullptr) {
62  fclose(fp);
63  } else {
64  path = filename;
65  }
66  }
67  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
68 }
69 
70 // Returns false if a unicharset file for the specified language was not found
71 // or was invalid.
72 // This function initializes TessdataManager. After TessdataManager is
73 // no longer needed, TessdataManager::End() should be called.
74 //
75 // This function sets tessedit_ocr_engine_mode to the given OcrEngineMode oem, unless
76 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
77 // from the language-specific config file (stored in [lang].traineddata), from
78 // the config files specified on the command line or left as the default
79 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
81  const char* arg0, const char* textbase, const char* language,
82  OcrEngineMode oem, char** configs, int configs_size,
83  const GenericVector<STRING>* vars_vec,
84  const GenericVector<STRING>* vars_values, bool set_only_non_debug_params,
85  TessdataManager* mgr) {
86  // Set the basename, compute the data directory.
87  main_setup(arg0, textbase);
88 
89  // Set the language data path prefix
90  lang = language != nullptr ? language : "eng";
94 
95  // Initialize TessdataManager.
96  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
97  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
98  tprintf("Error opening data file %s\n", tessdata_path.string());
99  tprintf(
100  "Please make sure the TESSDATA_PREFIX environment variable is set"
101  " to your \"tessdata\" directory.\n");
102  return false;
103  }
104 #ifndef DISABLED_LEGACY_ENGINE
105  if (oem == OEM_DEFAULT) {
106  // Set the engine mode from availability, which can then be overridden by
107  // the config file when we read it below.
108  if (!mgr->IsLSTMAvailable()) {
110  } else if (!mgr->IsBaseAvailable()) {
112  } else {
114  }
115  }
116 #endif // ndef DISABLED_LEGACY_ENGINE
117 
118  // If a language specific config file (lang.config) exists, load it in.
119  TFile fp;
120  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
122  this->params());
123  }
124 
125  SetParamConstraint set_params_constraint =
126  set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
128  // Load tesseract variables from config files. This is done after loading
129  // language-specific variables from [lang].traineddata file, so that custom
130  // config files can override values in [lang].traineddata file.
131  for (int i = 0; i < configs_size; ++i) {
132  read_config_file(configs[i], set_params_constraint);
133  }
134 
135  // Set params specified in vars_vec (done after setting params from config
136  // files, so that params in vars_vec can override those from files).
137  if (vars_vec != nullptr && vars_values != nullptr) {
138  for (int i = 0; i < vars_vec->size(); ++i) {
139  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
140  (*vars_values)[i].string(),
141  set_params_constraint, this->params())) {
142  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
143  exit(1);
144  }
145  }
146  }
147 
148  if (!tessedit_write_params_to_file.empty()) {
149  FILE* params_file = fopen(tessedit_write_params_to_file.string(), "wb");
150  if (params_file != nullptr) {
151  ParamUtils::PrintParams(params_file, this->params());
152  fclose(params_file);
153  } else {
154  tprintf("Failed to open %s for writing params.\n",
156  }
157  }
158 
159  // Determine which ocr engine(s) should be loaded and used for recognition.
160  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
161 
162  // If we are only loading the config file (and so not planning on doing any
163  // recognition) then there's nothing else to do here.
165  return true;
166  }
167 
168 // The various OcrEngineMode settings (see publictypes.h) determine which
169 // engine-specific data files need to be loaded.
170 // If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
171 #ifndef ANDROID_BUILD
172 # ifdef DISABLED_LEGACY_ENGINE
174 # else
177 # endif // ndef DISABLED_LEGACY_ENGINE
179  lstm_recognizer_ = new LSTMRecognizer;
180  ASSERT_HOST(lstm_recognizer_->Load(
181  this->params(), lstm_use_matrix ? language : nullptr, mgr));
182  } else {
183  tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
185  }
186  }
187 #endif // ndef ANDROID_BUILD
188 
189  // Load the unicharset
191  // Avoid requiring a unicharset when we aren't running base tesseract.
192 #ifndef ANDROID_BUILD
193  unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
194 #endif // ndef ANDROID_BUILD
195  }
196 #ifndef DISABLED_LEGACY_ENGINE
197  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
198  !unicharset.load_from_file(&fp, false)) {
199  return false;
200  }
201 #endif // ndef DISABLED_LEGACY_ENGINE
202  if (unicharset.size() > MAX_NUM_CLASSES) {
203  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
204  return false;
205  }
206  right_to_left_ = unicharset.major_right_to_left();
207 
208  // Setup initial unichar ambigs table and read universal ambigs.
209  UNICHARSET encoder_unicharset;
210  encoder_unicharset.CopyFrom(unicharset);
212  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
213 
215  unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
218  }
219 #ifndef DISABLED_LEGACY_ENGINE
220  // Init ParamsModel.
221  // Load pass1 and pass2 weights (for now these two sets are the same, but in
222  // the future separate sets of weights can be generated).
224  ++p) {
225  language_model_->getParamsModel().SetPass(
226  static_cast<ParamsModel::PassEnum>(p));
227  if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
228  if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) {
229  return false;
230  }
231  }
232  }
233 #endif // ndef DISABLED_LEGACY_ENGINE
234 
235  return true;
236 }
237 
238 // Helper returns true if the given string is in the vector of strings.
239 static bool IsStrInList(const STRING& str,
240  const GenericVector<STRING>& str_list) {
241  for (int i = 0; i < str_list.size(); ++i) {
242  if (str_list[i] == str) return true;
243  }
244  return false;
245 }
246 
247 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
248 // Langs with no prefix get appended to to_load, provided they
249 // are not in there already.
250 // Langs with ~ prefix get appended to not_to_load, provided they are not in
251 // there already.
252 void Tesseract::ParseLanguageString(const char* lang_str,
253  GenericVector<STRING>* to_load,
254  GenericVector<STRING>* not_to_load) {
255  STRING remains(lang_str);
256  while (remains.length() > 0) {
257  // Find the start of the lang code and which vector to add to.
258  const char* start = remains.string();
259  while (*start == '+') ++start;
260  GenericVector<STRING>* target = to_load;
261  if (*start == '~') {
262  target = not_to_load;
263  ++start;
264  }
265  // Find the index of the end of the lang code in string start.
266  int end = strlen(start);
267  const char* plus = strchr(start, '+');
268  if (plus != nullptr && plus - start < end) end = plus - start;
269  STRING lang_code(start);
270  lang_code.truncate_at(end);
271  STRING next(start + end);
272  remains = next;
273  // Check whether lang_code is already in the target vector and add.
274  if (!IsStrInList(lang_code, *target)) {
275  target->push_back(lang_code);
276  }
277  }
278 }
279 
280 // Initialize for potentially a set of languages defined by the language
281 // string and recursively any additional languages required by any language
282 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
283 // See init_tesseract_internal for args.
// Returns 0 once a primary language has been loaded into this instance, or -1
// if none of the requested languages could be loaded.
// NOTE(review): this listing skips original source lines 349 and 363 inside
// the DISABLED_LEGACY_ENGINE section near the end, so the block is not
// complete as shown; confirm against the upstream file before editing it.
284 int Tesseract::init_tesseract(const char* arg0, const char* textbase,
285  const char* language, OcrEngineMode oem,
286  char** configs, int configs_size,
287  const GenericVector<STRING>* vars_vec,
288  const GenericVector<STRING>* vars_values,
289  bool set_only_non_debug_params,
290  TessdataManager* mgr) {
291  GenericVector<STRING> langs_to_load;
292  GenericVector<STRING> langs_not_to_load;
 // Split the request into codes to load and '~'-prefixed codes to exclude.
293  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
294 
 // Discard the current contents of sub_langs_ before loading.
295  sub_langs_.delete_data_pointers();
296  sub_langs_.clear();
297  // Find the first loadable lang and load into this.
298  // Add any languages that this language requires
299  bool loaded_primary = false;
300  // Load the rest into sub_langs_.
 // langs_to_load can grow during this loop: every language that loads
 // successfully appends the codes from its tessedit_load_sublangs.
301  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
302  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
303  const char* lang_str = langs_to_load[lang_index].string();
 // Until a primary has loaded, initialize this object itself; after that
 // each further language gets its own new Tesseract instance.
304  Tesseract* tess_to_init;
305  if (!loaded_primary) {
306  tess_to_init = this;
307  } else {
308  tess_to_init = new Tesseract;
309  }
310 
311  int result = tess_to_init->init_tesseract_internal(
312  arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
313  vars_values, set_only_non_debug_params, mgr);
314  // Forget that language, but keep any reader we were given.
315  mgr->Clear();
316 
317  if (!loaded_primary) {
 // A failed primary candidate is only reported; the next language in the
 // list is then tried as the primary.
318  if (result < 0) {
319  tprintf("Failed loading language '%s'\n", lang_str);
320  } else {
321  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
322  &langs_to_load, &langs_not_to_load);
323  loaded_primary = true;
324  }
325  } else {
326  if (result < 0) {
327  tprintf("Failed loading language '%s'\n", lang_str);
 // The instance was never stored in sub_langs_, so it is freed here.
328  delete tess_to_init;
329  } else {
330  sub_langs_.push_back(tess_to_init);
331  // Add any languages that this language requires
332  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
333  &langs_to_load, &langs_not_to_load);
334  }
335  }
336  }
337  }
338  if (!loaded_primary) {
339  tprintf("Tesseract couldn't load any languages!\n");
340  return -1; // Couldn't load any language!
341  }
342 #ifndef DISABLED_LEGACY_ENGINE
343  if (!sub_langs_.empty()) {
344  // In multilingual mode word ratings have to be directly comparable,
345  // so use the same language model weights for all languages:
346  // use the primary language's params model if
347  // tessedit_use_primary_params_model is set,
348  // otherwise use default language model weights.
 // NOTE(review): original line 349 is absent here; presumably it opens the
 // conditional that the "} else {" below belongs to - confirm upstream.
350  for (int s = 0; s < sub_langs_.size(); ++s) {
351  sub_langs_[s]->language_model_->getParamsModel().Copy(
352  this->language_model_->getParamsModel());
353  }
354  tprintf("Using params model of the primary language\n");
355  } else {
356  this->language_model_->getParamsModel().Clear();
357  for (int s = 0; s < sub_langs_.size(); ++s) {
358  sub_langs_[s]->language_model_->getParamsModel().Clear();
359  }
360  }
361  }
362 
364 #endif // ndef DISABLED_LEGACY_ENGINE
365  return 0;
366 }
367 
368 // Common initialization for a single language.
369 // arg0 is the datapath for the tessdata directory, which could be the
370 // path of the tessdata directory with no trailing /, or (if tessdata
371 // lives in the same directory as the executable) the path of the executable,
372 // hence the name arg0.
373 // textbase is an optional output file basename (used only for training)
374 // language is the language code to load.
375 // oem controls which engine(s) will operate on the image
376 // configs (argv) is an array of config filenames to load variables from.
377 // May be nullptr.
378 // configs_size (argc) is the number of elements in configs.
379 // vars_vec is an optional vector of variables to set.
380 // vars_values is an optional corresponding vector of values for the variables
381 // in vars_vec.
382 // If set_only_non_debug_params is true, then only the non-debug variables
383 // will be set.
// Returns -1 if init_tesseract_lang_data reports failure and 0 on every other
// visible path.
// NOTE(review): this listing skips original source lines 396 and 401, so the
// condition guarding the early "return 0" and the definition of the local
// init_tesseract used below are not visible; confirm against upstream.
384 int Tesseract::init_tesseract_internal(const char* arg0, const char* textbase,
385  const char* language, OcrEngineMode oem,
386  char** configs, int configs_size,
387  const GenericVector<STRING>* vars_vec,
388  const GenericVector<STRING>* vars_values,
389  bool set_only_non_debug_params,
390  TessdataManager* mgr) {
 // Load the language data first; all arguments are forwarded unchanged.
391  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
392  configs_size, vars_vec, vars_values,
393  set_only_non_debug_params, mgr)) {
394  return -1;
395  }
397  return 0;
398  }
399  // If only LSTM will be used, skip loading Tesseract classifier's
400  // pre-trained templates and dictionary.
 // program_editup receives mgr for both of its manager arguments when
 // init_tesseract is true, and nullptr for both otherwise.
402  program_editup(textbase, init_tesseract ? mgr : nullptr,
403  init_tesseract ? mgr : nullptr);
404  return 0; // Normal exit
405 }
406 
407 #ifndef DISABLED_LEGACY_ENGINE
408 
409 // Helper builds the all_fonts table by adding new fonts from new_fonts.
410 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
411  UnicityTable<FontInfo>* all_fonts) {
412  for (int i = 0; i < new_fonts.size(); ++i) {
413  // UnicityTable uniques as we go.
414  all_fonts->push_back(new_fonts.get(i));
415  }
416 }
417 
418 // Helper assigns an id to lang_fonts using the index in all_fonts table.
419 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
420  UnicityTable<FontInfo>* lang_fonts) {
421  for (int i = 0; i < lang_fonts->size(); ++i) {
422  int index = all_fonts.get_id(lang_fonts->get(i));
423  lang_fonts->get_mutable(i)->universal_id = index;
424  }
425 }
426 
427 // Set the universal_id member of each font to be unique among all
428 // instances of the same font loaded.
430  // Note that we can get away with bitwise copying FontInfo in
431  // all_fonts, as it is a temporary structure and we avoid setting the
432  // delete callback.
433  UnicityTable<FontInfo> all_fonts;
435 
436  // Create the universal ID table.
437  CollectFonts(get_fontinfo_table(), &all_fonts);
438  for (int i = 0; i < sub_langs_.size(); ++i) {
439  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
440  }
441  // Assign ids from the table to each font table.
442  AssignIds(all_fonts, &get_fontinfo_table());
443  for (int i = 0; i < sub_langs_.size(); ++i) {
444  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
445  }
446  font_table_size_ = all_fonts.size();
447 }
448 
449 // init the LM component
// Initializes the language data with OEM_TESSERACT_ONLY, no config files and
// no variable overrides, then loads the dictionary for lang through mgr.
// Returns -1 if init_tesseract_lang_data fails, 0 otherwise.
// NOTE(review): this listing skips original source line 455 (between the
// early return and getDict().Load); confirm against the upstream file.
450 int Tesseract::init_tesseract_lm(const char* arg0, const char* textbase,
451  const char* language, TessdataManager* mgr) {
452  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
453  nullptr, 0, nullptr, nullptr, false, mgr))
454  return -1;
456  getDict().Load(lang, mgr);
 // The result of FinishLoad() is not checked here.
457  getDict().FinishLoad();
458  return 0;
459 }
460 
461 #endif // ndef DISABLED_LEGACY_ENGINE
462 
464 
465 /* Define command type identifiers */
466 
472 };
473 } // namespace tesseract
ParamsVectors * params()
Definition: ccutil.h:65
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:44
void SetupUniversalFontIds()
Definition: tessedit.cpp:429
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:70
char * tessedit_write_params_to_file
SetParamConstraint
Definition: params.h:35
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:448
bool IsComponentAvailable(TessdataType type) const
Definition: strngs.h:45
int ambigs_debug_level
Definition: ccutil.h:83
const UNICHARSET & GetUnicharset() const
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:180
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:72
#define MAX_NUM_CLASSES
Definition: matchdefs.h:30
bool Init(const char *data_file_name)
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:63
UNICHARSET unicharset
Definition: ccutil.h:71
bool FinishLoad()
Definition: dict.cpp:360
int32_t length() const
Definition: strngs.cpp:189
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
Definition: tessedit.cpp:450
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:80
const T & get(int id) const
Return the object from an id.
int size() const
Definition: unicharset.h:341
bool GetComponent(TessdataType type, TFile *fp)
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:63
void truncate_at(int32_t index)
Definition: strngs.cpp:265
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:201
bool major_right_to_left() const
Definition: unicharset.cpp:992
int push_back(T object)
Add an element in the table.
bool Load(const ParamsVectors *params, const char *lang, TessdataManager *mgr)
void set_compare_callback(TessResultCallback2< bool, T const &, T const & > *cb)
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:384
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:219
int size() const
Return the size used.
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:119
const char * string() const
Definition: strngs.cpp:194
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
int push_back(T object)
Dict & getDict() override
int32_t universal_id
Definition: fontinfo.h:123
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:252
STRING lang
Definition: ccutil.h:69
STRING datadir
Definition: ccutil.h:67
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:92
void program_editup(const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:40
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388
#define ASSERT_HOST(x)
Definition: errcode.h:88
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:49
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:284
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:193
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
T * get_mutable(int id)
int size() const
Definition: genericvector.h:70
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:42
bool use_ambigs_for_adaption
Definition: ccutil.h:87
int get_id(T object) const
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:49
STRING language_data_path_prefix
Definition: ccutil.h:70