tesseract 3.04.01

ccutil/tessdatamanager.cpp

Go to the documentation of this file.
00001 
00002 // File:        tessdatamanager.cpp
00003 // Description: Functions to handle loading/combining tesseract data files.
00004 // Author:      Daria Antonova
00005 // Created:     Wed Jun 03 11:26:43 PST 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include "tessdatamanager.h"
00025 
00026 #include <stdio.h>
00027 
00028 #include "helpers.h"
00029 #include "serialis.h"
00030 #include "strngs.h"
00031 #include "tprintf.h"
00032 #include "params.h"
00033 
00034 namespace tesseract {
00035 
00036 bool TessdataManager::Init(const char *data_file_name, int debug_level) {
00037   int i;
00038   debug_level_ = debug_level;
00039   data_file_name_ = data_file_name;
00040   data_file_ = fopen(data_file_name, "rb");
00041   if (data_file_ == NULL) {
00042     tprintf("Error opening data file %s\n", data_file_name);
00043     tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
00044             "to the parent directory of your \"tessdata\" directory.\n");
00045     return false;
00046   }
00047   fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
00048   swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
00049   if (swap_) {
00050     ReverseN(&actual_tessdata_num_entries_,
00051              sizeof(actual_tessdata_num_entries_));
00052   }
00053   if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
00054     // For forward compatibility, truncate to the number we can handle.
00055     actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
00056   }
00057   fread(offset_table_, sizeof(inT64),
00058         actual_tessdata_num_entries_, data_file_);
00059   if (swap_) {
00060     for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
00061       ReverseN(&offset_table_[i], sizeof(offset_table_[i]));
00062     }
00063   }
00064   if (debug_level_) {
00065     tprintf("TessdataManager loaded %d types of tesseract data files.\n",
00066             actual_tessdata_num_entries_);
00067     for (i = 0; i < actual_tessdata_num_entries_; ++i) {
00068       tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
00069     }
00070   }
00071   return true;
00072 }
00073 
00074 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
00075                                bool newline_end, inT64 num_bytes_to_copy) {
00076   if (num_bytes_to_copy == 0) return;
00077   int buffer_size = 1024;
00078   if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
00079     buffer_size = num_bytes_to_copy;
00080   }
00081   inT64 num_bytes_copied = 0;
00082   char *chunk = new char[buffer_size];
00083   int bytes_read;
00084   char last_char = 0x0;
00085   while ((bytes_read = fread(chunk, sizeof(char),
00086                              buffer_size, input_file))) {
00087     fwrite(chunk, sizeof(char), bytes_read, output_file);
00088     last_char = chunk[bytes_read-1];
00089     if (num_bytes_to_copy > 0) {
00090       num_bytes_copied += bytes_read;
00091       if (num_bytes_copied == num_bytes_to_copy) break;
00092       if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
00093         buffer_size = num_bytes_to_copy - num_bytes_copied;
00094       }
00095     }
00096   }
00097   if (newline_end) ASSERT_HOST(last_char == '\n');
00098   delete[] chunk;
00099 }
00100 
00101 bool TessdataManager::WriteMetadata(inT64 *offset_table,
00102                                     const char * language_data_path_prefix,
00103                                     FILE *output_file) {
00104   inT32 num_entries = TESSDATA_NUM_ENTRIES;
00105   bool result = true;
00106   if (fseek(output_file, 0, SEEK_SET) != 0 ||
00107       fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 ||
00108       fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES,
00109              output_file) != TESSDATA_NUM_ENTRIES) {
00110     fclose(output_file);
00111     result = false;
00112     tprintf("WriteMetadata failed in TessdataManager!\n");
00113   } else if (fclose(output_file)) {
00114     result = false;
00115     tprintf("WriteMetadata failed to close file!\n");
00116   } else {
00117     tprintf("TessdataManager combined tesseract data files.\n");
00118     for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00119       tprintf("Offset for type %2d (%s%-22s) is %lld\n", i,
00120               language_data_path_prefix, kTessdataFileSuffixes[i],
00121               offset_table[i]);
00122     }
00123   }
00124   return result;
00125 }
00126 
00127 bool TessdataManager::CombineDataFiles(
00128     const char *language_data_path_prefix,
00129     const char *output_filename) {
00130   int i;
00131   inT64 offset_table[TESSDATA_NUM_ENTRIES];
00132   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
00133   FILE *output_file = fopen(output_filename, "wb");
00134   if (output_file == NULL) {
00135     tprintf("Error opening %s for writing\n", output_filename);
00136     return false;
00137   }
00138   // Leave some space for recording the offset_table.
00139   if (fseek(output_file,
00140             sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
00141     tprintf("Error seeking %s\n", output_filename);
00142     return false;
00143   }
00144 
00145   TessdataType type = TESSDATA_NUM_ENTRIES;
00146   bool text_file = false;
00147   FILE *file_ptr[TESSDATA_NUM_ENTRIES];
00148 
00149   // Load individual tessdata components from files.
00150   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00151     ASSERT_HOST(TessdataTypeFromFileSuffix(
00152         kTessdataFileSuffixes[i], &type, &text_file));
00153     STRING filename = language_data_path_prefix;
00154     filename += kTessdataFileSuffixes[i];
00155     file_ptr[i] =  fopen(filename.string(), "rb");
00156     if (file_ptr[i] != NULL) {
00157       offset_table[type] = ftell(output_file);
00158       CopyFile(file_ptr[i], output_file, text_file, -1);
00159       fclose(file_ptr[i]);
00160     }
00161   }
00162 
00163   // Make sure that the required components are present.
00164   if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
00165     tprintf("Error opening %sunicharset file\n", language_data_path_prefix);
00166     fclose(output_file);
00167     return false;
00168   }
00169   if (file_ptr[TESSDATA_INTTEMP] != NULL &&
00170       (file_ptr[TESSDATA_PFFMTABLE] == NULL ||
00171        file_ptr[TESSDATA_NORMPROTO] == NULL)) {
00172     tprintf("Error opening %spffmtable and/or %snormproto files"
00173             " while %sinttemp file was present\n", language_data_path_prefix,
00174             language_data_path_prefix, language_data_path_prefix);
00175     fclose(output_file);
00176     return false;
00177   }
00178 
00179   return WriteMetadata(offset_table, language_data_path_prefix, output_file);
00180 }
00181 
00182 bool TessdataManager::OverwriteComponents(
00183     const char *new_traineddata_filename,
00184     char **component_filenames,
00185     int num_new_components) {
00186   int i;
00187   inT64 offset_table[TESSDATA_NUM_ENTRIES];
00188   TessdataType type = TESSDATA_NUM_ENTRIES;
00189   bool text_file = false;
00190   FILE *file_ptr[TESSDATA_NUM_ENTRIES];
00191   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00192     offset_table[i] = -1;
00193     file_ptr[i] = NULL;
00194   }
00195   FILE *output_file = fopen(new_traineddata_filename, "wb");
00196   if (output_file == NULL) {
00197     tprintf("Error opening %s for writing\n", new_traineddata_filename);
00198     return false;
00199   }
00200 
00201   // Leave some space for recording the offset_table.
00202   if (fseek(output_file,
00203             sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
00204     fclose(output_file);
00205     tprintf("Error seeking %s\n", new_traineddata_filename);
00206     return false;
00207   }
00208 
00209   // Open the files with the new components.
00210   for (i = 0; i < num_new_components; ++i) {
00211     if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file))
00212       file_ptr[type] = fopen(component_filenames[i], "rb");
00213   }
00214 
00215   // Write updated data to the output traineddata file.
00216   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00217     if (file_ptr[i] != NULL) {
00218       // Get the data from the opened component file.
00219       offset_table[i] = ftell(output_file);
00220       CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
00221       fclose(file_ptr[i]);
00222     } else {
00223       // Get this data component from the loaded data file.
00224       if (SeekToStart(static_cast<TessdataType>(i))) {
00225         offset_table[i] = ftell(output_file);
00226         CopyFile(data_file_, output_file, kTessdataFileIsText[i],
00227                  GetEndOffset(static_cast<TessdataType>(i)) -
00228                  ftell(data_file_) + 1);
00229       }
00230     }
00231   }
00232   const char *language_data_path_prefix = strchr(new_traineddata_filename, '.');
00233   return WriteMetadata(offset_table, language_data_path_prefix, output_file);
00234 }
00235 
00236 bool TessdataManager::TessdataTypeFromFileSuffix(
00237     const char *suffix, TessdataType *type, bool *text_file) {
00238   for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00239     if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
00240       *type = static_cast<TessdataType>(i);
00241       *text_file = kTessdataFileIsText[i];
00242       return true;
00243     }
00244   }
00245   tprintf("TessdataManager can't determine which tessdata"
00246          " component is represented by %s\n", suffix);
00247   return false;
00248 }
00249 
00250 bool TessdataManager::TessdataTypeFromFileName(
00251     const char *filename, TessdataType *type, bool *text_file) {
00252   // Get the file suffix (extension)
00253   const char *suffix = strrchr(filename, '.');
00254   if (suffix == NULL || *(++suffix) == '\0') return false;
00255   return TessdataTypeFromFileSuffix(suffix, type, text_file);
00256 }
00257 
00258 bool TessdataManager::ExtractToFile(const char *filename) {
00259   TessdataType type = TESSDATA_NUM_ENTRIES;
00260   bool text_file = false;
00261   ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(
00262       filename, &type, &text_file));
00263   if (!SeekToStart(type)) return false;
00264 
00265   FILE *output_file = fopen(filename, "wb");
00266   if (output_file == NULL) {
00267     tprintf("Error opening %s\n", filename);
00268     exit(1);
00269   }
00270   inT64 begin_offset = ftell(GetDataFilePtr());
00271   inT64 end_offset = GetEndOffset(type);
00272   tesseract::TessdataManager::CopyFile(
00273       GetDataFilePtr(), output_file, text_file,
00274       end_offset - begin_offset + 1);
00275   fclose(output_file);
00276   return true;
00277 }
00278 
00279 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines