|
tesseract 3.04.01
|
00001 00002 // File: tessdatamanager.cpp 00003 // Description: Functions to handle loading/combining tesseract data files. 00004 // Author: Daria Antonova 00005 // Created: Wed Jun 03 11:26:43 PST 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include "tessdatamanager.h" 00025 00026 #include <stdio.h> 00027 00028 #include "helpers.h" 00029 #include "serialis.h" 00030 #include "strngs.h" 00031 #include "tprintf.h" 00032 #include "params.h" 00033 00034 namespace tesseract { 00035 00036 bool TessdataManager::Init(const char *data_file_name, int debug_level) { 00037 int i; 00038 debug_level_ = debug_level; 00039 data_file_name_ = data_file_name; 00040 data_file_ = fopen(data_file_name, "rb"); 00041 if (data_file_ == NULL) { 00042 tprintf("Error opening data file %s\n", data_file_name); 00043 tprintf("Please make sure the TESSDATA_PREFIX environment variable is set " 00044 "to the parent directory of your \"tessdata\" directory.\n"); 00045 return false; 00046 } 00047 fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_); 00048 swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries); 00049 if (swap_) { 00050 ReverseN(&actual_tessdata_num_entries_, 00051 sizeof(actual_tessdata_num_entries_)); 00052 } 00053 if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) { 00054 // For forward compatibility, truncate to the number we can handle. 00055 actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES; 00056 } 00057 fread(offset_table_, sizeof(inT64), 00058 actual_tessdata_num_entries_, data_file_); 00059 if (swap_) { 00060 for (i = 0 ; i < actual_tessdata_num_entries_; ++i) { 00061 ReverseN(&offset_table_[i], sizeof(offset_table_[i])); 00062 } 00063 } 00064 if (debug_level_) { 00065 tprintf("TessdataManager loaded %d types of tesseract data files.\n", 00066 actual_tessdata_num_entries_); 00067 for (i = 0; i < actual_tessdata_num_entries_; ++i) { 00068 tprintf("Offset for type %d is %lld\n", i, offset_table_[i]); 00069 } 00070 } 00071 return true; 00072 } 00073 00074 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file, 00075 bool newline_end, inT64 num_bytes_to_copy) { 00076 if (num_bytes_to_copy == 0) return; 00077 int buffer_size = 1024; 00078 if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) { 00079 buffer_size = num_bytes_to_copy; 00080 } 00081 inT64 num_bytes_copied = 0; 00082 char *chunk = new char[buffer_size]; 00083 int bytes_read; 00084 char last_char = 0x0; 00085 while ((bytes_read = fread(chunk, sizeof(char), 00086 buffer_size, input_file))) { 00087 fwrite(chunk, sizeof(char), bytes_read, output_file); 00088 last_char = chunk[bytes_read-1]; 00089 if (num_bytes_to_copy > 0) { 00090 num_bytes_copied += bytes_read; 00091 if (num_bytes_copied == num_bytes_to_copy) break; 00092 if (num_bytes_copied + buffer_size > num_bytes_to_copy) { 00093 buffer_size = num_bytes_to_copy - num_bytes_copied; 00094 } 00095 } 00096 } 00097 if (newline_end) ASSERT_HOST(last_char == '\n'); 00098 delete[] chunk; 00099 } 00100 00101 bool TessdataManager::WriteMetadata(inT64 *offset_table, 00102 const char * language_data_path_prefix, 00103 FILE *output_file) { 00104 inT32 num_entries = TESSDATA_NUM_ENTRIES; 00105 bool result = true; 00106 if (fseek(output_file, 0, SEEK_SET) != 0 || 00107 fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 || 00108 fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, 00109 output_file) != TESSDATA_NUM_ENTRIES) { 00110 fclose(output_file); 00111 result = false; 00112 tprintf("WriteMetadata failed in TessdataManager!\n"); 00113 } else if (fclose(output_file)) { 00114 result = false; 00115 tprintf("WriteMetadata failed to close file!\n"); 00116 } else { 00117 tprintf("TessdataManager combined tesseract data files.\n"); 00118 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00119 tprintf("Offset for type %2d (%s%-22s) is %lld\n", i, 00120 language_data_path_prefix, kTessdataFileSuffixes[i], 00121 offset_table[i]); 00122 } 00123 } 00124 return result; 00125 } 00126 00127 bool TessdataManager::CombineDataFiles( 00128 const char *language_data_path_prefix, 00129 const char *output_filename) { 00130 int i; 00131 inT64 offset_table[TESSDATA_NUM_ENTRIES]; 00132 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1; 00133 FILE *output_file = fopen(output_filename, "wb"); 00134 if (output_file == NULL) { 00135 tprintf("Error opening %s for writing\n", output_filename); 00136 return false; 00137 } 00138 // Leave some space for recording the offset_table. 00139 if (fseek(output_file, 00140 sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) { 00141 tprintf("Error seeking %s\n", output_filename); 00142 return false; 00143 } 00144 00145 TessdataType type = TESSDATA_NUM_ENTRIES; 00146 bool text_file = false; 00147 FILE *file_ptr[TESSDATA_NUM_ENTRIES]; 00148 00149 // Load individual tessdata components from files. 00150 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00151 ASSERT_HOST(TessdataTypeFromFileSuffix( 00152 kTessdataFileSuffixes[i], &type, &text_file)); 00153 STRING filename = language_data_path_prefix; 00154 filename += kTessdataFileSuffixes[i]; 00155 file_ptr[i] = fopen(filename.string(), "rb"); 00156 if (file_ptr[i] != NULL) { 00157 offset_table[type] = ftell(output_file); 00158 CopyFile(file_ptr[i], output_file, text_file, -1); 00159 fclose(file_ptr[i]); 00160 } 00161 } 00162 00163 // Make sure that the required components are present. 00164 if (file_ptr[TESSDATA_UNICHARSET] == NULL) { 00165 tprintf("Error opening %sunicharset file\n", language_data_path_prefix); 00166 fclose(output_file); 00167 return false; 00168 } 00169 if (file_ptr[TESSDATA_INTTEMP] != NULL && 00170 (file_ptr[TESSDATA_PFFMTABLE] == NULL || 00171 file_ptr[TESSDATA_NORMPROTO] == NULL)) { 00172 tprintf("Error opening %spffmtable and/or %snormproto files" 00173 " while %sinttemp file was present\n", language_data_path_prefix, 00174 language_data_path_prefix, language_data_path_prefix); 00175 fclose(output_file); 00176 return false; 00177 } 00178 00179 return WriteMetadata(offset_table, language_data_path_prefix, output_file); 00180 } 00181 00182 bool TessdataManager::OverwriteComponents( 00183 const char *new_traineddata_filename, 00184 char **component_filenames, 00185 int num_new_components) { 00186 int i; 00187 inT64 offset_table[TESSDATA_NUM_ENTRIES]; 00188 TessdataType type = TESSDATA_NUM_ENTRIES; 00189 bool text_file = false; 00190 FILE *file_ptr[TESSDATA_NUM_ENTRIES]; 00191 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00192 offset_table[i] = -1; 00193 file_ptr[i] = NULL; 00194 } 00195 FILE *output_file = fopen(new_traineddata_filename, "wb"); 00196 if (output_file == NULL) { 00197 tprintf("Error opening %s for writing\n", new_traineddata_filename); 00198 return false; 00199 } 00200 00201 // Leave some space for recording the offset_table. 00202 if (fseek(output_file, 00203 sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) { 00204 fclose(output_file); 00205 tprintf("Error seeking %s\n", new_traineddata_filename); 00206 return false; 00207 } 00208 00209 // Open the files with the new components. 00210 for (i = 0; i < num_new_components; ++i) { 00211 if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file)) 00212 file_ptr[type] = fopen(component_filenames[i], "rb"); 00213 } 00214 00215 // Write updated data to the output traineddata file. 00216 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00217 if (file_ptr[i] != NULL) { 00218 // Get the data from the opened component file. 00219 offset_table[i] = ftell(output_file); 00220 CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1); 00221 fclose(file_ptr[i]); 00222 } else { 00223 // Get this data component from the loaded data file. 00224 if (SeekToStart(static_cast<TessdataType>(i))) { 00225 offset_table[i] = ftell(output_file); 00226 CopyFile(data_file_, output_file, kTessdataFileIsText[i], 00227 GetEndOffset(static_cast<TessdataType>(i)) - 00228 ftell(data_file_) + 1); 00229 } 00230 } 00231 } 00232 const char *language_data_path_prefix = strchr(new_traineddata_filename, '.'); 00233 return WriteMetadata(offset_table, language_data_path_prefix, output_file); 00234 } 00235 00236 bool TessdataManager::TessdataTypeFromFileSuffix( 00237 const char *suffix, TessdataType *type, bool *text_file) { 00238 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00239 if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { 00240 *type = static_cast<TessdataType>(i); 00241 *text_file = kTessdataFileIsText[i]; 00242 return true; 00243 } 00244 } 00245 tprintf("TessdataManager can't determine which tessdata" 00246 " component is represented by %s\n", suffix); 00247 return false; 00248 } 00249 00250 bool TessdataManager::TessdataTypeFromFileName( 00251 const char *filename, TessdataType *type, bool *text_file) { 00252 // Get the file suffix (extension) 00253 const char *suffix = strrchr(filename, '.'); 00254 if (suffix == NULL || *(++suffix) == '\0') return false; 00255 return TessdataTypeFromFileSuffix(suffix, type, text_file); 00256 } 00257 00258 bool TessdataManager::ExtractToFile(const char *filename) { 00259 TessdataType type = TESSDATA_NUM_ENTRIES; 00260 bool text_file = false; 00261 ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName( 00262 filename, &type, &text_file)); 00263 if (!SeekToStart(type)) return false; 00264 00265 FILE *output_file = fopen(filename, "wb"); 00266 if (output_file == NULL) { 00267 tprintf("Error opening %s\n", filename); 00268 exit(1); 00269 } 00270 inT64 begin_offset = ftell(GetDataFilePtr()); 00271 inT64 end_offset = GetEndOffset(type); 00272 tesseract::TessdataManager::CopyFile( 00273 GetDataFilePtr(), output_file, text_file, 00274 end_offset - begin_offset + 1); 00275 fclose(output_file); 00276 return true; 00277 } 00278 00279 } // namespace tesseract