|
tesseract 3.04.01
|
00001 00002 // File: unicharset_extractor.cpp 00003 // Description: Unicode character/ligature set extractor. 00004 // Author: Thomas Kielbus 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 // Given a list of box files on the command line, this program generates a file 00021 // containing a unicharset, a list of all the characters used by Tesseract 00022 // 00023 // The file contains the size of the set on the first line, and then one 00024 // unichar per line. 00025 00026 #include <stdio.h> 00027 #if defined(HAVE_WCHAR_T) || defined(_WIN32) || defined(GOOGLE3) 00028 #include <wchar.h> 00029 #include <wctype.h> 00030 #define USING_WCTYPE 00031 #endif 00032 #include <locale.h> 00033 00034 #include "boxread.h" 00035 #include "rect.h" 00036 #include "strngs.h" 00037 #include "tessopt.h" 00038 #include "unichar.h" 00039 #include "unicharset.h" 00040 00041 static const char* const kUnicharsetFileName = "unicharset"; 00042 00043 UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) { 00044 UNICHAR uch(wc); 00045 char *unichar = uch.utf8_str(); 00046 UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar); 00047 delete[] unichar; 00048 return unichar_id; 00049 } 00050 00051 // Set character properties using wctype if we have it. 00052 // Contributed by piggy@gmail.com. 00053 // Modified by Ray to use UNICHAR for unicode conversion 00054 // and to check for wctype using autoconf/presence of windows. 00055 void set_properties(UNICHARSET *unicharset, const char* const c_string) { 00056 #ifdef USING_WCTYPE 00057 UNICHAR_ID id; 00058 int wc; 00059 00060 // Convert the string to a unichar id. 00061 id = unicharset->unichar_to_id(c_string); 00062 00063 // Set the other_case property to be this unichar id by default. 00064 unicharset->set_other_case(id, id); 00065 00066 int step = UNICHAR::utf8_step(c_string); 00067 if (step == 0) 00068 return; // Invalid utf-8. 00069 00070 // Get the next Unicode code point in the string. 00071 UNICHAR ch(c_string, step); 00072 wc = ch.first_uni(); 00073 00074 /* Copy the properties. */ 00075 if (iswalpha(wc)) { 00076 unicharset->set_isalpha(id, 1); 00077 if (iswlower(wc)) { 00078 unicharset->set_islower(id, 1); 00079 unicharset->set_other_case(id, wc_to_unichar_id(*unicharset, 00080 towupper(wc))); 00081 } 00082 if (iswupper(wc)) { 00083 unicharset->set_isupper(id, 1); 00084 unicharset->set_other_case(id, wc_to_unichar_id(*unicharset, 00085 towlower(wc))); 00086 } 00087 } 00088 if (iswdigit(wc)) 00089 unicharset->set_isdigit(id, 1); 00090 if(iswpunct(wc)) 00091 unicharset->set_ispunctuation(id, 1); 00092 00093 #endif 00094 } 00095 00096 int main(int argc, char** argv) { 00097 int option; 00098 const char* output_directory = "."; 00099 STRING unicharset_file_name; 00100 // Special characters are now included by default. 00101 UNICHARSET unicharset; 00102 00103 setlocale(LC_ALL, ""); 00104 00105 // Print usage 00106 if (argc <= 1) { 00107 printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]); 00108 #ifdef USING_WCTYPE 00109 printf("Character properties using wctype is enabled\n"); 00110 #else 00111 printf("WARNING: Character properties using wctype is DISABLED\n"); 00112 #endif 00113 exit(1); 00114 00115 } 00116 00117 // Parse arguments 00118 while ((option = tessopt(argc, argv, "D" )) != EOF) { 00119 switch (option) { 00120 case 'D': 00121 output_directory = tessoptarg; 00122 ++tessoptind; 00123 break; 00124 } 00125 } 00126 00127 // Save file name 00128 unicharset_file_name = output_directory; 00129 unicharset_file_name += "/"; 00130 unicharset_file_name += kUnicharsetFileName; 00131 00132 // Load box files 00133 for (; tessoptind < argc; ++tessoptind) { 00134 printf("Extracting unicharset from %s\n", argv[tessoptind]); 00135 00136 FILE* box_file = fopen(argv[tessoptind], "rb"); 00137 if (box_file == NULL) { 00138 printf("Cannot open box file %s\n", argv[tessoptind]); 00139 return -1; 00140 } 00141 00142 TBOX box; 00143 STRING unichar_string; 00144 int line_number = 0; 00145 while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) { 00146 unicharset.unichar_insert(unichar_string.string()); 00147 set_properties(&unicharset, unichar_string.string()); 00148 } 00149 } 00150 00151 // Write unicharset file 00152 if (unicharset.save_to_file(unicharset_file_name.string())) { 00153 printf("Wrote unicharset file %s.\n", unicharset_file_name.string()); 00154 } 00155 else { 00156 printf("Cannot save unicharset file %s.\n", unicharset_file_name.string()); 00157 return -1; 00158 } 00159 return 0; 00160 }