|
tesseract 3.04.01
|
00001 00002 // File: unicharset_training_utils.cpp 00003 // Description: Training utilities for UNICHARSET. 00004 // Author: Ray Smith 00005 // Created: Fri Oct 17 17:09:01 PDT 2014 00006 // 00007 // (C) Copyright 2014, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include "unicharset_training_utils.h" 00021 00022 #include <stdlib.h> 00023 #include <string.h> 00024 #include <string> 00025 00026 #include "fileio.h" 00027 #include "genericvector.h" 00028 #include "icuerrorcode.h" 00029 #include "normstrngs.h" 00030 #include "statistc.h" 00031 #include "strngs.h" 00032 #include "unicharset.h" 00033 #include "unicode/uchar.h" // from libicu 00034 #include "unicode/uscript.h" // from libicu 00035 00036 namespace tesseract { 00037 00038 // Helper sets the character attribute properties and sets up the script table. 00039 // Does not set tops and bottoms. 00040 void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) { 00041 for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) { 00042 // Convert any custom ligatures. 00043 const char* unichar_str = unicharset->id_to_unichar(unichar_id); 00044 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) { 00045 if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) { 00046 unichar_str = UNICHARSET::kCustomLigatures[i][0]; 00047 break; 00048 } 00049 } 00050 00051 // Convert the unichar to UTF32 representation 00052 GenericVector<char32> uni_vector; 00053 tesseract::UTF8ToUTF32(unichar_str, &uni_vector); 00054 00055 // Assume that if the property is true for any character in the string, 00056 // then it holds for the whole "character". 00057 bool unichar_isalpha = false; 00058 bool unichar_islower = false; 00059 bool unichar_isupper = false; 00060 bool unichar_isdigit = false; 00061 bool unichar_ispunct = false; 00062 00063 for (int i = 0; i < uni_vector.size(); ++i) { 00064 if (u_isalpha(uni_vector[i])) 00065 unichar_isalpha = true; 00066 if (u_islower(uni_vector[i])) 00067 unichar_islower = true; 00068 if (u_isupper(uni_vector[i])) 00069 unichar_isupper = true; 00070 if (u_isdigit(uni_vector[i])) 00071 unichar_isdigit = true; 00072 if (u_ispunct(uni_vector[i])) 00073 unichar_ispunct = true; 00074 } 00075 00076 unicharset->set_isalpha(unichar_id, unichar_isalpha); 00077 unicharset->set_islower(unichar_id, unichar_islower); 00078 unicharset->set_isupper(unichar_id, unichar_isupper); 00079 unicharset->set_isdigit(unichar_id, unichar_isdigit); 00080 unicharset->set_ispunctuation(unichar_id, unichar_ispunct); 00081 00082 tesseract::IcuErrorCode err; 00083 unicharset->set_script(unichar_id, uscript_getName( 00084 uscript_getScript(uni_vector[0], err))); 00085 00086 const int num_code_points = uni_vector.size(); 00087 // Obtain the lower/upper case if needed and record it in the properties. 00088 unicharset->set_other_case(unichar_id, unichar_id); 00089 if (unichar_islower || unichar_isupper) { 00090 GenericVector<char32> other_case(num_code_points, 0); 00091 for (int i = 0; i < num_code_points; ++i) { 00092 // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used. 00093 // However since they deal with UChars (so need a conversion function 00094 // from char32 or UTF8string) and require a meaningful locale string, 00095 // for now u_tolower()/u_toupper() are used. 00096 other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) : 00097 u_tolower(uni_vector[i]); 00098 } 00099 STRING other_case_uch; 00100 tesseract::UTF32ToUTF8(other_case, &other_case_uch); 00101 UNICHAR_ID other_case_id = 00102 unicharset->unichar_to_id(other_case_uch.c_str()); 00103 if (other_case_id != INVALID_UNICHAR_ID) { 00104 unicharset->set_other_case(unichar_id, other_case_id); 00105 } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) { 00106 tprintf("Other case %s of %s is not in unicharset\n", 00107 other_case_uch.c_str(), unichar_str); 00108 } 00109 } 00110 00111 // Set RTL property and obtain mirror unichar ID from ICU. 00112 GenericVector<char32> mirrors(num_code_points, 0); 00113 for (int i = 0; i < num_code_points; ++i) { 00114 mirrors[i] = u_charMirror(uni_vector[i]); 00115 if (i == 0) { // set directionality to that of the 1st code point 00116 unicharset->set_direction(unichar_id, 00117 static_cast<UNICHARSET::Direction>( 00118 u_charDirection(uni_vector[i]))); 00119 } 00120 } 00121 STRING mirror_uch; 00122 tesseract::UTF32ToUTF8(mirrors, &mirror_uch); 00123 UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str()); 00124 if (mirror_uch_id != INVALID_UNICHAR_ID) { 00125 unicharset->set_mirror(unichar_id, mirror_uch_id); 00126 } else if (report_errors) { 00127 tprintf("Mirror %s of %s is not in unicharset\n", 00128 mirror_uch.c_str(), unichar_str); 00129 } 00130 00131 // Record normalized version of this unichar. 00132 STRING normed_str = tesseract::NormalizeUTF8String(unichar_str); 00133 if (unichar_id != 0 && normed_str.length() > 0) { 00134 unicharset->set_normed(unichar_id, normed_str.c_str()); 00135 } else { 00136 unicharset->set_normed(unichar_id, unichar_str); 00137 } 00138 ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size()); 00139 } 00140 unicharset->post_load_setup(); 00141 } 00142 00143 // Helper to set the properties for an input unicharset file, writes to the 00144 // output file. If an appropriate script unicharset can be found in the 00145 // script_dir directory, then the tops and bottoms are expanded using the 00146 // script unicharset. 00147 // If non-empty, xheight data for the fonts are written to the xheights_file. 00148 void SetPropertiesForInputFile(const string& script_dir, 00149 const string& input_unicharset_file, 00150 const string& output_unicharset_file, 00151 const string& output_xheights_file) { 00152 UNICHARSET unicharset; 00153 00154 // Load the input unicharset 00155 unicharset.load_from_file(input_unicharset_file.c_str()); 00156 tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(), 00157 input_unicharset_file.c_str()); 00158 00159 // Set unichar properties 00160 tprintf("Setting unichar properties\n"); 00161 SetupBasicProperties(true, &unicharset); 00162 string xheights_str; 00163 for (int s = 0; s < unicharset.get_script_table_size(); ++s) { 00164 // Load the unicharset for the script if available. 00165 string filename = script_dir + "/" + 00166 unicharset.get_script_from_script_id(s) + ".unicharset"; 00167 UNICHARSET script_set; 00168 if (script_set.load_from_file(filename.c_str())) { 00169 unicharset.SetPropertiesFromOther(script_set); 00170 } 00171 // Load the xheights for the script if available. 00172 filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + 00173 ".xheights"; 00174 string script_heights; 00175 if (File::ReadFileToString(filename, &script_heights)) 00176 xheights_str += script_heights; 00177 } 00178 if (!output_xheights_file.empty()) 00179 File::WriteStringToFileOrDie(xheights_str, output_xheights_file); 00180 for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) { 00181 if (unicharset.PropertiesIncomplete(c)) { 00182 tprintf("Warning: properties incomplete for index %d = %s\n", 00183 c, unicharset.id_to_unichar(c)); 00184 } 00185 } 00186 00187 // Write the output unicharset 00188 tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str()); 00189 unicharset.save_to_file(output_unicharset_file.c_str()); 00190 } 00191 00192 } // namespace tesseract 00193