tesseract 3.04.01

training/unicharset_training_utils.cpp

Go to the documentation of this file.
00001 
00002 // File:        unicharset_training_utils.cpp
00003 // Description: Training utilities for UNICHARSET.
00004 // Author:      Ray Smith
00005 // Created:     Fri Oct 17 17:09:01 PDT 2014
00006 //
00007 // (C) Copyright 2014, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "unicharset_training_utils.h"
00021 
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <string>
00025 
00026 #include "fileio.h"
00027 #include "genericvector.h"
00028 #include "icuerrorcode.h"
00029 #include "normstrngs.h"
00030 #include "statistc.h"
00031 #include "strngs.h"
00032 #include "unicharset.h"
00033 #include "unicode/uchar.h"    // from libicu
00034 #include "unicode/uscript.h"  // from libicu
00035 
00036 namespace tesseract {
00037 
00038 // Helper sets the character attribute properties and sets up the script table.
00039 // Does not set tops and bottoms.
00040 void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) {
00041   for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
00042     // Convert any custom ligatures.
00043     const char* unichar_str = unicharset->id_to_unichar(unichar_id);
00044     for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
00045       if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
00046         unichar_str = UNICHARSET::kCustomLigatures[i][0];
00047         break;
00048       }
00049     }
00050 
00051     // Convert the unichar to UTF32 representation
00052     GenericVector<char32> uni_vector;
00053     tesseract::UTF8ToUTF32(unichar_str, &uni_vector);
00054 
00055     // Assume that if the property is true for any character in the string,
00056     // then it holds for the whole "character".
00057     bool unichar_isalpha = false;
00058     bool unichar_islower = false;
00059     bool unichar_isupper = false;
00060     bool unichar_isdigit = false;
00061     bool unichar_ispunct = false;
00062 
00063     for (int i = 0; i < uni_vector.size(); ++i) {
00064       if (u_isalpha(uni_vector[i]))
00065         unichar_isalpha = true;
00066       if (u_islower(uni_vector[i]))
00067         unichar_islower = true;
00068       if (u_isupper(uni_vector[i]))
00069         unichar_isupper = true;
00070       if (u_isdigit(uni_vector[i]))
00071         unichar_isdigit = true;
00072       if (u_ispunct(uni_vector[i]))
00073         unichar_ispunct = true;
00074     }
00075 
00076     unicharset->set_isalpha(unichar_id, unichar_isalpha);
00077     unicharset->set_islower(unichar_id, unichar_islower);
00078     unicharset->set_isupper(unichar_id, unichar_isupper);
00079     unicharset->set_isdigit(unichar_id, unichar_isdigit);
00080     unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
00081 
00082     tesseract::IcuErrorCode err;
00083     unicharset->set_script(unichar_id, uscript_getName(
00084         uscript_getScript(uni_vector[0], err)));
00085 
00086     const int num_code_points = uni_vector.size();
00087     // Obtain the lower/upper case if needed and record it in the properties.
00088     unicharset->set_other_case(unichar_id, unichar_id);
00089     if (unichar_islower || unichar_isupper) {
00090       GenericVector<char32> other_case(num_code_points, 0);
00091       for (int i = 0; i < num_code_points; ++i) {
00092         // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
00093         // However since they deal with UChars (so need a conversion function
00094         // from char32 or UTF8string) and require a meaningful locale string,
00095         // for now u_tolower()/u_toupper() are used.
00096         other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
00097           u_tolower(uni_vector[i]);
00098       }
00099       STRING other_case_uch;
00100       tesseract::UTF32ToUTF8(other_case, &other_case_uch);
00101       UNICHAR_ID other_case_id =
00102           unicharset->unichar_to_id(other_case_uch.c_str());
00103       if (other_case_id != INVALID_UNICHAR_ID) {
00104         unicharset->set_other_case(unichar_id, other_case_id);
00105       } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
00106         tprintf("Other case %s of %s is not in unicharset\n",
00107                 other_case_uch.c_str(), unichar_str);
00108       }
00109     }
00110 
00111     // Set RTL property and obtain mirror unichar ID from ICU.
00112     GenericVector<char32> mirrors(num_code_points, 0);
00113     for (int i = 0; i < num_code_points; ++i) {
00114       mirrors[i] = u_charMirror(uni_vector[i]);
00115       if (i == 0) {  // set directionality to that of the 1st code point
00116         unicharset->set_direction(unichar_id,
00117                                   static_cast<UNICHARSET::Direction>(
00118                                       u_charDirection(uni_vector[i])));
00119       }
00120     }
00121     STRING mirror_uch;
00122     tesseract::UTF32ToUTF8(mirrors, &mirror_uch);
00123     UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
00124     if (mirror_uch_id != INVALID_UNICHAR_ID) {
00125       unicharset->set_mirror(unichar_id, mirror_uch_id);
00126     } else if (report_errors) {
00127       tprintf("Mirror %s of %s is not in unicharset\n",
00128               mirror_uch.c_str(), unichar_str);
00129     }
00130 
00131     // Record normalized version of this unichar.
00132     STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
00133     if (unichar_id != 0 && normed_str.length() > 0) {
00134       unicharset->set_normed(unichar_id, normed_str.c_str());
00135     } else {
00136       unicharset->set_normed(unichar_id, unichar_str);
00137     }
00138     ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
00139   }
00140   unicharset->post_load_setup();
00141 }
00142 
00143 // Helper to set the properties for an input unicharset file, writes to the
00144 // output file. If an appropriate script unicharset can be found in the
00145 // script_dir directory, then the tops and bottoms are expanded using the
00146 // script unicharset.
00147 // If non-empty, xheight data for the fonts are written to the xheights_file.
00148 void SetPropertiesForInputFile(const string& script_dir,
00149                                const string& input_unicharset_file,
00150                                const string& output_unicharset_file,
00151                                const string& output_xheights_file) {
00152   UNICHARSET unicharset;
00153 
00154   // Load the input unicharset
00155   unicharset.load_from_file(input_unicharset_file.c_str());
00156   tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
00157           input_unicharset_file.c_str());
00158 
00159   // Set unichar properties
00160   tprintf("Setting unichar properties\n");
00161   SetupBasicProperties(true, &unicharset);
00162   string xheights_str;
00163   for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
00164     // Load the unicharset for the script if available.
00165     string filename = script_dir + "/" +
00166         unicharset.get_script_from_script_id(s) + ".unicharset";
00167     UNICHARSET script_set;
00168     if (script_set.load_from_file(filename.c_str())) {
00169       unicharset.SetPropertiesFromOther(script_set);
00170     }
00171     // Load the xheights for the script if available.
00172     filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
00173         ".xheights";
00174     string script_heights;
00175     if (File::ReadFileToString(filename, &script_heights))
00176       xheights_str += script_heights;
00177   }
00178   if (!output_xheights_file.empty())
00179     File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
00180   for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) {
00181     if (unicharset.PropertiesIncomplete(c)) {
00182       tprintf("Warning: properties incomplete for index %d = %s\n",
00183               c, unicharset.id_to_unichar(c));
00184     }
00185   }
00186 
00187   // Write the output unicharset
00188   tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
00189   unicharset.save_to_file(output_unicharset_file.c_str());
00190 }
00191 
00192 }  // namespace tesseract
00193 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines