tesseract 3.04.01

ccutil/unicharset.cpp

Go to the documentation of this file.
00001 
00002 // File:        unicharset.cpp
00003 // Description: Unicode character/ligature set class.
00004 // Author:      Thomas Kielbus
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "unicharset.h"
00021 
00022 #include <assert.h>
00023 #include <stdio.h>
00024 #include <string.h>
00025 
00026 #include "params.h"
00027 #include "serialis.h"
00028 #include "tesscallback.h"
00029 #include "tprintf.h"
00030 #include "unichar.h"
00031 
// Special character that appears in the textual form of character fragments.
static const char kSeparator = '|';
// Special character that marks the textual form of a 'natural' fragment.
static const char kNaturalFlag = 'n';

// Bit flags for the character-class properties, one bit per class.
static const int ISALPHA_MASK = 1 << 0;
static const int ISLOWER_MASK = 1 << 1;
static const int ISUPPER_MASK = 1 << 2;
static const int ISDIGIT_MASK = 1 << 3;
static const int ISPUNCTUATION_MASK = 1 << 4;

// Threshold on the Y coordinate that separates cap-height from x-height.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static const int kMeanlineThreshold = 220;
// Definition of a unicharset that "has x-height":
// C = number of alpha chars whose tops all exceed kMeanlineThreshold,
// X = number of alpha chars whose tops all lie below kMeanlineThreshold.
// The unicharset "has x-height" if X > C * kMinXHeightFraction and
// C > X * kMinCapHeightFraction, or if more than half of the alpha
// characters have upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
00055 
00056 /*static */
00057 const char* UNICHARSET::kCustomLigatures[][2] = {
00058   {"ct", "\uE003"},  // c + t -> U+E003
00059   {"ſh", "\uE006"},  // long-s + h -> U+E006
00060   {"ſi", "\uE007"},  // long-s + i -> U+E007
00061   {"ſl", "\uE008"},  // long-s + l -> U+E008
00062   {"ſſ", "\uE009"},  // long-s + long-s -> U+E009
00063   {NULL, NULL}
00064 };
00065 
00066 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
00067 const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
00068     " ",
00069     "Joined",
00070     "|Broken|0|1"
00071 };
00072 
00073 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
00074   Init();
00075 }
00076 
00077 // Initialize all properties to sensible default values.
00078 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
00079   isalpha = false;
00080   islower = false;
00081   isupper = false;
00082   isdigit = false;
00083   ispunctuation = false;
00084   isngram = false;
00085   enabled = false;
00086   SetRangesOpen();
00087   script_id = 0;
00088   other_case = 0;
00089   mirror = 0;
00090   normed = "";
00091   direction = UNICHARSET::U_LEFT_TO_RIGHT;
00092   fragment = NULL;
00093 }
00094 
00095 // Sets all ranges wide open. Initialization default in case there are
00096 // no useful values available.
00097 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
00098   min_bottom = 0;
00099   max_bottom = MAX_UINT8;
00100   min_top = 0;
00101   max_top = MAX_UINT8;
00102   width = 0.0f;
00103   width_sd = 0.0f;
00104   bearing = 0.0f;
00105   bearing_sd = 0.0f;
00106   advance = 0.0f;
00107   advance_sd = 0.0f;
00108 }
00109 
00110 // Sets all ranges to empty. Used before expanding with font-based data.
00111 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
00112   min_bottom = MAX_UINT8;
00113   max_bottom = 0;
00114   min_top = MAX_UINT8;
00115   max_top = 0;
00116   width = 0.0f;
00117   width_sd = 0.0f;
00118   bearing = 0.0f;
00119   bearing_sd = 0.0f;
00120   advance = 0.0f;
00121   advance_sd = 0.0f;
00122 }
00123 
00124 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
00125 // is emtpy.
00126 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
00127   return width == 0.0f || advance == 0.0f;
00128 }
00129 
00130 // Expands the ranges with the ranges from the src properties.
00131 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
00132     const UNICHAR_PROPERTIES& src) {
00133   UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
00134   UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
00135   UpdateRange(src.min_top, &min_top, &max_top);
00136   UpdateRange(src.max_top, &min_top, &max_top);
00137   if (src.width_sd > width_sd) {
00138     width = src.width;
00139     width_sd = src.width_sd;
00140   }
00141   if (src.bearing_sd > bearing_sd) {
00142     bearing = src.bearing;
00143     bearing_sd = src.bearing_sd;
00144   }
00145   if (src.advance_sd > advance_sd) {
00146     advance = src.advance;
00147     advance_sd = src.advance_sd;
00148   }
00149 }
00150 
00151 // Copies the properties from src into this.
00152 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
00153   // Apart from the fragment, everything else can be done with a default copy.
00154   CHAR_FRAGMENT* saved_fragment = fragment;
00155   *this = src;  // Bitwise copy.
00156   fragment = saved_fragment;
00157 }
00158 
00159 UNICHARSET::UNICHARSET() :
00160     unichars(NULL),
00161     ids(),
00162     size_used(0),
00163     size_reserved(0),
00164     script_table(NULL),
00165     script_table_size_used(0),
00166     null_script("NULL") {
00167   clear();
00168   for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
00169     unichar_insert(kSpecialUnicharCodes[i]);
00170     if (i == UNICHAR_JOINED)
00171       set_isngram(i, true);
00172   }
00173 }
00174 
00175 UNICHARSET::~UNICHARSET() {
00176   clear();
00177 }
00178 
00179 void UNICHARSET::reserve(int unichars_number) {
00180   if (unichars_number > size_reserved) {
00181     UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
00182     for (int i = 0; i < size_used; ++i)
00183       unichars_new[i] = unichars[i];
00184     for (int j = size_used; j < unichars_number; ++j) {
00185       unichars_new[j].properties.script_id = add_script(null_script);
00186     }
00187     delete[] unichars;
00188     unichars = unichars_new;
00189     size_reserved = unichars_number;
00190   }
00191 }
00192 
00193 UNICHAR_ID
00194 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
00195   return ids.contains(unichar_repr) ?
00196     ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
00197 }
00198 
00199 UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
00200                                      int length) const {
00201   assert(length > 0 && length <= UNICHAR_LEN);
00202   return ids.contains(unichar_repr, length) ?
00203     ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
00204 }
00205 
00206 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
00207 // while leaving the rest of the string encodable. Returns 0 if the
00208 // beginning of the string is not encodable.
00209 // WARNING: this function now encodes the whole string for precision.
00210 // Use encode_string in preference to repeatedly calling step.
00211 int UNICHARSET::step(const char* str) const {
00212   GenericVector<UNICHAR_ID> encoding;
00213   GenericVector<char> lengths;
00214   encode_string(str, true, &encoding, &lengths, NULL);
00215   if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
00216   return lengths[0];
00217 }
00218 
00219 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
00220 // If not encodable, write the first byte offset which cannot be converted
00221 // into the second (return) argument.
00222 bool UNICHARSET::encodable_string(const char *str,
00223                                   int *first_bad_position) const {
00224   GenericVector<UNICHAR_ID> encoding;
00225   return encode_string(str, true, &encoding, NULL, first_bad_position);
00226 }
00227 
00228 // Encodes the given UTF-8 string with this UNICHARSET.
00229 // Returns true if the encoding succeeds completely, false if there is at
00230 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case
00231 // the rest of the string is still encoded.
00232 // If lengths is not NULL, then it is filled with the corresponding
00233 // byte length of each encoded UNICHAR_ID.
00234 bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
00235                                GenericVector<UNICHAR_ID>* encoding,
00236                                GenericVector<char>* lengths,
00237                                int* encoded_length) const {
00238   GenericVector<UNICHAR_ID> working_encoding;
00239   GenericVector<char> working_lengths;
00240   GenericVector<char> best_lengths;
00241   encoding->truncate(0);  // Just in case str is empty.
00242   int str_length = strlen(str);
00243   int str_pos = 0;
00244   bool perfect = true;
00245   while (str_pos < str_length) {
00246     encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
00247                   &str_pos, encoding, &best_lengths);
00248     if (str_pos < str_length) {
00249       // This is a non-match. Skip one utf-8 character.
00250       perfect = false;
00251       if (give_up_on_failure) break;
00252       int step = UNICHAR::utf8_step(str + str_pos);
00253       if (step == 0) step = 1;
00254       encoding->push_back(INVALID_UNICHAR_ID);
00255       best_lengths.push_back(step);
00256       str_pos += step;
00257       working_encoding = *encoding;
00258       working_lengths = best_lengths;
00259     }
00260   }
00261   if (lengths != NULL) *lengths = best_lengths;
00262   if (encoded_length != NULL) *encoded_length = str_pos;
00263   return perfect;
00264 }
00265 
00266 const char* UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
00267   if (id == INVALID_UNICHAR_ID) {
00268     return INVALID_UNICHAR;
00269   }
00270   ASSERT_HOST(id < this->size());
00271   return unichars[id].representation;
00272 }
00273 
00274 const char* UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
00275   if (id == INVALID_UNICHAR_ID) {
00276     return INVALID_UNICHAR;
00277   }
00278   ASSERT_HOST(id < this->size());
00279   // Resolve from the kCustomLigatures table if this is a private encoding.
00280   if (get_isprivate(id)) {
00281     const char* ch = id_to_unichar(id);
00282     for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
00283       if (!strcmp(ch, kCustomLigatures[i][1])) {
00284         return kCustomLigatures[i][0];
00285       }
00286     }
00287   }
00288   // Otherwise return the stored representation.
00289   return unichars[id].representation;
00290 }
00291 
00292 // Return a STRING that reformats the utf8 str into the str followed
00293 // by its hex unicodes.
00294 STRING UNICHARSET::debug_utf8_str(const char* str) {
00295   STRING result = str;
00296   result += " [";
00297   int step = 1;
00298   // Chop into unicodes and code each as hex.
00299   for (int i = 0; str[i] != '\0'; i += step) {
00300     char hex[sizeof(int) * 2 + 1];
00301     step = UNICHAR::utf8_step(str + i);
00302     if (step == 0) {
00303       step = 1;
00304       sprintf(hex, "%x", str[i]);
00305     } else {
00306       UNICHAR ch(str + i, step);
00307       sprintf(hex, "%x", ch.first_uni());
00308     }
00309     result += hex;
00310     result += " ";
00311   }
00312   result += "]";
00313   return result;
00314 }
00315 
00316 // Return a STRING containing debug information on the unichar, including
00317 // the id_to_unichar, its hex unicodes and the properties.
00318 STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
00319   if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
00320   const CHAR_FRAGMENT *fragment = this->get_fragment(id);
00321   if (fragment) {
00322     return fragment->to_string();
00323   }
00324   const char* str = id_to_unichar(id);
00325   STRING result = debug_utf8_str(str);
00326   // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
00327   if (get_isalpha(id)) {
00328     if (get_islower(id))
00329       result += "a";
00330     else if (get_isupper(id))
00331       result += "A";
00332     else
00333       result += "x";
00334   }
00335   // Append 0 if a digit.
00336   if (get_isdigit(id)) {
00337     result += "0";
00338   }
00339   // Append p is a punctuation symbol.
00340   if (get_ispunctuation(id)) {
00341     result += "p";
00342   }
00343   return result;
00344 }
00345 
00346 // Sets the normed_ids vector from the normed string. normed_ids is not
00347 // stored in the file, and needs to be set when the UNICHARSET is loaded.
00348 void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
00349   unichars[unichar_id].properties.normed_ids.truncate(0);
00350   if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
00351     unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
00352   } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
00353                             true, &unichars[unichar_id].properties.normed_ids,
00354                             NULL, NULL)) {
00355     unichars[unichar_id].properties.normed_ids.truncate(0);
00356     unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
00357   }
00358 }
00359 
00360 // Returns whether the unichar id represents a unicode value in the private use
00361 // area. We use this range only internally to represent uncommon ligatures
00362 // (eg. 'ct') that do not have regular unicode values.
00363 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
00364   UNICHAR uc(id_to_unichar(unichar_id), -1);
00365   int uni = uc.first_uni();
00366   return (uni >= 0xE000 && uni <= 0xF8FF);
00367 }
00368 
00369 
00370 // Sets all ranges to empty, so they can be expanded to set the values.
00371 void UNICHARSET::set_ranges_empty() {
00372   for (int id = 0; id < size_used; ++id) {
00373     unichars[id].properties.SetRangesEmpty();
00374   }
00375 }
00376 
00377 // Sets all the properties for this unicharset given a src unicharset with
00378 // everything set. The unicharsets don't have to be the same, and graphemes
00379 // are correctly accounted for.
00380 void UNICHARSET::PartialSetPropertiesFromOther(int start_index,
00381                                                const UNICHARSET& src) {
00382   for (int ch = start_index; ch < size_used; ++ch) {
00383     const char* utf8 = id_to_unichar(ch);
00384     UNICHAR_PROPERTIES properties;
00385     if (src.GetStrProperties(utf8, &properties)) {
00386       // Setup the script_id, other_case, and mirror properly.
00387       const char* script = src.get_script_from_script_id(properties.script_id);
00388       properties.script_id = add_script(script);
00389       const char* other_case = src.id_to_unichar(properties.other_case);
00390       if (contains_unichar(other_case)) {
00391         properties.other_case = unichar_to_id(other_case);
00392       } else {
00393         properties.other_case = ch;
00394       }
00395       const char* mirror_str = src.id_to_unichar(properties.mirror);
00396       if (contains_unichar(mirror_str)) {
00397         properties.mirror = unichar_to_id(mirror_str);
00398       } else {
00399         properties.mirror = ch;
00400       }
00401       unichars[ch].properties.CopyFrom(properties);
00402       set_normed_ids(ch);
00403     }
00404   }
00405 }
00406 
00407 // Expands the tops and bottoms and widths for this unicharset given a
00408 // src unicharset with ranges in it. The unicharsets don't have to be the
00409 // same, and graphemes are correctly accounted for.
00410 void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) {
00411   for (int ch = 0; ch < size_used; ++ch) {
00412     const char* utf8 = id_to_unichar(ch);
00413     UNICHAR_PROPERTIES properties;
00414     if (src.GetStrProperties(utf8, &properties)) {
00415       // Expand just the ranges from properties.
00416       unichars[ch].properties.ExpandRangesFrom(properties);
00417     }
00418   }
00419 }
00420 
00421 // Makes this a copy of src. Clears this completely first, so the automatic
00422 // ids will not be present in this if not in src. Does NOT reorder the set!
00423 void UNICHARSET::CopyFrom(const UNICHARSET& src) {
00424   clear();
00425   for (int ch = 0; ch < src.size_used; ++ch) {
00426     const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
00427     const char* utf8 = src.id_to_unichar(ch);
00428     unichar_insert(utf8);
00429     unichars[ch].properties.ExpandRangesFrom(src_props);
00430   }
00431   // Set properties, including mirror and other_case, WITHOUT reordering
00432   // the unicharset.
00433   PartialSetPropertiesFromOther(0, src);
00434 }
00435 
00436 // For each id in src, if it does not occur in this, add it, as in
00437 // SetPropertiesFromOther, otherwise expand the ranges, as in
00438 // ExpandRangesFromOther.
00439 void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
00440   int initial_used = size_used;
00441   for (int ch = 0; ch < src.size_used; ++ch) {
00442     const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
00443     const char* utf8 = src.id_to_unichar(ch);
00444     if (ch >= SPECIAL_UNICHAR_CODES_COUNT && src_props.AnyRangeEmpty()) {
00445       // Only use fully valid entries.
00446       tprintf("Bad properties for index %d, char %s: "
00447               "%d,%d %d,%d %g,%g %g,%g %g,%g\n",
00448               ch, utf8, src_props.min_bottom, src_props.max_bottom,
00449               src_props.min_top, src_props.max_top,
00450               src_props.width, src_props.width_sd,
00451               src_props.bearing, src_props.bearing_sd,
00452               src_props.advance, src_props.advance_sd);
00453       continue;
00454     }
00455     int id = size_used;
00456     if (contains_unichar(utf8)) {
00457       id = unichar_to_id(utf8);
00458       // Just expand current ranges.
00459       unichars[id].properties.ExpandRangesFrom(src_props);
00460     } else {
00461       unichar_insert(utf8);
00462       unichars[id].properties.SetRangesEmpty();
00463     }
00464   }
00465   // Set properties, including mirror and other_case, WITHOUT reordering
00466   // the unicharset.
00467   PartialSetPropertiesFromOther(initial_used, src);
00468 }
00469 
00470 // Returns true if the acceptable ranges of the tops of the characters do
00471 // not overlap, making their x-height calculations distinct.
00472 bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const {
00473   int overlap = MIN(unichars[id1].properties.max_top,
00474                     unichars[id2].properties.max_top) -
00475                 MAX(unichars[id1].properties.min_top,
00476                     unichars[id2].properties.min_top);
00477   return overlap <= 0;
00478 }
00479 
00480 // Internal recursive version of encode_string above.
00481 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that
00482 // each UNICHAR_ID uses the least possible part of the utf8 str.
00483 // It does this by depth-first tail recursion on increasing length matches
00484 // to the UNICHARSET, saving the first encountered result that encodes the
00485 // maximum total length of str. It stops on a failure to encode to make
00486 // the overall process of encoding a partially failed string more efficient.
00487 // See unicharset.h for definition of the args.
00488 void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
00489                                GenericVector<UNICHAR_ID>* encoding,
00490                                GenericVector<char>* lengths,
00491                                int* best_total_length,
00492                                GenericVector<UNICHAR_ID>* best_encoding,
00493                                GenericVector<char>* best_lengths) const {
00494   if (str_index > *best_total_length) {
00495     // This is the best result so far.
00496     *best_total_length = str_index;
00497     *best_encoding = *encoding;
00498     if (best_lengths != NULL)
00499       *best_lengths = *lengths;
00500   }
00501   if (str_index == str_length) return;
00502   int encoding_index = encoding->size();
00503   // Find the length of the first matching unicharset member.
00504   int length = ids.minmatch(str + str_index);
00505   if (length == 0 || str_index + length > str_length) return;
00506   do {
00507     if (ids.contains(str + str_index, length)) {
00508       // Successful encoding so far.
00509       UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
00510       encoding->push_back(id);
00511       lengths->push_back(length);
00512       encode_string(str, str_index + length, str_length, encoding, lengths,
00513                     best_total_length, best_encoding, best_lengths);
00514       if (*best_total_length == str_length)
00515         return;  // Tail recursion success!
00516       // Failed with that length, truncate back and try again.
00517       encoding->truncate(encoding_index);
00518       lengths->truncate(encoding_index);
00519     }
00520     int step = UNICHAR::utf8_step(str + str_index + length);
00521     if (step == 0) step = 1;
00522     length += step;
00523   } while (length <= UNICHAR_LEN && str_index + length <= str_length);
00524 }
00525 
00526 // Gets the properties for a grapheme string, combining properties for
00527 // multiple characters in a meaningful way where possible.
00528 // Returns false if no valid match was found in the unicharset.
00529 // NOTE that script_id, mirror, and other_case refer to this unicharset on
00530 // return and will need translation if the target unicharset is different.
00531 bool UNICHARSET::GetStrProperties(const char* utf8_str,
00532                                   UNICHAR_PROPERTIES* props) const {
00533   props->Init();
00534   props->SetRangesEmpty();
00535   int total_unicodes = 0;
00536   GenericVector<UNICHAR_ID> encoding;
00537   if (!encode_string(utf8_str, true, &encoding, NULL, NULL))
00538     return false;  // Some part was invalid.
00539   for (int i = 0; i < encoding.size(); ++i) {
00540     int id = encoding[i];
00541     const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
00542     // Logical OR all the bools.
00543     if (src_props.isalpha) props->isalpha = true;
00544     if (src_props.islower) props->islower = true;
00545     if (src_props.isupper) props->isupper = true;
00546     if (src_props.isdigit) props->isdigit = true;
00547     if (src_props.ispunctuation) props->ispunctuation = true;
00548     if (src_props.isngram) props->isngram = true;
00549     if (src_props.enabled) props->enabled = true;
00550     // Min/max the tops/bottoms.
00551     UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
00552     UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
00553     UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
00554     UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
00555     float bearing = props->advance + src_props.bearing;
00556     if (total_unicodes == 0 || bearing < props->bearing) {
00557       props->bearing = bearing;
00558       props->bearing_sd = props->advance_sd + src_props.bearing_sd;
00559     }
00560     props->advance += src_props.advance;
00561     props->advance_sd += src_props.advance_sd;
00562     // With a single width, just use the widths stored in the unicharset.
00563     props->width = src_props.width;
00564     props->width_sd = src_props.width_sd;
00565     // Use the first script id, other_case, mirror, direction.
00566     // Note that these will need translation, except direction.
00567     if (total_unicodes == 0) {
00568       props->script_id = src_props.script_id;
00569       props->other_case = src_props.other_case;
00570       props->mirror = src_props.mirror;
00571       props->direction = src_props.direction;
00572     }
00573     // The normed string for the compound character is the concatenation of
00574     // the normed versions of the individual characters.
00575     props->normed += src_props.normed;
00576     ++total_unicodes;
00577   }
00578   if (total_unicodes > 1) {
00579     // Estimate the total widths from the advance - bearing.
00580     props->width = props->advance - props->bearing;
00581     props->width_sd = props->advance_sd + props->bearing_sd;
00582   }
00583   return total_unicodes > 0;
00584 }
00585 
00586 // TODO(rays) clean-up the order of functions to match unicharset.h.
00587 
00588 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
00589   unsigned int properties = 0;
00590   if (this->get_isalpha(id))
00591     properties |= ISALPHA_MASK;
00592   if (this->get_islower(id))
00593     properties |= ISLOWER_MASK;
00594   if (this->get_isupper(id))
00595     properties |= ISUPPER_MASK;
00596   if (this->get_isdigit(id))
00597     properties |= ISDIGIT_MASK;
00598   if (this->get_ispunctuation(id))
00599     properties |= ISPUNCTUATION_MASK;
00600   return properties;
00601 }
00602 
00603 char UNICHARSET::get_chartype(UNICHAR_ID id) const {
00604   if (this->get_isupper(id)) return 'A';
00605   if (this->get_islower(id)) return 'a';
00606   if (this->get_isalpha(id)) return 'x';
00607   if (this->get_isdigit(id)) return '0';
00608   if (this->get_ispunctuation(id)) return 'p';
00609   return 0;
00610 }
00611 
00612 void UNICHARSET::unichar_insert(const char* const unichar_repr) {
00613   if (!ids.contains(unichar_repr)) {
00614     if (strlen(unichar_repr) > UNICHAR_LEN) {
00615       fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
00616               int(strlen(unichar_repr)), unichar_repr);
00617       return;
00618     }
00619     if (size_used == size_reserved) {
00620       if (size_used == 0)
00621         reserve(8);
00622       else
00623         reserve(2 * size_used);
00624     }
00625 
00626     strcpy(unichars[size_used].representation, unichar_repr);
00627     this->set_script(size_used, null_script);
00628     // If the given unichar_repr represents a fragmented character, set
00629     // fragment property to a pointer to CHAR_FRAGMENT class instance with
00630     // information parsed from the unichar representation. Use the script
00631     // of the base unichar for the fragmented character if possible.
00632     CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
00633     this->unichars[size_used].properties.fragment = frag;
00634     if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
00635       this->unichars[size_used].properties.script_id =
00636         this->get_script(frag->get_unichar());
00637     }
00638     this->unichars[size_used].properties.enabled = true;
00639     ids.insert(unichar_repr, size_used);
00640     ++size_used;
00641   }
00642 }
00643 
00644 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
00645   return ids.contains(unichar_repr);
00646 }
00647 
00648 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
00649                                   int length) const {
00650   if (length == 0) {
00651     return false;
00652   }
00653   return ids.contains(unichar_repr, length);
00654 }
00655 
00656 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
00657                     const char* const unichar_repr) const {
00658   return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
00659 }
00660 
00661 bool UNICHARSET::save_to_string(STRING *str) const {
00662   const int kFileBufSize = 1024;
00663   char buffer[kFileBufSize + 1];
00664   snprintf(buffer, kFileBufSize, "%d\n", this->size());
00665   *str = buffer;
00666   for (UNICHAR_ID id = 0; id < this->size(); ++id) {
00667     int min_bottom, max_bottom, min_top, max_top;
00668     get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
00669     float width, width_sd;
00670     get_width_stats(id, &width, &width_sd);
00671     float bearing, bearing_sd;
00672     get_bearing_stats(id, &bearing, &bearing_sd);
00673     float advance, advance_sd;
00674     get_advance_stats(id, &advance, &advance_sd);
00675     unsigned int properties = this->get_properties(id);
00676     if (strcmp(this->id_to_unichar(id), " ") == 0) {
00677       snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
00678               this->get_script_from_script_id(this->get_script(id)),
00679               this->get_other_case(id));
00680     } else {
00681       snprintf(buffer, kFileBufSize,
00682               "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
00683               this->id_to_unichar(id), properties,
00684               min_bottom, max_bottom, min_top, max_top, width, width_sd,
00685               bearing, bearing_sd, advance, advance_sd,
00686               this->get_script_from_script_id(this->get_script(id)),
00687               this->get_other_case(id), this->get_direction(id),
00688               this->get_mirror(id), this->get_normed_unichar(id),
00689               this->debug_str(id).string());
00690     }
00691     *str += buffer;
00692   }
00693   return true;
00694 }
00695 
// TODO(rays) Replace with TFile everywhere.
// Reads lines, in the manner of fgets, out of a block of memory of a given
// size. The memory is only read, and the read position moves forward with
// every call.
class InMemoryFilePointer {
 public:
  InMemoryFilePointer(const char *memory, int mem_size)
      : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }

  // Copies bytes into orig_dst up to and including the next '\n', stopping
  // early after size - 1 bytes or at the end of the memory, then appends a
  // terminating 0. Returns orig_dst, or NULL if no byte was copied.
  // With size < 1 nothing is written and the result only tells whether any
  // input is left.
  char *fgets(char *orig_dst, int size) {
    const char *const limit = memory_ + mem_size_;
    if (size < 1) {
      if (fgets_ptr_ < limit) return orig_dst;
      return NULL;
    }

    const int capacity = size - 1;  // One byte is kept for the terminator.
    int copied = 0;
    while (copied < capacity && fgets_ptr_ < limit) {
      const char next = *fgets_ptr_++;
      orig_dst[copied++] = next;
      if (next == '\n') break;
    }
    orig_dst[copied] = 0;
    return copied == 0 ? NULL : orig_dst;
  }

 private:
  const char *memory_;     // Start of the memory block.
  const char *fgets_ptr_;  // Next byte that fgets will read.
  const int mem_size_;     // Size of the memory block in bytes.
};
00723 
00724 bool UNICHARSET::load_from_inmemory_file(
00725     const char *memory, int mem_size, bool skip_fragments) {
00726   InMemoryFilePointer mem_fp(memory, mem_size);
00727   TessResultCallback2<char *, char *, int> *fgets_cb =
00728       NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
00729   bool success = load_via_fgets(fgets_cb, skip_fragments);
00730   delete fgets_cb;
00731   return success;
00732 }
00733 
// Wraps a FILE stream so that its fgets can be turned into a callback.
// The stream is only borrowed: this class neither opens nor closes it.
class LocalFilePointer {
 public:
  // explicit, so that a FILE* never converts to a LocalFilePointer silently.
  explicit LocalFilePointer(FILE *stream) : fp_(stream) {}
  // Reads from the wrapped stream with the C library fgets, passing dst and
  // size straight through and returning its result.
  char *fgets(char *dst, int size) {
    return ::fgets(dst, size, fp_);
  }
 private:
  FILE *fp_;  // Stream that is read from.
};
00743 
00744 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
00745   LocalFilePointer lfp(file);
00746   TessResultCallback2<char *, char *, int> *fgets_cb =
00747       NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
00748   bool success = load_via_fgets(fgets_cb, skip_fragments);
00749   delete fgets_cb;
00750   return success;
00751 }
00752 
00753 bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
00754   TessResultCallback2<char *, char *, int> *fgets_cb =
00755       NewPermanentTessCallback(file, &tesseract::TFile::FGets);
00756   bool success = load_via_fgets(fgets_cb, skip_fragments);
00757   delete fgets_cb;
00758   return success;
00759 }
00760 
00761 bool UNICHARSET::load_via_fgets(
00762     TessResultCallback2<char *, char *, int> *fgets_cb,
00763     bool skip_fragments) {
00764   int unicharset_size;
00765   char buffer[256];
00766 
00767   this->clear();
00768   if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL ||
00769       sscanf(buffer, "%d", &unicharset_size) != 1) {
00770     return false;
00771   }
00772   this->reserve(unicharset_size);
00773   for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
00774     char unichar[256];
00775     unsigned int properties;
00776     char script[64];
00777 
00778     strcpy(script, null_script);
00779     int min_bottom = 0;
00780     int max_bottom = MAX_UINT8;
00781     int min_top = 0;
00782     int max_top = MAX_UINT8;
00783     float width = 0.0f;
00784     float width_sd = 0.0f;
00785     float bearing = 0.0f;
00786     float bearing_sd = 0.0f;
00787     float advance = 0.0f;
00788     float advance_sd = 0.0f;
00789     // TODO(eger): check that this default it ok
00790     // after enabling BiDi iterator for Arabic+Cube.
00791     int direction = UNICHARSET::U_LEFT_TO_RIGHT;
00792     UNICHAR_ID other_case = id;
00793     UNICHAR_ID mirror = id;
00794     char normed[64];
00795     int v = -1;
00796     if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL ||
00797         ((v = sscanf(buffer,
00798                      "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d %63s",
00799                      unichar, &properties,
00800                      &min_bottom, &max_bottom, &min_top, &max_top,
00801                      &width, &width_sd, &bearing, &bearing_sd,
00802                      &advance, &advance_sd, script, &other_case,
00803                      &direction, &mirror, normed)) != 17 &&
00804          (v = sscanf(buffer,
00805                      "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d",
00806                      unichar, &properties,
00807                      &min_bottom, &max_bottom, &min_top, &max_top,
00808                      &width, &width_sd, &bearing, &bearing_sd,
00809                      &advance, &advance_sd, script, &other_case,
00810                      &direction, &mirror)) != 16 &&
00811           (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
00812                       unichar, &properties,
00813                       &min_bottom, &max_bottom, &min_top, &max_top,
00814                       script, &other_case, &direction, &mirror)) != 10 &&
00815           (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
00816                       &min_bottom, &max_bottom, &min_top, &max_top,
00817                       script, &other_case)) != 8 &&
00818           (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
00819                       script, &other_case)) != 4 &&
00820           (v = sscanf(buffer, "%s %x %63s",
00821                       unichar, &properties, script)) != 3 &&
00822           (v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
00823       return false;
00824     }
00825 
00826     // Skip fragments if needed.
00827     CHAR_FRAGMENT *frag = NULL;
00828     if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
00829       int num_pieces = frag->get_total();
00830       delete frag;
00831       // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
00832       if (num_pieces > 1)
00833         continue;
00834     }
00835     // Insert unichar into unicharset and set its properties.
00836     if (strcmp(unichar, "NULL") == 0)
00837       this->unichar_insert(" ");
00838     else
00839       this->unichar_insert(unichar);
00840 
00841     this->set_isalpha(id, properties & ISALPHA_MASK);
00842     this->set_islower(id, properties & ISLOWER_MASK);
00843     this->set_isupper(id, properties & ISUPPER_MASK);
00844     this->set_isdigit(id, properties & ISDIGIT_MASK);
00845     this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
00846     this->set_isngram(id, false);
00847     this->set_script(id, script);
00848     this->unichars[id].properties.enabled = true;
00849     this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
00850     this->set_width_stats(id, width, width_sd);
00851     this->set_bearing_stats(id, bearing, bearing_sd);
00852     this->set_advance_stats(id, advance, advance_sd);
00853     this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
00854     ASSERT_HOST(other_case < unicharset_size);
00855     this->set_other_case(id, (v>3) ? other_case : id);
00856     ASSERT_HOST(mirror < unicharset_size);
00857     this->set_mirror(id, (v>8) ? mirror : id);
00858     this->set_normed(id, (v>16) ? normed : unichar);
00859   }
00860   post_load_setup();
00861   return true;
00862 }
00863 
00864 // Sets up internal data after loading the file, based on the char
00865 // properties. Called from load_from_file, but also needs to be run
00866 // during set_unicharset_properties.
00867 void UNICHARSET::post_load_setup() {
00868   // Number of alpha chars with the case property minus those without,
00869   // in order to determine that half the alpha chars have case.
00870   int net_case_alphas = 0;
00871   int x_height_alphas = 0;
00872   int cap_height_alphas = 0;
00873   top_bottom_set_ = false;
00874   for (UNICHAR_ID id = 0; id < size_used; ++id) {
00875     int min_bottom = 0;
00876     int max_bottom = MAX_UINT8;
00877     int min_top = 0;
00878     int max_top = MAX_UINT8;
00879     get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
00880     if (min_top > 0)
00881       top_bottom_set_ = true;
00882     if (get_isalpha(id)) {
00883       if (get_islower(id) || get_isupper(id))
00884         ++net_case_alphas;
00885       else
00886         --net_case_alphas;
00887       if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
00888         ++x_height_alphas;
00889       else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
00890         ++cap_height_alphas;
00891     }
00892     set_normed_ids(id);
00893   }
00894 
00895   script_has_upper_lower_ = net_case_alphas > 0;
00896   script_has_xheight_ = script_has_upper_lower_ ||
00897       (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
00898        cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
00899 
00900   null_sid_ = get_script_id_from_name(null_script);
00901   ASSERT_HOST(null_sid_ == 0);
00902   common_sid_ = get_script_id_from_name("Common");
00903   latin_sid_ = get_script_id_from_name("Latin");
00904   cyrillic_sid_ = get_script_id_from_name("Cyrillic");
00905   greek_sid_ = get_script_id_from_name("Greek");
00906   han_sid_ = get_script_id_from_name("Han");
00907   hiragana_sid_ = get_script_id_from_name("Hiragana");
00908   katakana_sid_ = get_script_id_from_name("Katakana");
00909 
00910   // Compute default script. Use the highest-counting alpha script, that is
00911   // not the common script, as that still contains some "alphas".
00912   int* script_counts = new int[script_table_size_used];
00913   memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
00914   for (int id = 0; id < size_used; ++id) {
00915     if (get_isalpha(id)) {
00916       ++script_counts[get_script(id)];
00917     }
00918   }
00919   default_sid_ = 0;
00920   for (int s = 1; s < script_table_size_used; ++s) {
00921     if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
00922       default_sid_ = s;
00923   }
00924   delete [] script_counts;
00925 }
00926 
00927 // Returns true if right_to_left scripts are significant in the unicharset,
00928 // but without being so sensitive that "universal" unicharsets containing
00929 // characters from many scripts, like orientation and script detection,
00930 // look like they are right_to_left.
00931 bool UNICHARSET::major_right_to_left() const {
00932   int ltr_count = 0;
00933   int rtl_count = 0;
00934   for (int id = 0; id < size_used; ++id) {
00935     int dir = get_direction(id);
00936     if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
00937     if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
00938         dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
00939         dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
00940   }
00941   return rtl_count > ltr_count;
00942 }
00943 
00944 // Set a whitelist and/or blacklist of characters to recognize.
00945 // An empty or NULL whitelist enables everything (minus any blacklist).
00946 // An empty or NULL blacklist disables nothing.
00947 // An empty or NULL blacklist has no effect.
00948 void UNICHARSET::set_black_and_whitelist(const char* blacklist,
00949                                          const char* whitelist,
00950                                          const char* unblacklist) {
00951   bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
00952   // Set everything to default
00953   for (int ch = 0; ch < size_used; ++ch)
00954     unichars[ch].properties.enabled = def_enabled;
00955   if (!def_enabled) {
00956     // Enable the whitelist.
00957     GenericVector<UNICHAR_ID> encoding;
00958     encode_string(whitelist, false, &encoding, NULL, NULL);
00959     for (int i = 0; i < encoding.size(); ++i) {
00960       if (encoding[i] != INVALID_UNICHAR_ID)
00961         unichars[encoding[i]].properties.enabled = true;
00962     }
00963   }
00964   if (blacklist != NULL && blacklist[0] != '\0') {
00965     // Disable the blacklist.
00966     GenericVector<UNICHAR_ID> encoding;
00967     encode_string(blacklist, false, &encoding, NULL, NULL);
00968     for (int i = 0; i < encoding.size(); ++i) {
00969       if (encoding[i] != INVALID_UNICHAR_ID)
00970         unichars[encoding[i]].properties.enabled = false;
00971     }
00972   }
00973   if (unblacklist != NULL && unblacklist[0] != '\0') {
00974     // Re-enable the unblacklist.
00975     GenericVector<UNICHAR_ID> encoding;
00976     encode_string(unblacklist, false, &encoding, NULL, NULL);
00977     for (int i = 0; i < encoding.size(); ++i) {
00978       if (encoding[i] != INVALID_UNICHAR_ID)
00979         unichars[encoding[i]].properties.enabled = true;
00980     }
00981   }
00982 }
00983 
00984 // Returns true if there are any repeated unicodes in the normalized
00985 // text of any unichar-id in the unicharset.
00986 bool UNICHARSET::AnyRepeatedUnicodes() const {
00987   int start_id = 0;
00988   if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
00989   for (int id = start_id; id < size_used; ++id) {
00990     // Convert to unicodes.
00991     GenericVector<int> unicodes;
00992     if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) &&
00993         unicodes.size() > 1) {
00994       for (int u = 1; u < unicodes.size(); ++u) {
00995         if (unicodes[u - 1] == unicodes[u]) return true;
00996       }
00997     }
00998   }
00999   return false;
01000 }
01001 
01002 int UNICHARSET::add_script(const char* script) {
01003   for (int i = 0; i < script_table_size_used; ++i) {
01004     if (strcmp(script, script_table[i]) == 0)
01005       return i;
01006   }
01007   if (script_table_size_reserved == 0) {
01008     script_table_size_reserved = 8;
01009     script_table = new char*[script_table_size_reserved];
01010   }
01011   if (script_table_size_used + 1 >= script_table_size_reserved) {
01012     char** new_script_table = new char*[script_table_size_reserved * 2];
01013     memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*));
01014     delete[] script_table;
01015     script_table = new_script_table;
01016       script_table_size_reserved = 2 * script_table_size_reserved;
01017   }
01018   script_table[script_table_size_used] = new char[strlen(script) + 1];
01019   strcpy(script_table[script_table_size_used], script);
01020   return script_table_size_used++;
01021 }
01022 
01023 // Returns the string that represents a fragment
01024 // with the given unichar, pos and total.
01025 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
01026                                 bool natural) {
01027   if (total == 1) return STRING(unichar);
01028   STRING result = "";
01029   result += kSeparator;
01030   result += unichar;
01031   char buffer[kMaxLen];
01032   snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
01033            natural ? kNaturalFlag : kSeparator, total);
01034   result += buffer;
01035   return result;
01036 }
01037 
01038 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
01039   const char *ptr = string;
01040   int len = strlen(string);
01041   if (len < kMinLen || *ptr != kSeparator) {
01042     return NULL;  // this string can not represent a fragment
01043   }
01044   ptr++;  // move to the next character
01045   int step = 0;
01046   while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
01047     step += UNICHAR::utf8_step(ptr + step);
01048   }
01049   if (step == 0 || step > UNICHAR_LEN) {
01050     return NULL;  // no character for unichar or the character is too long
01051   }
01052   char unichar[UNICHAR_LEN + 1];
01053   strncpy(unichar, ptr, step);
01054   unichar[step] = '\0';  // null terminate unichar
01055   ptr += step;  // move to the next fragment separator
01056   int pos = 0;
01057   int total = 0;
01058   bool natural = false;
01059   char *end_ptr = NULL;
01060   for (int i = 0; i < 2; i++) {
01061     if (ptr > string + len || *ptr != kSeparator) {
01062       if (i == 1 && *ptr == kNaturalFlag)
01063         natural = true;
01064       else
01065         return NULL;  // Failed to parse fragment representation.
01066     }
01067     ptr++;  // move to the next character
01068     i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
01069       : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
01070     ptr = end_ptr;
01071   }
01072   if (ptr != string + len) {
01073     return NULL;  // malformed fragment representation
01074   }
01075   CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
01076   fragment->set_all(unichar, pos, total, natural);
01077   return fragment;
01078 }
01079 
01080 int UNICHARSET::get_script_id_from_name(const char* script_name) const {
01081   for (int i = 0; i < script_table_size_used; ++i) {
01082     if (strcmp(script_name, script_table[i]) == 0)
01083       return i;
01084   }
01085   return 0;  // 0 is always the null_script
01086 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines