|
tesseract 3.04.01
|
00001 00002 // File: unicharset.cpp 00003 // Description: Unicode character/ligature set class. 00004 // Author: Thomas Kielbus 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include "unicharset.h" 00021 00022 #include <assert.h> 00023 #include <stdio.h> 00024 #include <string.h> 00025 00026 #include "params.h" 00027 #include "serialis.h" 00028 #include "tesscallback.h" 00029 #include "tprintf.h" 00030 #include "unichar.h" 00031 00032 // Special character used in representing character fragments. 00033 static const char kSeparator = '|'; 00034 // Special character used in representing 'natural' character fragments. 00035 static const char kNaturalFlag = 'n'; 00036 00037 static const int ISALPHA_MASK = 0x1; 00038 static const int ISLOWER_MASK = 0x2; 00039 static const int ISUPPER_MASK = 0x4; 00040 static const int ISDIGIT_MASK = 0x8; 00041 static const int ISPUNCTUATION_MASK = 0x10; 00042 00043 // Y coordinate threshold for determining cap-height vs x-height. 00044 // TODO(rays) Bring the global definition down to the ccutil library level, 00045 // so this constant is relative to some other constants. 00046 static const int kMeanlineThreshold = 220; 00047 // Let C be the number of alpha chars for which all tops exceed 00048 // kMeanlineThreshold, and X the number of alpha chars for which all 00049 // tops are below kMeanlineThreshold, then if X > C * 00050 // kMinXHeightFraction and C > X * kMinCapHeightFraction or more than 00051 // half the alpha characters have upper or lower case, then the 00052 // unicharset "has x-height". 00053 const double kMinXHeightFraction = 0.25; 00054 const double kMinCapHeightFraction = 0.05; 00055 00056 /*static */ 00057 const char* UNICHARSET::kCustomLigatures[][2] = { 00058 {"ct", "\uE003"}, // c + t -> U+E003 00059 {"ſh", "\uE006"}, // long-s + h -> U+E006 00060 {"ſi", "\uE007"}, // long-s + i -> U+E007 00061 {"ſl", "\uE008"}, // long-s + l -> U+E008 00062 {"ſſ", "\uE009"}, // long-s + long-s -> U+E009 00063 {NULL, NULL} 00064 }; 00065 00066 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum. 00067 const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = { 00068 " ", 00069 "Joined", 00070 "|Broken|0|1" 00071 }; 00072 00073 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() { 00074 Init(); 00075 } 00076 00077 // Initialize all properties to sensible default values. 00078 void UNICHARSET::UNICHAR_PROPERTIES::Init() { 00079 isalpha = false; 00080 islower = false; 00081 isupper = false; 00082 isdigit = false; 00083 ispunctuation = false; 00084 isngram = false; 00085 enabled = false; 00086 SetRangesOpen(); 00087 script_id = 0; 00088 other_case = 0; 00089 mirror = 0; 00090 normed = ""; 00091 direction = UNICHARSET::U_LEFT_TO_RIGHT; 00092 fragment = NULL; 00093 } 00094 00095 // Sets all ranges wide open. Initialization default in case there are 00096 // no useful values available. 00097 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() { 00098 min_bottom = 0; 00099 max_bottom = MAX_UINT8; 00100 min_top = 0; 00101 max_top = MAX_UINT8; 00102 width = 0.0f; 00103 width_sd = 0.0f; 00104 bearing = 0.0f; 00105 bearing_sd = 0.0f; 00106 advance = 0.0f; 00107 advance_sd = 0.0f; 00108 } 00109 00110 // Sets all ranges to empty. Used before expanding with font-based data. 00111 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() { 00112 min_bottom = MAX_UINT8; 00113 max_bottom = 0; 00114 min_top = MAX_UINT8; 00115 max_top = 0; 00116 width = 0.0f; 00117 width_sd = 0.0f; 00118 bearing = 0.0f; 00119 bearing_sd = 0.0f; 00120 advance = 0.0f; 00121 advance_sd = 0.0f; 00122 } 00123 00124 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats 00125 // is emtpy. 00126 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const { 00127 return width == 0.0f || advance == 0.0f; 00128 } 00129 00130 // Expands the ranges with the ranges from the src properties. 00131 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom( 00132 const UNICHAR_PROPERTIES& src) { 00133 UpdateRange(src.min_bottom, &min_bottom, &max_bottom); 00134 UpdateRange(src.max_bottom, &min_bottom, &max_bottom); 00135 UpdateRange(src.min_top, &min_top, &max_top); 00136 UpdateRange(src.max_top, &min_top, &max_top); 00137 if (src.width_sd > width_sd) { 00138 width = src.width; 00139 width_sd = src.width_sd; 00140 } 00141 if (src.bearing_sd > bearing_sd) { 00142 bearing = src.bearing; 00143 bearing_sd = src.bearing_sd; 00144 } 00145 if (src.advance_sd > advance_sd) { 00146 advance = src.advance; 00147 advance_sd = src.advance_sd; 00148 } 00149 } 00150 00151 // Copies the properties from src into this. 00152 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) { 00153 // Apart from the fragment, everything else can be done with a default copy. 00154 CHAR_FRAGMENT* saved_fragment = fragment; 00155 *this = src; // Bitwise copy. 00156 fragment = saved_fragment; 00157 } 00158 00159 UNICHARSET::UNICHARSET() : 00160 unichars(NULL), 00161 ids(), 00162 size_used(0), 00163 size_reserved(0), 00164 script_table(NULL), 00165 script_table_size_used(0), 00166 null_script("NULL") { 00167 clear(); 00168 for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) { 00169 unichar_insert(kSpecialUnicharCodes[i]); 00170 if (i == UNICHAR_JOINED) 00171 set_isngram(i, true); 00172 } 00173 } 00174 00175 UNICHARSET::~UNICHARSET() { 00176 clear(); 00177 } 00178 00179 void UNICHARSET::reserve(int unichars_number) { 00180 if (unichars_number > size_reserved) { 00181 UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number]; 00182 for (int i = 0; i < size_used; ++i) 00183 unichars_new[i] = unichars[i]; 00184 for (int j = size_used; j < unichars_number; ++j) { 00185 unichars_new[j].properties.script_id = add_script(null_script); 00186 } 00187 delete[] unichars; 00188 unichars = unichars_new; 00189 size_reserved = unichars_number; 00190 } 00191 } 00192 00193 UNICHAR_ID 00194 UNICHARSET::unichar_to_id(const char* const unichar_repr) const { 00195 return ids.contains(unichar_repr) ? 00196 ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID; 00197 } 00198 00199 UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr, 00200 int length) const { 00201 assert(length > 0 && length <= UNICHAR_LEN); 00202 return ids.contains(unichar_repr, length) ? 00203 ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID; 00204 } 00205 00206 // Return the minimum number of bytes that matches a legal UNICHAR_ID, 00207 // while leaving the rest of the string encodable. Returns 0 if the 00208 // beginning of the string is not encodable. 00209 // WARNING: this function now encodes the whole string for precision. 00210 // Use encode_string in preference to repeatedly calling step. 00211 int UNICHARSET::step(const char* str) const { 00212 GenericVector<UNICHAR_ID> encoding; 00213 GenericVector<char> lengths; 00214 encode_string(str, true, &encoding, &lengths, NULL); 00215 if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0; 00216 return lengths[0]; 00217 } 00218 00219 // Return whether the given UTF-8 string is encodable with this UNICHARSET. 00220 // If not encodable, write the first byte offset which cannot be converted 00221 // into the second (return) argument. 00222 bool UNICHARSET::encodable_string(const char *str, 00223 int *first_bad_position) const { 00224 GenericVector<UNICHAR_ID> encoding; 00225 return encode_string(str, true, &encoding, NULL, first_bad_position); 00226 } 00227 00228 // Encodes the given UTF-8 string with this UNICHARSET. 00229 // Returns true if the encoding succeeds completely, false if there is at 00230 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case 00231 // the rest of the string is still encoded. 00232 // If lengths is not NULL, then it is filled with the corresponding 00233 // byte length of each encoded UNICHAR_ID. 00234 bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure, 00235 GenericVector<UNICHAR_ID>* encoding, 00236 GenericVector<char>* lengths, 00237 int* encoded_length) const { 00238 GenericVector<UNICHAR_ID> working_encoding; 00239 GenericVector<char> working_lengths; 00240 GenericVector<char> best_lengths; 00241 encoding->truncate(0); // Just in case str is empty. 00242 int str_length = strlen(str); 00243 int str_pos = 0; 00244 bool perfect = true; 00245 while (str_pos < str_length) { 00246 encode_string(str, str_pos, str_length, &working_encoding, &working_lengths, 00247 &str_pos, encoding, &best_lengths); 00248 if (str_pos < str_length) { 00249 // This is a non-match. Skip one utf-8 character. 00250 perfect = false; 00251 if (give_up_on_failure) break; 00252 int step = UNICHAR::utf8_step(str + str_pos); 00253 if (step == 0) step = 1; 00254 encoding->push_back(INVALID_UNICHAR_ID); 00255 best_lengths.push_back(step); 00256 str_pos += step; 00257 working_encoding = *encoding; 00258 working_lengths = best_lengths; 00259 } 00260 } 00261 if (lengths != NULL) *lengths = best_lengths; 00262 if (encoded_length != NULL) *encoded_length = str_pos; 00263 return perfect; 00264 } 00265 00266 const char* UNICHARSET::id_to_unichar(UNICHAR_ID id) const { 00267 if (id == INVALID_UNICHAR_ID) { 00268 return INVALID_UNICHAR; 00269 } 00270 ASSERT_HOST(id < this->size()); 00271 return unichars[id].representation; 00272 } 00273 00274 const char* UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const { 00275 if (id == INVALID_UNICHAR_ID) { 00276 return INVALID_UNICHAR; 00277 } 00278 ASSERT_HOST(id < this->size()); 00279 // Resolve from the kCustomLigatures table if this is a private encoding. 00280 if (get_isprivate(id)) { 00281 const char* ch = id_to_unichar(id); 00282 for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) { 00283 if (!strcmp(ch, kCustomLigatures[i][1])) { 00284 return kCustomLigatures[i][0]; 00285 } 00286 } 00287 } 00288 // Otherwise return the stored representation. 00289 return unichars[id].representation; 00290 } 00291 00292 // Return a STRING that reformats the utf8 str into the str followed 00293 // by its hex unicodes. 00294 STRING UNICHARSET::debug_utf8_str(const char* str) { 00295 STRING result = str; 00296 result += " ["; 00297 int step = 1; 00298 // Chop into unicodes and code each as hex. 00299 for (int i = 0; str[i] != '\0'; i += step) { 00300 char hex[sizeof(int) * 2 + 1]; 00301 step = UNICHAR::utf8_step(str + i); 00302 if (step == 0) { 00303 step = 1; 00304 sprintf(hex, "%x", str[i]); 00305 } else { 00306 UNICHAR ch(str + i, step); 00307 sprintf(hex, "%x", ch.first_uni()); 00308 } 00309 result += hex; 00310 result += " "; 00311 } 00312 result += "]"; 00313 return result; 00314 } 00315 00316 // Return a STRING containing debug information on the unichar, including 00317 // the id_to_unichar, its hex unicodes and the properties. 00318 STRING UNICHARSET::debug_str(UNICHAR_ID id) const { 00319 if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id)); 00320 const CHAR_FRAGMENT *fragment = this->get_fragment(id); 00321 if (fragment) { 00322 return fragment->to_string(); 00323 } 00324 const char* str = id_to_unichar(id); 00325 STRING result = debug_utf8_str(str); 00326 // Append a for lower alpha, A for upper alpha, and x if alpha but neither. 00327 if (get_isalpha(id)) { 00328 if (get_islower(id)) 00329 result += "a"; 00330 else if (get_isupper(id)) 00331 result += "A"; 00332 else 00333 result += "x"; 00334 } 00335 // Append 0 if a digit. 00336 if (get_isdigit(id)) { 00337 result += "0"; 00338 } 00339 // Append p is a punctuation symbol. 00340 if (get_ispunctuation(id)) { 00341 result += "p"; 00342 } 00343 return result; 00344 } 00345 00346 // Sets the normed_ids vector from the normed string. normed_ids is not 00347 // stored in the file, and needs to be set when the UNICHARSET is loaded. 00348 void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) { 00349 unichars[unichar_id].properties.normed_ids.truncate(0); 00350 if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') { 00351 unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE); 00352 } else if (!encode_string(unichars[unichar_id].properties.normed.string(), 00353 true, &unichars[unichar_id].properties.normed_ids, 00354 NULL, NULL)) { 00355 unichars[unichar_id].properties.normed_ids.truncate(0); 00356 unichars[unichar_id].properties.normed_ids.push_back(unichar_id); 00357 } 00358 } 00359 00360 // Returns whether the unichar id represents a unicode value in the private use 00361 // area. We use this range only internally to represent uncommon ligatures 00362 // (eg. 'ct') that do not have regular unicode values. 00363 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const { 00364 UNICHAR uc(id_to_unichar(unichar_id), -1); 00365 int uni = uc.first_uni(); 00366 return (uni >= 0xE000 && uni <= 0xF8FF); 00367 } 00368 00369 00370 // Sets all ranges to empty, so they can be expanded to set the values. 00371 void UNICHARSET::set_ranges_empty() { 00372 for (int id = 0; id < size_used; ++id) { 00373 unichars[id].properties.SetRangesEmpty(); 00374 } 00375 } 00376 00377 // Sets all the properties for this unicharset given a src unicharset with 00378 // everything set. The unicharsets don't have to be the same, and graphemes 00379 // are correctly accounted for. 00380 void UNICHARSET::PartialSetPropertiesFromOther(int start_index, 00381 const UNICHARSET& src) { 00382 for (int ch = start_index; ch < size_used; ++ch) { 00383 const char* utf8 = id_to_unichar(ch); 00384 UNICHAR_PROPERTIES properties; 00385 if (src.GetStrProperties(utf8, &properties)) { 00386 // Setup the script_id, other_case, and mirror properly. 00387 const char* script = src.get_script_from_script_id(properties.script_id); 00388 properties.script_id = add_script(script); 00389 const char* other_case = src.id_to_unichar(properties.other_case); 00390 if (contains_unichar(other_case)) { 00391 properties.other_case = unichar_to_id(other_case); 00392 } else { 00393 properties.other_case = ch; 00394 } 00395 const char* mirror_str = src.id_to_unichar(properties.mirror); 00396 if (contains_unichar(mirror_str)) { 00397 properties.mirror = unichar_to_id(mirror_str); 00398 } else { 00399 properties.mirror = ch; 00400 } 00401 unichars[ch].properties.CopyFrom(properties); 00402 set_normed_ids(ch); 00403 } 00404 } 00405 } 00406 00407 // Expands the tops and bottoms and widths for this unicharset given a 00408 // src unicharset with ranges in it. The unicharsets don't have to be the 00409 // same, and graphemes are correctly accounted for. 00410 void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) { 00411 for (int ch = 0; ch < size_used; ++ch) { 00412 const char* utf8 = id_to_unichar(ch); 00413 UNICHAR_PROPERTIES properties; 00414 if (src.GetStrProperties(utf8, &properties)) { 00415 // Expand just the ranges from properties. 00416 unichars[ch].properties.ExpandRangesFrom(properties); 00417 } 00418 } 00419 } 00420 00421 // Makes this a copy of src. Clears this completely first, so the automatic 00422 // ids will not be present in this if not in src. Does NOT reorder the set! 00423 void UNICHARSET::CopyFrom(const UNICHARSET& src) { 00424 clear(); 00425 for (int ch = 0; ch < src.size_used; ++ch) { 00426 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; 00427 const char* utf8 = src.id_to_unichar(ch); 00428 unichar_insert(utf8); 00429 unichars[ch].properties.ExpandRangesFrom(src_props); 00430 } 00431 // Set properties, including mirror and other_case, WITHOUT reordering 00432 // the unicharset. 00433 PartialSetPropertiesFromOther(0, src); 00434 } 00435 00436 // For each id in src, if it does not occur in this, add it, as in 00437 // SetPropertiesFromOther, otherwise expand the ranges, as in 00438 // ExpandRangesFromOther. 00439 void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) { 00440 int initial_used = size_used; 00441 for (int ch = 0; ch < src.size_used; ++ch) { 00442 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; 00443 const char* utf8 = src.id_to_unichar(ch); 00444 if (ch >= SPECIAL_UNICHAR_CODES_COUNT && src_props.AnyRangeEmpty()) { 00445 // Only use fully valid entries. 00446 tprintf("Bad properties for index %d, char %s: " 00447 "%d,%d %d,%d %g,%g %g,%g %g,%g\n", 00448 ch, utf8, src_props.min_bottom, src_props.max_bottom, 00449 src_props.min_top, src_props.max_top, 00450 src_props.width, src_props.width_sd, 00451 src_props.bearing, src_props.bearing_sd, 00452 src_props.advance, src_props.advance_sd); 00453 continue; 00454 } 00455 int id = size_used; 00456 if (contains_unichar(utf8)) { 00457 id = unichar_to_id(utf8); 00458 // Just expand current ranges. 00459 unichars[id].properties.ExpandRangesFrom(src_props); 00460 } else { 00461 unichar_insert(utf8); 00462 unichars[id].properties.SetRangesEmpty(); 00463 } 00464 } 00465 // Set properties, including mirror and other_case, WITHOUT reordering 00466 // the unicharset. 00467 PartialSetPropertiesFromOther(initial_used, src); 00468 } 00469 00470 // Returns true if the acceptable ranges of the tops of the characters do 00471 // not overlap, making their x-height calculations distinct. 00472 bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const { 00473 int overlap = MIN(unichars[id1].properties.max_top, 00474 unichars[id2].properties.max_top) - 00475 MAX(unichars[id1].properties.min_top, 00476 unichars[id2].properties.min_top); 00477 return overlap <= 0; 00478 } 00479 00480 // Internal recursive version of encode_string above. 00481 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that 00482 // each UNICHAR_ID uses the least possible part of the utf8 str. 00483 // It does this by depth-first tail recursion on increasing length matches 00484 // to the UNICHARSET, saving the first encountered result that encodes the 00485 // maximum total length of str. It stops on a failure to encode to make 00486 // the overall process of encoding a partially failed string more efficient. 00487 // See unicharset.h for definition of the args. 00488 void UNICHARSET::encode_string(const char* str, int str_index, int str_length, 00489 GenericVector<UNICHAR_ID>* encoding, 00490 GenericVector<char>* lengths, 00491 int* best_total_length, 00492 GenericVector<UNICHAR_ID>* best_encoding, 00493 GenericVector<char>* best_lengths) const { 00494 if (str_index > *best_total_length) { 00495 // This is the best result so far. 00496 *best_total_length = str_index; 00497 *best_encoding = *encoding; 00498 if (best_lengths != NULL) 00499 *best_lengths = *lengths; 00500 } 00501 if (str_index == str_length) return; 00502 int encoding_index = encoding->size(); 00503 // Find the length of the first matching unicharset member. 00504 int length = ids.minmatch(str + str_index); 00505 if (length == 0 || str_index + length > str_length) return; 00506 do { 00507 if (ids.contains(str + str_index, length)) { 00508 // Successful encoding so far. 00509 UNICHAR_ID id = ids.unichar_to_id(str + str_index, length); 00510 encoding->push_back(id); 00511 lengths->push_back(length); 00512 encode_string(str, str_index + length, str_length, encoding, lengths, 00513 best_total_length, best_encoding, best_lengths); 00514 if (*best_total_length == str_length) 00515 return; // Tail recursion success! 00516 // Failed with that length, truncate back and try again. 00517 encoding->truncate(encoding_index); 00518 lengths->truncate(encoding_index); 00519 } 00520 int step = UNICHAR::utf8_step(str + str_index + length); 00521 if (step == 0) step = 1; 00522 length += step; 00523 } while (length <= UNICHAR_LEN && str_index + length <= str_length); 00524 } 00525 00526 // Gets the properties for a grapheme string, combining properties for 00527 // multiple characters in a meaningful way where possible. 00528 // Returns false if no valid match was found in the unicharset. 00529 // NOTE that script_id, mirror, and other_case refer to this unicharset on 00530 // return and will need translation if the target unicharset is different. 00531 bool UNICHARSET::GetStrProperties(const char* utf8_str, 00532 UNICHAR_PROPERTIES* props) const { 00533 props->Init(); 00534 props->SetRangesEmpty(); 00535 int total_unicodes = 0; 00536 GenericVector<UNICHAR_ID> encoding; 00537 if (!encode_string(utf8_str, true, &encoding, NULL, NULL)) 00538 return false; // Some part was invalid. 00539 for (int i = 0; i < encoding.size(); ++i) { 00540 int id = encoding[i]; 00541 const UNICHAR_PROPERTIES& src_props = unichars[id].properties; 00542 // Logical OR all the bools. 00543 if (src_props.isalpha) props->isalpha = true; 00544 if (src_props.islower) props->islower = true; 00545 if (src_props.isupper) props->isupper = true; 00546 if (src_props.isdigit) props->isdigit = true; 00547 if (src_props.ispunctuation) props->ispunctuation = true; 00548 if (src_props.isngram) props->isngram = true; 00549 if (src_props.enabled) props->enabled = true; 00550 // Min/max the tops/bottoms. 00551 UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom); 00552 UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom); 00553 UpdateRange(src_props.min_top, &props->min_top, &props->max_top); 00554 UpdateRange(src_props.max_top, &props->min_top, &props->max_top); 00555 float bearing = props->advance + src_props.bearing; 00556 if (total_unicodes == 0 || bearing < props->bearing) { 00557 props->bearing = bearing; 00558 props->bearing_sd = props->advance_sd + src_props.bearing_sd; 00559 } 00560 props->advance += src_props.advance; 00561 props->advance_sd += src_props.advance_sd; 00562 // With a single width, just use the widths stored in the unicharset. 00563 props->width = src_props.width; 00564 props->width_sd = src_props.width_sd; 00565 // Use the first script id, other_case, mirror, direction. 00566 // Note that these will need translation, except direction. 00567 if (total_unicodes == 0) { 00568 props->script_id = src_props.script_id; 00569 props->other_case = src_props.other_case; 00570 props->mirror = src_props.mirror; 00571 props->direction = src_props.direction; 00572 } 00573 // The normed string for the compound character is the concatenation of 00574 // the normed versions of the individual characters. 00575 props->normed += src_props.normed; 00576 ++total_unicodes; 00577 } 00578 if (total_unicodes > 1) { 00579 // Estimate the total widths from the advance - bearing. 00580 props->width = props->advance - props->bearing; 00581 props->width_sd = props->advance_sd + props->bearing_sd; 00582 } 00583 return total_unicodes > 0; 00584 } 00585 00586 // TODO(rays) clean-up the order of functions to match unicharset.h. 00587 00588 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const { 00589 unsigned int properties = 0; 00590 if (this->get_isalpha(id)) 00591 properties |= ISALPHA_MASK; 00592 if (this->get_islower(id)) 00593 properties |= ISLOWER_MASK; 00594 if (this->get_isupper(id)) 00595 properties |= ISUPPER_MASK; 00596 if (this->get_isdigit(id)) 00597 properties |= ISDIGIT_MASK; 00598 if (this->get_ispunctuation(id)) 00599 properties |= ISPUNCTUATION_MASK; 00600 return properties; 00601 } 00602 00603 char UNICHARSET::get_chartype(UNICHAR_ID id) const { 00604 if (this->get_isupper(id)) return 'A'; 00605 if (this->get_islower(id)) return 'a'; 00606 if (this->get_isalpha(id)) return 'x'; 00607 if (this->get_isdigit(id)) return '0'; 00608 if (this->get_ispunctuation(id)) return 'p'; 00609 return 0; 00610 } 00611 00612 void UNICHARSET::unichar_insert(const char* const unichar_repr) { 00613 if (!ids.contains(unichar_repr)) { 00614 if (strlen(unichar_repr) > UNICHAR_LEN) { 00615 fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n", 00616 int(strlen(unichar_repr)), unichar_repr); 00617 return; 00618 } 00619 if (size_used == size_reserved) { 00620 if (size_used == 0) 00621 reserve(8); 00622 else 00623 reserve(2 * size_used); 00624 } 00625 00626 strcpy(unichars[size_used].representation, unichar_repr); 00627 this->set_script(size_used, null_script); 00628 // If the given unichar_repr represents a fragmented character, set 00629 // fragment property to a pointer to CHAR_FRAGMENT class instance with 00630 // information parsed from the unichar representation. Use the script 00631 // of the base unichar for the fragmented character if possible. 00632 CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr); 00633 this->unichars[size_used].properties.fragment = frag; 00634 if (frag != NULL && this->contains_unichar(frag->get_unichar())) { 00635 this->unichars[size_used].properties.script_id = 00636 this->get_script(frag->get_unichar()); 00637 } 00638 this->unichars[size_used].properties.enabled = true; 00639 ids.insert(unichar_repr, size_used); 00640 ++size_used; 00641 } 00642 } 00643 00644 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const { 00645 return ids.contains(unichar_repr); 00646 } 00647 00648 bool UNICHARSET::contains_unichar(const char* const unichar_repr, 00649 int length) const { 00650 if (length == 0) { 00651 return false; 00652 } 00653 return ids.contains(unichar_repr, length); 00654 } 00655 00656 bool UNICHARSET::eq(UNICHAR_ID unichar_id, 00657 const char* const unichar_repr) const { 00658 return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0; 00659 } 00660 00661 bool UNICHARSET::save_to_string(STRING *str) const { 00662 const int kFileBufSize = 1024; 00663 char buffer[kFileBufSize + 1]; 00664 snprintf(buffer, kFileBufSize, "%d\n", this->size()); 00665 *str = buffer; 00666 for (UNICHAR_ID id = 0; id < this->size(); ++id) { 00667 int min_bottom, max_bottom, min_top, max_top; 00668 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top); 00669 float width, width_sd; 00670 get_width_stats(id, &width, &width_sd); 00671 float bearing, bearing_sd; 00672 get_bearing_stats(id, &bearing, &bearing_sd); 00673 float advance, advance_sd; 00674 get_advance_stats(id, &advance, &advance_sd); 00675 unsigned int properties = this->get_properties(id); 00676 if (strcmp(this->id_to_unichar(id), " ") == 0) { 00677 snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties, 00678 this->get_script_from_script_id(this->get_script(id)), 00679 this->get_other_case(id)); 00680 } else { 00681 snprintf(buffer, kFileBufSize, 00682 "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n", 00683 this->id_to_unichar(id), properties, 00684 min_bottom, max_bottom, min_top, max_top, width, width_sd, 00685 bearing, bearing_sd, advance, advance_sd, 00686 this->get_script_from_script_id(this->get_script(id)), 00687 this->get_other_case(id), this->get_direction(id), 00688 this->get_mirror(id), this->get_normed_unichar(id), 00689 this->debug_str(id).string()); 00690 } 00691 *str += buffer; 00692 } 00693 return true; 00694 } 00695 00696 // TODO(rays) Replace with TFile everywhere. 00697 class InMemoryFilePointer { 00698 public: 00699 InMemoryFilePointer(const char *memory, int mem_size) 00700 : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { } 00701 00702 char *fgets(char *orig_dst, int size) { 00703 const char *src_end = memory_ + mem_size_; 00704 char *dst_end = orig_dst + size - 1; 00705 if (size < 1) { 00706 return fgets_ptr_ < src_end ? orig_dst : NULL; 00707 } 00708 00709 char *dst = orig_dst; 00710 char ch = '^'; 00711 while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') { 00712 ch = *dst++ = *fgets_ptr_++; 00713 } 00714 *dst = 0; 00715 return (dst == orig_dst) ? NULL : orig_dst; 00716 } 00717 00718 private: 00719 const char *memory_; 00720 const char *fgets_ptr_; 00721 const int mem_size_; 00722 }; 00723 00724 bool UNICHARSET::load_from_inmemory_file( 00725 const char *memory, int mem_size, bool skip_fragments) { 00726 InMemoryFilePointer mem_fp(memory, mem_size); 00727 TessResultCallback2<char *, char *, int> *fgets_cb = 00728 NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets); 00729 bool success = load_via_fgets(fgets_cb, skip_fragments); 00730 delete fgets_cb; 00731 return success; 00732 } 00733 00734 class LocalFilePointer { 00735 public: 00736 LocalFilePointer(FILE *stream) : fp_(stream) {} 00737 char *fgets(char *dst, int size) { 00738 return ::fgets(dst, size, fp_); 00739 } 00740 private: 00741 FILE *fp_; 00742 }; 00743 00744 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) { 00745 LocalFilePointer lfp(file); 00746 TessResultCallback2<char *, char *, int> *fgets_cb = 00747 NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets); 00748 bool success = load_via_fgets(fgets_cb, skip_fragments); 00749 delete fgets_cb; 00750 return success; 00751 } 00752 00753 bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) { 00754 TessResultCallback2<char *, char *, int> *fgets_cb = 00755 NewPermanentTessCallback(file, &tesseract::TFile::FGets); 00756 bool success = load_via_fgets(fgets_cb, skip_fragments); 00757 delete fgets_cb; 00758 return success; 00759 } 00760 00761 bool UNICHARSET::load_via_fgets( 00762 TessResultCallback2<char *, char *, int> *fgets_cb, 00763 bool skip_fragments) { 00764 int unicharset_size; 00765 char buffer[256]; 00766 00767 this->clear(); 00768 if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL || 00769 sscanf(buffer, "%d", &unicharset_size) != 1) { 00770 return false; 00771 } 00772 this->reserve(unicharset_size); 00773 for (UNICHAR_ID id = 0; id < unicharset_size; ++id) { 00774 char unichar[256]; 00775 unsigned int properties; 00776 char script[64]; 00777 00778 strcpy(script, null_script); 00779 int min_bottom = 0; 00780 int max_bottom = MAX_UINT8; 00781 int min_top = 0; 00782 int max_top = MAX_UINT8; 00783 float width = 0.0f; 00784 float width_sd = 0.0f; 00785 float bearing = 0.0f; 00786 float bearing_sd = 0.0f; 00787 float advance = 0.0f; 00788 float advance_sd = 0.0f; 00789 // TODO(eger): check that this default it ok 00790 // after enabling BiDi iterator for Arabic+Cube. 00791 int direction = UNICHARSET::U_LEFT_TO_RIGHT; 00792 UNICHAR_ID other_case = id; 00793 UNICHAR_ID mirror = id; 00794 char normed[64]; 00795 int v = -1; 00796 if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL || 00797 ((v = sscanf(buffer, 00798 "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d %63s", 00799 unichar, &properties, 00800 &min_bottom, &max_bottom, &min_top, &max_top, 00801 &width, &width_sd, &bearing, &bearing_sd, 00802 &advance, &advance_sd, script, &other_case, 00803 &direction, &mirror, normed)) != 17 && 00804 (v = sscanf(buffer, 00805 "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d", 00806 unichar, &properties, 00807 &min_bottom, &max_bottom, &min_top, &max_top, 00808 &width, &width_sd, &bearing, &bearing_sd, 00809 &advance, &advance_sd, script, &other_case, 00810 &direction, &mirror)) != 16 && 00811 (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d", 00812 unichar, &properties, 00813 &min_bottom, &max_bottom, &min_top, &max_top, 00814 script, &other_case, &direction, &mirror)) != 10 && 00815 (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties, 00816 &min_bottom, &max_bottom, &min_top, &max_top, 00817 script, &other_case)) != 8 && 00818 (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties, 00819 script, &other_case)) != 4 && 00820 (v = sscanf(buffer, "%s %x %63s", 00821 unichar, &properties, script)) != 3 && 00822 (v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) { 00823 return false; 00824 } 00825 00826 // Skip fragments if needed. 00827 CHAR_FRAGMENT *frag = NULL; 00828 if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) { 00829 int num_pieces = frag->get_total(); 00830 delete frag; 00831 // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in. 00832 if (num_pieces > 1) 00833 continue; 00834 } 00835 // Insert unichar into unicharset and set its properties. 00836 if (strcmp(unichar, "NULL") == 0) 00837 this->unichar_insert(" "); 00838 else 00839 this->unichar_insert(unichar); 00840 00841 this->set_isalpha(id, properties & ISALPHA_MASK); 00842 this->set_islower(id, properties & ISLOWER_MASK); 00843 this->set_isupper(id, properties & ISUPPER_MASK); 00844 this->set_isdigit(id, properties & ISDIGIT_MASK); 00845 this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK); 00846 this->set_isngram(id, false); 00847 this->set_script(id, script); 00848 this->unichars[id].properties.enabled = true; 00849 this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top); 00850 this->set_width_stats(id, width, width_sd); 00851 this->set_bearing_stats(id, bearing, bearing_sd); 00852 this->set_advance_stats(id, advance, advance_sd); 00853 this->set_direction(id, static_cast<UNICHARSET::Direction>(direction)); 00854 ASSERT_HOST(other_case < unicharset_size); 00855 this->set_other_case(id, (v>3) ? other_case : id); 00856 ASSERT_HOST(mirror < unicharset_size); 00857 this->set_mirror(id, (v>8) ? mirror : id); 00858 this->set_normed(id, (v>16) ? normed : unichar); 00859 } 00860 post_load_setup(); 00861 return true; 00862 } 00863 00864 // Sets up internal data after loading the file, based on the char 00865 // properties. Called from load_from_file, but also needs to be run 00866 // during set_unicharset_properties. 00867 void UNICHARSET::post_load_setup() { 00868 // Number of alpha chars with the case property minus those without, 00869 // in order to determine that half the alpha chars have case. 00870 int net_case_alphas = 0; 00871 int x_height_alphas = 0; 00872 int cap_height_alphas = 0; 00873 top_bottom_set_ = false; 00874 for (UNICHAR_ID id = 0; id < size_used; ++id) { 00875 int min_bottom = 0; 00876 int max_bottom = MAX_UINT8; 00877 int min_top = 0; 00878 int max_top = MAX_UINT8; 00879 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top); 00880 if (min_top > 0) 00881 top_bottom_set_ = true; 00882 if (get_isalpha(id)) { 00883 if (get_islower(id) || get_isupper(id)) 00884 ++net_case_alphas; 00885 else 00886 --net_case_alphas; 00887 if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) 00888 ++x_height_alphas; 00889 else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) 00890 ++cap_height_alphas; 00891 } 00892 set_normed_ids(id); 00893 } 00894 00895 script_has_upper_lower_ = net_case_alphas > 0; 00896 script_has_xheight_ = script_has_upper_lower_ || 00897 (x_height_alphas > cap_height_alphas * kMinXHeightFraction && 00898 cap_height_alphas > x_height_alphas * kMinCapHeightFraction); 00899 00900 null_sid_ = get_script_id_from_name(null_script); 00901 ASSERT_HOST(null_sid_ == 0); 00902 common_sid_ = get_script_id_from_name("Common"); 00903 latin_sid_ = get_script_id_from_name("Latin"); 00904 cyrillic_sid_ = get_script_id_from_name("Cyrillic"); 00905 greek_sid_ = get_script_id_from_name("Greek"); 00906 han_sid_ = get_script_id_from_name("Han"); 00907 hiragana_sid_ = get_script_id_from_name("Hiragana"); 00908 katakana_sid_ = get_script_id_from_name("Katakana"); 00909 00910 // Compute default script. Use the highest-counting alpha script, that is 00911 // not the common script, as that still contains some "alphas". 00912 int* script_counts = new int[script_table_size_used]; 00913 memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used); 00914 for (int id = 0; id < size_used; ++id) { 00915 if (get_isalpha(id)) { 00916 ++script_counts[get_script(id)]; 00917 } 00918 } 00919 default_sid_ = 0; 00920 for (int s = 1; s < script_table_size_used; ++s) { 00921 if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) 00922 default_sid_ = s; 00923 } 00924 delete [] script_counts; 00925 } 00926 00927 // Returns true if right_to_left scripts are significant in the unicharset, 00928 // but without being so sensitive that "universal" unicharsets containing 00929 // characters from many scripts, like orientation and script detection, 00930 // look like they are right_to_left. 00931 bool UNICHARSET::major_right_to_left() const { 00932 int ltr_count = 0; 00933 int rtl_count = 0; 00934 for (int id = 0; id < size_used; ++id) { 00935 int dir = get_direction(id); 00936 if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++; 00937 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || 00938 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC || 00939 dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++; 00940 } 00941 return rtl_count > ltr_count; 00942 } 00943 00944 // Set a whitelist and/or blacklist of characters to recognize. 00945 // An empty or NULL whitelist enables everything (minus any blacklist). 00946 // An empty or NULL blacklist disables nothing. 00947 // An empty or NULL blacklist has no effect. 00948 void UNICHARSET::set_black_and_whitelist(const char* blacklist, 00949 const char* whitelist, 00950 const char* unblacklist) { 00951 bool def_enabled = whitelist == NULL || whitelist[0] == '\0'; 00952 // Set everything to default 00953 for (int ch = 0; ch < size_used; ++ch) 00954 unichars[ch].properties.enabled = def_enabled; 00955 if (!def_enabled) { 00956 // Enable the whitelist. 00957 GenericVector<UNICHAR_ID> encoding; 00958 encode_string(whitelist, false, &encoding, NULL, NULL); 00959 for (int i = 0; i < encoding.size(); ++i) { 00960 if (encoding[i] != INVALID_UNICHAR_ID) 00961 unichars[encoding[i]].properties.enabled = true; 00962 } 00963 } 00964 if (blacklist != NULL && blacklist[0] != '\0') { 00965 // Disable the blacklist. 00966 GenericVector<UNICHAR_ID> encoding; 00967 encode_string(blacklist, false, &encoding, NULL, NULL); 00968 for (int i = 0; i < encoding.size(); ++i) { 00969 if (encoding[i] != INVALID_UNICHAR_ID) 00970 unichars[encoding[i]].properties.enabled = false; 00971 } 00972 } 00973 if (unblacklist != NULL && unblacklist[0] != '\0') { 00974 // Re-enable the unblacklist. 00975 GenericVector<UNICHAR_ID> encoding; 00976 encode_string(unblacklist, false, &encoding, NULL, NULL); 00977 for (int i = 0; i < encoding.size(); ++i) { 00978 if (encoding[i] != INVALID_UNICHAR_ID) 00979 unichars[encoding[i]].properties.enabled = true; 00980 } 00981 } 00982 } 00983 00984 // Returns true if there are any repeated unicodes in the normalized 00985 // text of any unichar-id in the unicharset. 00986 bool UNICHARSET::AnyRepeatedUnicodes() const { 00987 int start_id = 0; 00988 if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT; 00989 for (int id = start_id; id < size_used; ++id) { 00990 // Convert to unicodes. 00991 GenericVector<int> unicodes; 00992 if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) && 00993 unicodes.size() > 1) { 00994 for (int u = 1; u < unicodes.size(); ++u) { 00995 if (unicodes[u - 1] == unicodes[u]) return true; 00996 } 00997 } 00998 } 00999 return false; 01000 } 01001 01002 int UNICHARSET::add_script(const char* script) { 01003 for (int i = 0; i < script_table_size_used; ++i) { 01004 if (strcmp(script, script_table[i]) == 0) 01005 return i; 01006 } 01007 if (script_table_size_reserved == 0) { 01008 script_table_size_reserved = 8; 01009 script_table = new char*[script_table_size_reserved]; 01010 } 01011 if (script_table_size_used + 1 >= script_table_size_reserved) { 01012 char** new_script_table = new char*[script_table_size_reserved * 2]; 01013 memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*)); 01014 delete[] script_table; 01015 script_table = new_script_table; 01016 script_table_size_reserved = 2 * script_table_size_reserved; 01017 } 01018 script_table[script_table_size_used] = new char[strlen(script) + 1]; 01019 strcpy(script_table[script_table_size_used], script); 01020 return script_table_size_used++; 01021 } 01022 01023 // Returns the string that represents a fragment 01024 // with the given unichar, pos and total. 01025 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total, 01026 bool natural) { 01027 if (total == 1) return STRING(unichar); 01028 STRING result = ""; 01029 result += kSeparator; 01030 result += unichar; 01031 char buffer[kMaxLen]; 01032 snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, 01033 natural ? kNaturalFlag : kSeparator, total); 01034 result += buffer; 01035 return result; 01036 } 01037 01038 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) { 01039 const char *ptr = string; 01040 int len = strlen(string); 01041 if (len < kMinLen || *ptr != kSeparator) { 01042 return NULL; // this string can not represent a fragment 01043 } 01044 ptr++; // move to the next character 01045 int step = 0; 01046 while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) { 01047 step += UNICHAR::utf8_step(ptr + step); 01048 } 01049 if (step == 0 || step > UNICHAR_LEN) { 01050 return NULL; // no character for unichar or the character is too long 01051 } 01052 char unichar[UNICHAR_LEN + 1]; 01053 strncpy(unichar, ptr, step); 01054 unichar[step] = '\0'; // null terminate unichar 01055 ptr += step; // move to the next fragment separator 01056 int pos = 0; 01057 int total = 0; 01058 bool natural = false; 01059 char *end_ptr = NULL; 01060 for (int i = 0; i < 2; i++) { 01061 if (ptr > string + len || *ptr != kSeparator) { 01062 if (i == 1 && *ptr == kNaturalFlag) 01063 natural = true; 01064 else 01065 return NULL; // Failed to parse fragment representation. 01066 } 01067 ptr++; // move to the next character 01068 i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10)) 01069 : total = static_cast<int>(strtol(ptr, &end_ptr, 10)); 01070 ptr = end_ptr; 01071 } 01072 if (ptr != string + len) { 01073 return NULL; // malformed fragment representation 01074 } 01075 CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT(); 01076 fragment->set_all(unichar, pos, total, natural); 01077 return fragment; 01078 } 01079 01080 int UNICHARSET::get_script_id_from_name(const char* script_name) const { 01081 for (int i = 0; i < script_table_size_used; ++i) { 01082 if (strcmp(script_name, script_table[i]) == 0) 01083 return i; 01084 } 01085 return 0; // 0 is always the null_script 01086 }