|
tesseract 3.04.01
|
00001 00002 // File: unicharset.h 00003 // Description: Unicode character/ligature set class. 00004 // Author: Thomas Kielbus 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__ 00021 #define TESSERACT_CCUTIL_UNICHARSET_H__ 00022 00023 #include "errcode.h" 00024 #include "genericvector.h" 00025 #include "helpers.h" 00026 #include "serialis.h" 00027 #include "strngs.h" 00028 #include "tesscallback.h" 00029 #include "unichar.h" 00030 #include "unicharmap.h" 00031 00032 // Enum holding special values of unichar_id. Every unicharset has these. 00033 // Warning! Keep in sync with kSpecialUnicharCodes. 00034 enum SpecialUnicharCodes { 00035 UNICHAR_SPACE, 00036 UNICHAR_JOINED, 00037 UNICHAR_BROKEN, 00038 00039 SPECIAL_UNICHAR_CODES_COUNT 00040 }; 00041 00042 class CHAR_FRAGMENT { 00043 public: 00044 // Minimum number of characters used for fragment representation. 00045 static const int kMinLen = 6; 00046 // Maximum number of characters used for fragment representation. 00047 static const int kMaxLen = 3 + UNICHAR_LEN + 2; 00048 // Maximum number of fragments per character. 00049 static const int kMaxChunks = 5; 00050 00051 // Setters and Getters. 00052 inline void set_all(const char *unichar, int pos, int total, bool natural) { 00053 set_unichar(unichar); 00054 set_pos(pos); 00055 set_total(total); 00056 set_natural(natural); 00057 } 00058 inline void set_unichar(const char *uch) { 00059 strncpy(this->unichar, uch, UNICHAR_LEN); 00060 this->unichar[UNICHAR_LEN] = '\0'; 00061 } 00062 inline void set_pos(int p) { this->pos = p; } 00063 inline void set_total(int t) { this->total = t; } 00064 inline const char* get_unichar() const { return this->unichar; } 00065 inline int get_pos() const { return this->pos; } 00066 inline int get_total() const { return this->total; } 00067 00068 // Returns the string that represents a fragment 00069 // with the given unichar, pos and total. 00070 static STRING to_string(const char *unichar, int pos, int total, 00071 bool natural); 00072 // Returns the string that represents this fragment. 00073 STRING to_string() const { 00074 return to_string(unichar, pos, total, natural); 00075 } 00076 00077 // Checks whether a fragment has the same unichar, 00078 // position and total as the given inputs. 00079 inline bool equals(const char *other_unichar, 00080 int other_pos, int other_total) const { 00081 return (strcmp(this->unichar, other_unichar) == 0 && 00082 this->pos == other_pos && this->total == other_total); 00083 } 00084 inline bool equals(const CHAR_FRAGMENT *other) const { 00085 return this->equals(other->get_unichar(), 00086 other->get_pos(), 00087 other->get_total()); 00088 } 00089 00090 // Checks whether a given fragment is a continuation of this fragment. 00091 // Assumes that the given fragment pointer is not NULL. 00092 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { 00093 return (strcmp(this->unichar, fragment->get_unichar()) == 0 && 00094 this->total == fragment->get_total() && 00095 this->pos == fragment->get_pos() + 1); 00096 } 00097 00098 // Returns true if this fragment is a beginning fragment. 00099 inline bool is_beginning() const { return this->pos == 0; } 00100 00101 // Returns true if this fragment is an ending fragment. 00102 inline bool is_ending() const { return this->pos == this->total-1; } 00103 00104 // Returns true if the fragment was a separate component to begin with, 00105 // ie did not need chopping to be isolated, but may have been separated 00106 // out from a multi-outline blob. 00107 inline bool is_natural() const { return natural; } 00108 void set_natural(bool value) { natural = value; } 00109 00110 // Parses the string to see whether it represents a character fragment 00111 // (rather than a regular character). If so, allocates memory for a new 00112 // CHAR_FRAGMENT instance and fills it in with the corresponding fragment 00113 // information. Fragments are of the form: 00114 // |m|1|2, meaning chunk 1 of 2 of character m, or 00115 // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed 00116 // to divide the parts, as they were already separate connected components. 00117 // 00118 // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT 00119 // instance, otherwise (if the string does not represent a fragment or it 00120 // looks like it does, but parsing it as a fragment fails) returns NULL. 00121 // 00122 // Note: The caller is responsible for deallocating memory 00123 // associated with the returned pointer. 00124 static CHAR_FRAGMENT *parse_from_string(const char *str); 00125 00126 private: 00127 char unichar[UNICHAR_LEN + 1]; 00128 // True if the fragment was a separate component to begin with, 00129 // ie did not need chopping to be isolated, but may have been separated 00130 // out from a multi-outline blob. 00131 bool natural; 00132 inT16 pos; // fragment position in the character 00133 inT16 total; // total number of fragments in the character 00134 }; 00135 00136 // The UNICHARSET class is an utility class for Tesseract that holds the 00137 // set of characters that are used by the engine. Each character is identified 00138 // by a unique number, from 0 to (size - 1). 00139 class UNICHARSET { 00140 public: 00141 // Custom list of characters and their ligature forms (UTF8) 00142 // These map to unicode values in the private use area (PUC) and are supported 00143 // by only few font families (eg. Wyld, Adobe Caslon Pro). 00144 static const char* kCustomLigatures[][2]; 00145 00146 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum. 00147 static const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]; 00148 00149 // ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h) 00150 enum Direction { 00151 U_LEFT_TO_RIGHT = 0, 00152 U_RIGHT_TO_LEFT = 1, 00153 U_EUROPEAN_NUMBER = 2, 00154 U_EUROPEAN_NUMBER_SEPARATOR = 3, 00155 U_EUROPEAN_NUMBER_TERMINATOR = 4, 00156 U_ARABIC_NUMBER = 5, 00157 U_COMMON_NUMBER_SEPARATOR = 6, 00158 U_BLOCK_SEPARATOR = 7, 00159 U_SEGMENT_SEPARATOR = 8, 00160 U_WHITE_SPACE_NEUTRAL = 9, 00161 U_OTHER_NEUTRAL = 10, 00162 U_LEFT_TO_RIGHT_EMBEDDING = 11, 00163 U_LEFT_TO_RIGHT_OVERRIDE = 12, 00164 U_RIGHT_TO_LEFT_ARABIC = 13, 00165 U_RIGHT_TO_LEFT_EMBEDDING = 14, 00166 U_RIGHT_TO_LEFT_OVERRIDE = 15, 00167 U_POP_DIRECTIONAL_FORMAT = 16, 00168 U_DIR_NON_SPACING_MARK = 17, 00169 U_BOUNDARY_NEUTRAL = 18, 00170 U_CHAR_DIRECTION_COUNT 00171 }; 00172 00173 // Create an empty UNICHARSET 00174 UNICHARSET(); 00175 00176 ~UNICHARSET(); 00177 00178 // Return the UNICHAR_ID of a given unichar representation within the 00179 // UNICHARSET. 00180 UNICHAR_ID unichar_to_id(const char* const unichar_repr) const; 00181 00182 // Return the UNICHAR_ID of a given unichar representation within the 00183 // UNICHARSET. Only the first length characters from unichar_repr are used. 00184 UNICHAR_ID unichar_to_id(const char* const unichar_repr, 00185 int length) const; 00186 00187 // Return the minimum number of bytes that matches a legal UNICHAR_ID, 00188 // while leaving the rest of the string encodable. Returns 0 if the 00189 // beginning of the string is not encodable. 00190 // WARNING: this function now encodes the whole string for precision. 00191 // Use encode_string in preference to repeatedly calling step. 00192 int step(const char* str) const; 00193 00194 // Return whether the given UTF-8 string is encodable with this UNICHARSET. 00195 // If not encodable, write the first byte offset which cannot be converted 00196 // into the second (return) argument. 00197 bool encodable_string(const char *str, int *first_bad_position) const; 00198 00199 // Encodes the given UTF-8 string with this UNICHARSET. 00200 // Any part of the string that cannot be encoded (because the utf8 can't 00201 // be broken up into pieces that are in the unicharset) then: 00202 // if give_up_on_failure, stops and returns a partial encoding, 00203 // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding. 00204 // Returns true if the encoding succeeds completely, false if there is at 00205 // least one failure. 00206 // If lengths is not NULL, then it is filled with the corresponding 00207 // byte length of each encoded UNICHAR_ID. 00208 // If encoded_length is not NULL then on return it contains the length of 00209 // str that was encoded. (if give_up_on_failure the location of the first 00210 // failure, otherwise strlen(str).) 00211 bool encode_string(const char* str, bool give_up_on_failure, 00212 GenericVector<UNICHAR_ID>* encoding, 00213 GenericVector<char>* lengths, 00214 int* encoded_length) const; 00215 00216 // Return the unichar representation corresponding to the given UNICHAR_ID 00217 // within the UNICHARSET. 00218 const char* id_to_unichar(UNICHAR_ID id) const; 00219 00220 // Return the UTF8 representation corresponding to the given UNICHAR_ID after 00221 // resolving any private encodings internal to Tesseract. This method is 00222 // preferable to id_to_unichar for outputting text that will be visible to 00223 // external applications. 00224 const char* id_to_unichar_ext(UNICHAR_ID id) const; 00225 00226 // Return a STRING that reformats the utf8 str into the str followed 00227 // by its hex unicodes. 00228 static STRING debug_utf8_str(const char* str); 00229 00230 // Return a STRING containing debug information on the unichar, including 00231 // the id_to_unichar, its hex unicodes and the properties. 00232 STRING debug_str(UNICHAR_ID id) const; 00233 STRING debug_str(const char * unichar_repr) const { 00234 return debug_str(unichar_to_id(unichar_repr)); 00235 } 00236 00237 // Add a unichar representation to the set. 00238 void unichar_insert(const char* const unichar_repr); 00239 00240 // Return true if the given unichar id exists within the set. 00241 // Relies on the fact that unichar ids are contiguous in the unicharset. 00242 bool contains_unichar_id(UNICHAR_ID unichar_id) const { 00243 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used && 00244 unichar_id >= 0; 00245 } 00246 00247 // Return true if the given unichar representation exists within the set. 00248 bool contains_unichar(const char* const unichar_repr) const; 00249 bool contains_unichar(const char* const unichar_repr, int length) const; 00250 00251 // Return true if the given unichar representation corresponds to the given 00252 // UNICHAR_ID within the set. 00253 bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const; 00254 00255 // Delete CHAR_FRAGMENTs stored in properties of unichars array. 00256 void delete_pointers_in_unichars() { 00257 for (int i = 0; i < size_used; ++i) { 00258 if (unichars[i].properties.fragment != NULL) { 00259 delete unichars[i].properties.fragment; 00260 unichars[i].properties.fragment = NULL; 00261 } 00262 } 00263 } 00264 00265 // Clear the UNICHARSET (all the previous data is lost). 00266 void clear() { 00267 if (script_table != NULL) { 00268 for (int i = 0; i < script_table_size_used; ++i) 00269 delete[] script_table[i]; 00270 delete[] script_table; 00271 script_table = NULL; 00272 script_table_size_used = 0; 00273 } 00274 if (unichars != NULL) { 00275 delete_pointers_in_unichars(); 00276 delete[] unichars; 00277 unichars = NULL; 00278 } 00279 script_table_size_reserved = 0; 00280 size_reserved = 0; 00281 size_used = 0; 00282 ids.clear(); 00283 top_bottom_set_ = false; 00284 script_has_upper_lower_ = false; 00285 script_has_xheight_ = false; 00286 null_sid_ = 0; 00287 common_sid_ = 0; 00288 latin_sid_ = 0; 00289 cyrillic_sid_ = 0; 00290 greek_sid_ = 0; 00291 han_sid_ = 0; 00292 hiragana_sid_ = 0; 00293 katakana_sid_ = 0; 00294 } 00295 00296 // Return the size of the set (the number of different UNICHAR it holds). 00297 int size() const { 00298 return size_used; 00299 } 00300 00301 // Reserve enough memory space for the given number of UNICHARS 00302 void reserve(int unichars_number); 00303 00304 // Opens the file indicated by filename and saves unicharset to that file. 00305 // Returns true if the operation is successful. 00306 bool save_to_file(const char * const filename) const { 00307 FILE* file = fopen(filename, "w+b"); 00308 if (file == NULL) return false; 00309 bool result = save_to_file(file); 00310 fclose(file); 00311 return result; 00312 } 00313 00314 // Saves the content of the UNICHARSET to the given file. 00315 // Returns true if the operation is successful. 00316 bool save_to_file(FILE *file) const { 00317 STRING str; 00318 if (!save_to_string(&str)) return false; 00319 if (fwrite(&str[0], str.length(), 1, file) != 1) return false; 00320 return true; 00321 } 00322 bool save_to_file(tesseract::TFile *file) const { 00323 STRING str; 00324 if (!save_to_string(&str)) return false; 00325 if (file->FWrite(&str[0], str.length(), 1) != 1) return false; 00326 return true; 00327 } 00328 00329 // Saves the content of the UNICHARSET to the given STRING. 00330 // Returns true if the operation is successful. 00331 bool save_to_string(STRING *str) const; 00332 00333 // Load a unicharset from a unicharset file that has been loaded into 00334 // the given memory buffer. 00335 // Returns true if the operation is successful. 00336 bool load_from_inmemory_file(const char* const memory, int mem_size, 00337 bool skip_fragments); 00338 // Returns true if the operation is successful. 00339 bool load_from_inmemory_file(const char* const memory, int mem_size) { 00340 return load_from_inmemory_file(memory, mem_size, false); 00341 } 00342 00343 // Opens the file indicated by filename and loads the UNICHARSET 00344 // from the given file. The previous data is lost. 00345 // Returns true if the operation is successful. 00346 bool load_from_file(const char* const filename, bool skip_fragments) { 00347 FILE* file = fopen(filename, "rb"); 00348 if (file == NULL) return false; 00349 bool result = load_from_file(file, skip_fragments); 00350 fclose(file); 00351 return result; 00352 } 00353 // returns true if the operation is successful. 00354 bool load_from_file(const char* const filename) { 00355 return load_from_file(filename, false); 00356 } 00357 00358 // Loads the UNICHARSET from the given file. The previous data is lost. 00359 // Returns true if the operation is successful. 00360 bool load_from_file(FILE *file, bool skip_fragments); 00361 bool load_from_file(FILE *file) { return load_from_file(file, false); } 00362 bool load_from_file(tesseract::TFile *file, bool skip_fragments); 00363 00364 00365 // Sets up internal data after loading the file, based on the char 00366 // properties. Called from load_from_file, but also needs to be run 00367 // during set_unicharset_properties. 00368 void post_load_setup(); 00369 00370 // Returns true if right_to_left scripts are significant in the unicharset, 00371 // but without being so sensitive that "universal" unicharsets containing 00372 // characters from many scripts, like orientation and script detection, 00373 // look like they are right_to_left. 00374 bool major_right_to_left() const; 00375 00376 // Set a whitelist and/or blacklist of characters to recognize. 00377 // An empty or NULL whitelist enables everything (minus any blacklist). 00378 // An empty or NULL blacklist disables nothing. 00379 // An empty or NULL unblacklist has no effect. 00380 // The blacklist overrides the whitelist. 00381 // The unblacklist overrides the blacklist. 00382 // Each list is a string of utf8 character strings. Boundaries between 00383 // unicharset units are worked out automatically, and characters not in 00384 // the unicharset are silently ignored. 00385 void set_black_and_whitelist(const char* blacklist, const char* whitelist, 00386 const char* unblacklist); 00387 00388 // Set the isalpha property of the given unichar to the given value. 00389 void set_isalpha(UNICHAR_ID unichar_id, bool value) { 00390 unichars[unichar_id].properties.isalpha = value; 00391 } 00392 00393 // Set the islower property of the given unichar to the given value. 00394 void set_islower(UNICHAR_ID unichar_id, bool value) { 00395 unichars[unichar_id].properties.islower = value; 00396 } 00397 00398 // Set the isupper property of the given unichar to the given value. 00399 void set_isupper(UNICHAR_ID unichar_id, bool value) { 00400 unichars[unichar_id].properties.isupper = value; 00401 } 00402 00403 // Set the isdigit property of the given unichar to the given value. 00404 void set_isdigit(UNICHAR_ID unichar_id, bool value) { 00405 unichars[unichar_id].properties.isdigit = value; 00406 } 00407 00408 // Set the ispunctuation property of the given unichar to the given value. 00409 void set_ispunctuation(UNICHAR_ID unichar_id, bool value) { 00410 unichars[unichar_id].properties.ispunctuation = value; 00411 } 00412 00413 // Set the isngram property of the given unichar to the given value. 00414 void set_isngram(UNICHAR_ID unichar_id, bool value) { 00415 unichars[unichar_id].properties.isngram = value; 00416 } 00417 00418 // Set the script name of the given unichar to the given value. 00419 // Value is copied and thus can be a temporary; 00420 void set_script(UNICHAR_ID unichar_id, const char* value) { 00421 unichars[unichar_id].properties.script_id = add_script(value); 00422 } 00423 00424 // Set other_case unichar id in the properties for the given unichar id. 00425 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) { 00426 unichars[unichar_id].properties.other_case = other_case; 00427 } 00428 00429 // Set the direction property of the given unichar to the given value. 00430 void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) { 00431 unichars[unichar_id].properties.direction = value; 00432 } 00433 00434 // Set mirror unichar id in the properties for the given unichar id. 00435 void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) { 00436 unichars[unichar_id].properties.mirror = mirror; 00437 } 00438 00439 // Record normalized version of unichar with the given unichar_id. 00440 void set_normed(UNICHAR_ID unichar_id, const char* normed) { 00441 unichars[unichar_id].properties.normed = normed; 00442 unichars[unichar_id].properties.normed_ids.truncate(0); 00443 } 00444 // Sets the normed_ids vector from the normed string. normed_ids is not 00445 // stored in the file, and needs to be set when the UNICHARSET is loaded. 00446 void set_normed_ids(UNICHAR_ID unichar_id); 00447 00448 // Return the isalpha property of the given unichar. 00449 bool get_isalpha(UNICHAR_ID unichar_id) const { 00450 if (INVALID_UNICHAR_ID == unichar_id) return false; 00451 ASSERT_HOST(contains_unichar_id(unichar_id)); 00452 return unichars[unichar_id].properties.isalpha; 00453 } 00454 00455 // Return the islower property of the given unichar. 00456 bool get_islower(UNICHAR_ID unichar_id) const { 00457 if (INVALID_UNICHAR_ID == unichar_id) return false; 00458 ASSERT_HOST(contains_unichar_id(unichar_id)); 00459 return unichars[unichar_id].properties.islower; 00460 } 00461 00462 // Return the isupper property of the given unichar. 00463 bool get_isupper(UNICHAR_ID unichar_id) const { 00464 if (INVALID_UNICHAR_ID == unichar_id) return false; 00465 ASSERT_HOST(contains_unichar_id(unichar_id)); 00466 return unichars[unichar_id].properties.isupper; 00467 } 00468 00469 // Return the isdigit property of the given unichar. 00470 bool get_isdigit(UNICHAR_ID unichar_id) const { 00471 if (INVALID_UNICHAR_ID == unichar_id) return false; 00472 ASSERT_HOST(contains_unichar_id(unichar_id)); 00473 return unichars[unichar_id].properties.isdigit; 00474 } 00475 00476 // Return the ispunctuation property of the given unichar. 00477 bool get_ispunctuation(UNICHAR_ID unichar_id) const { 00478 if (INVALID_UNICHAR_ID == unichar_id) return false; 00479 ASSERT_HOST(contains_unichar_id(unichar_id)); 00480 return unichars[unichar_id].properties.ispunctuation; 00481 } 00482 00483 // Return the isngram property of the given unichar. 00484 bool get_isngram(UNICHAR_ID unichar_id) const { 00485 if (INVALID_UNICHAR_ID == unichar_id) return false; 00486 ASSERT_HOST(contains_unichar_id(unichar_id)); 00487 return unichars[unichar_id].properties.isngram; 00488 } 00489 00490 // Returns whether the unichar id represents a unicode value in the private 00491 // use area. 00492 bool get_isprivate(UNICHAR_ID unichar_id) const; 00493 00494 // Returns true if the ids have useful min/max top/bottom values. 00495 bool top_bottom_useful() const { 00496 return top_bottom_set_; 00497 } 00498 // Sets all ranges to empty, so they can be expanded to set the values. 00499 void set_ranges_empty(); 00500 // Sets all the properties for this unicharset given a src_unicharset with 00501 // everything set. The unicharsets don't have to be the same, and graphemes 00502 // are correctly accounted for. 00503 void SetPropertiesFromOther(const UNICHARSET& src) { 00504 PartialSetPropertiesFromOther(0, src); 00505 } 00506 // Sets properties from Other, starting only at the given index. 00507 void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src); 00508 // Expands the tops and bottoms and widths for this unicharset given a 00509 // src_unicharset with ranges in it. The unicharsets don't have to be the 00510 // same, and graphemes are correctly accounted for. 00511 void ExpandRangesFromOther(const UNICHARSET& src); 00512 // Makes this a copy of src. Clears this completely first, so the automattic 00513 // ids will not be present in this if not in src. 00514 void CopyFrom(const UNICHARSET& src); 00515 // For each id in src, if it does not occur in this, add it, as in 00516 // SetPropertiesFromOther, otherwise expand the ranges, as in 00517 // ExpandRangesFromOther. 00518 void AppendOtherUnicharset(const UNICHARSET& src); 00519 // Returns true if the acceptable ranges of the tops of the characters do 00520 // not overlap, making their x-height calculations distinct. 00521 bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const; 00522 // Returns the min and max bottom and top of the given unichar in 00523 // baseline-normalized coordinates, ie, where the baseline is 00524 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight 00525 // (See normalis.h for the definitions). 00526 void get_top_bottom(UNICHAR_ID unichar_id, 00527 int* min_bottom, int* max_bottom, 00528 int* min_top, int* max_top) const { 00529 if (INVALID_UNICHAR_ID == unichar_id) { 00530 *min_bottom = *min_top = 0; 00531 *max_bottom = *max_top = 256; // kBlnCellHeight 00532 return; 00533 } 00534 ASSERT_HOST(contains_unichar_id(unichar_id)); 00535 *min_bottom = unichars[unichar_id].properties.min_bottom; 00536 *max_bottom = unichars[unichar_id].properties.max_bottom; 00537 *min_top = unichars[unichar_id].properties.min_top; 00538 *max_top = unichars[unichar_id].properties.max_top; 00539 } 00540 void set_top_bottom(UNICHAR_ID unichar_id, 00541 int min_bottom, int max_bottom, 00542 int min_top, int max_top) { 00543 unichars[unichar_id].properties.min_bottom = 00544 static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8)); 00545 unichars[unichar_id].properties.max_bottom = 00546 static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8)); 00547 unichars[unichar_id].properties.min_top = 00548 static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8)); 00549 unichars[unichar_id].properties.max_top = 00550 static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8)); 00551 } 00552 // Returns the width stats (as mean, sd) of the given unichar relative to the 00553 // median advance of all characters in the character set. 00554 void get_width_stats(UNICHAR_ID unichar_id, 00555 float* width, float* width_sd) const { 00556 if (INVALID_UNICHAR_ID == unichar_id) { 00557 *width = 0.0f; 00558 *width_sd = 0.0f;; 00559 return; 00560 } 00561 ASSERT_HOST(contains_unichar_id(unichar_id)); 00562 *width = unichars[unichar_id].properties.width; 00563 *width_sd = unichars[unichar_id].properties.width_sd; 00564 } 00565 void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) { 00566 unichars[unichar_id].properties.width = width; 00567 unichars[unichar_id].properties.width_sd = width_sd; 00568 } 00569 // Returns the stats of the x-bearing (as mean, sd) of the given unichar 00570 // relative to the median advance of all characters in the character set. 00571 void get_bearing_stats(UNICHAR_ID unichar_id, 00572 float* bearing, float* bearing_sd) const { 00573 if (INVALID_UNICHAR_ID == unichar_id) { 00574 *bearing = *bearing_sd = 0.0f; 00575 return; 00576 } 00577 ASSERT_HOST(contains_unichar_id(unichar_id)); 00578 *bearing = unichars[unichar_id].properties.bearing; 00579 *bearing_sd = unichars[unichar_id].properties.bearing_sd; 00580 } 00581 void set_bearing_stats(UNICHAR_ID unichar_id, 00582 float bearing, float bearing_sd) { 00583 unichars[unichar_id].properties.bearing = bearing; 00584 unichars[unichar_id].properties.bearing_sd = bearing_sd; 00585 } 00586 // Returns the stats of the x-advance of the given unichar (as mean, sd) 00587 // relative to the median advance of all characters in the character set. 00588 void get_advance_stats(UNICHAR_ID unichar_id, 00589 float* advance, float* advance_sd) const { 00590 if (INVALID_UNICHAR_ID == unichar_id) { 00591 *advance = *advance_sd = 0; 00592 return; 00593 } 00594 ASSERT_HOST(contains_unichar_id(unichar_id)); 00595 *advance = unichars[unichar_id].properties.advance; 00596 *advance_sd = unichars[unichar_id].properties.advance_sd; 00597 } 00598 void set_advance_stats(UNICHAR_ID unichar_id, 00599 float advance, float advance_sd) { 00600 unichars[unichar_id].properties.advance = advance; 00601 unichars[unichar_id].properties.advance_sd = advance_sd; 00602 } 00603 // Returns true if the font metrics properties are empty. 00604 bool PropertiesIncomplete(UNICHAR_ID unichar_id) const { 00605 return unichars[unichar_id].properties.AnyRangeEmpty(); 00606 } 00607 00608 // Return the script name of the given unichar. 00609 // The returned pointer will always be the same for the same script, it's 00610 // managed by unicharset and thus MUST NOT be deleted 00611 int get_script(UNICHAR_ID unichar_id) const { 00612 if (INVALID_UNICHAR_ID == unichar_id) return null_sid_; 00613 ASSERT_HOST(contains_unichar_id(unichar_id)); 00614 return unichars[unichar_id].properties.script_id; 00615 } 00616 00617 // Return the character properties, eg. alpha/upper/lower/digit/punct, 00618 // as a bit field of unsigned int. 00619 unsigned int get_properties(UNICHAR_ID unichar_id) const; 00620 00621 // Return the character property as a single char. If a character has 00622 // multiple attributes, the main property is defined by the following order: 00623 // upper_case : 'A' 00624 // lower_case : 'a' 00625 // alpha : 'x' 00626 // digit : '0' 00627 // punctuation: 'p' 00628 char get_chartype(UNICHAR_ID unichar_id) const; 00629 00630 // Get other_case unichar id in the properties for the given unichar id. 00631 UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const { 00632 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00633 ASSERT_HOST(contains_unichar_id(unichar_id)); 00634 return unichars[unichar_id].properties.other_case; 00635 } 00636 00637 // Returns the direction property of the given unichar. 00638 Direction get_direction(UNICHAR_ID unichar_id) const { 00639 if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL; 00640 ASSERT_HOST(contains_unichar_id(unichar_id)); 00641 return unichars[unichar_id].properties.direction; 00642 } 00643 00644 // Get mirror unichar id in the properties for the given unichar id. 00645 UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const { 00646 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00647 ASSERT_HOST(contains_unichar_id(unichar_id)); 00648 return unichars[unichar_id].properties.mirror; 00649 } 00650 00651 // Returns UNICHAR_ID of the corresponding lower-case unichar. 00652 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const { 00653 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00654 ASSERT_HOST(contains_unichar_id(unichar_id)); 00655 if (unichars[unichar_id].properties.islower) return unichar_id; 00656 return unichars[unichar_id].properties.other_case; 00657 } 00658 00659 // Returns UNICHAR_ID of the corresponding upper-case unichar. 00660 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const { 00661 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00662 ASSERT_HOST(contains_unichar_id(unichar_id)); 00663 if (unichars[unichar_id].properties.isupper) return unichar_id; 00664 return unichars[unichar_id].properties.other_case; 00665 } 00666 00667 // Returns true if this UNICHARSET has the special codes in 00668 // SpecialUnicharCodes available. If false then there are normal unichars 00669 // at these codes and they should not be used. 00670 bool has_special_codes() const { 00671 return get_fragment(UNICHAR_BROKEN) != NULL && 00672 strcmp(id_to_unichar(UNICHAR_BROKEN), 00673 kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0; 00674 } 00675 00676 // Returns true if there are any repeated unicodes in the normalized 00677 // text of any unichar-id in the unicharset. 00678 bool AnyRepeatedUnicodes() const; 00679 00680 // Return a pointer to the CHAR_FRAGMENT class if the given 00681 // unichar id represents a character fragment. 00682 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { 00683 if (INVALID_UNICHAR_ID == unichar_id) return NULL; 00684 ASSERT_HOST(contains_unichar_id(unichar_id)); 00685 return unichars[unichar_id].properties.fragment; 00686 } 00687 00688 // Return the isalpha property of the given unichar representation. 00689 bool get_isalpha(const char* const unichar_repr) const { 00690 return get_isalpha(unichar_to_id(unichar_repr)); 00691 } 00692 00693 // Return the islower property of the given unichar representation. 00694 bool get_islower(const char* const unichar_repr) const { 00695 return get_islower(unichar_to_id(unichar_repr)); 00696 } 00697 00698 // Return the isupper property of the given unichar representation. 00699 bool get_isupper(const char* const unichar_repr) const { 00700 return get_isupper(unichar_to_id(unichar_repr)); 00701 } 00702 00703 // Return the isdigit property of the given unichar representation. 00704 bool get_isdigit(const char* const unichar_repr) const { 00705 return get_isdigit(unichar_to_id(unichar_repr)); 00706 } 00707 00708 // Return the ispunctuation property of the given unichar representation. 00709 bool get_ispunctuation(const char* const unichar_repr) const { 00710 return get_ispunctuation(unichar_to_id(unichar_repr)); 00711 } 00712 00713 // Return the character properties, eg. alpha/upper/lower/digit/punct, 00714 // of the given unichar representation 00715 unsigned int get_properties(const char* const unichar_repr) const { 00716 return get_properties(unichar_to_id(unichar_repr)); 00717 } 00718 00719 char get_chartype(const char* const unichar_repr) const { 00720 return get_chartype(unichar_to_id(unichar_repr)); 00721 } 00722 00723 // Return the script name of the given unichar representation. 00724 // The returned pointer will always be the same for the same script, it's 00725 // managed by unicharset and thus MUST NOT be deleted 00726 int get_script(const char* const unichar_repr) const { 00727 return get_script(unichar_to_id(unichar_repr)); 00728 } 00729 00730 // Return a pointer to the CHAR_FRAGMENT class struct if the given 00731 // unichar representation represents a character fragment. 00732 const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const { 00733 if (unichar_repr == NULL || unichar_repr[0] == '\0' || 00734 !ids.contains(unichar_repr)) { 00735 return NULL; 00736 } 00737 return get_fragment(unichar_to_id(unichar_repr)); 00738 } 00739 00740 // Return the isalpha property of the given unichar representation. 00741 // Only the first length characters from unichar_repr are used. 00742 bool get_isalpha(const char* const unichar_repr, 00743 int length) const { 00744 return get_isalpha(unichar_to_id(unichar_repr, length)); 00745 } 00746 00747 // Return the islower property of the given unichar representation. 00748 // Only the first length characters from unichar_repr are used. 00749 bool get_islower(const char* const unichar_repr, 00750 int length) const { 00751 return get_islower(unichar_to_id(unichar_repr, length)); 00752 } 00753 00754 // Return the isupper property of the given unichar representation. 00755 // Only the first length characters from unichar_repr are used. 00756 bool get_isupper(const char* const unichar_repr, 00757 int length) const { 00758 return get_isupper(unichar_to_id(unichar_repr, length)); 00759 } 00760 00761 // Return the isdigit property of the given unichar representation. 00762 // Only the first length characters from unichar_repr are used. 00763 bool get_isdigit(const char* const unichar_repr, 00764 int length) const { 00765 return get_isdigit(unichar_to_id(unichar_repr, length)); 00766 } 00767 00768 // Return the ispunctuation property of the given unichar representation. 00769 // Only the first length characters from unichar_repr are used. 00770 bool get_ispunctuation(const char* const unichar_repr, 00771 int length) const { 00772 return get_ispunctuation(unichar_to_id(unichar_repr, length)); 00773 } 00774 00775 // Returns normalized version of unichar with the given unichar_id. 00776 const char *get_normed_unichar(UNICHAR_ID unichar_id) const { 00777 if (unichar_id == UNICHAR_SPACE && has_special_codes()) return " "; 00778 return unichars[unichar_id].properties.normed.string(); 00779 } 00780 // Returns a vector of UNICHAR_IDs that represent the ids of the normalized 00781 // version of the given id. There may be more than one UNICHAR_ID in the 00782 // vector if unichar_id represents a ligature. 00783 const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const { 00784 return unichars[unichar_id].properties.normed_ids; 00785 } 00786 00787 // Return the script name of the given unichar representation. 00788 // Only the first length characters from unichar_repr are used. 00789 // The returned pointer will always be the same for the same script, it's 00790 // managed by unicharset and thus MUST NOT be deleted 00791 int get_script(const char* const unichar_repr, 00792 int length) const { 00793 return get_script(unichar_to_id(unichar_repr, length)); 00794 } 00795 00796 // Return the (current) number of scripts in the script table 00797 int get_script_table_size() const { 00798 return script_table_size_used; 00799 } 00800 00801 // Return the script string from its id 00802 const char* get_script_from_script_id(int id) const { 00803 if (id >= script_table_size_used || id < 0) 00804 return null_script; 00805 return script_table[id]; 00806 } 00807 00808 // Returns the id from the name of the script, or 0 if script is not found. 00809 // Note that this is an expensive operation since it involves iteratively 00810 // comparing strings in the script table. To avoid dependency on STL, we 00811 // won't use a hash. Instead, the calling function can use this to lookup 00812 // and save the ID for relevant scripts for fast comparisons later. 00813 int get_script_id_from_name(const char* script_name) const; 00814 00815 // Return true if the given script is the null script 00816 bool is_null_script(const char* script) const { 00817 return script == null_script; 00818 } 00819 00820 // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0, 00821 // then the returned pointer will be the same. 00822 // The script parameter is copied and thus can be a temporary. 00823 int add_script(const char* script); 00824 00825 // Return the enabled property of the given unichar. 00826 bool get_enabled(UNICHAR_ID unichar_id) const { 00827 return unichars[unichar_id].properties.enabled; 00828 } 00829 00830 00831 int null_sid() const { return null_sid_; } 00832 int common_sid() const { return common_sid_; } 00833 int latin_sid() const { return latin_sid_; } 00834 int cyrillic_sid() const { return cyrillic_sid_; } 00835 int greek_sid() const { return greek_sid_; } 00836 int han_sid() const { return han_sid_; } 00837 int hiragana_sid() const { return hiragana_sid_; } 00838 int katakana_sid() const { return katakana_sid_; } 00839 int default_sid() const { return default_sid_; } 00840 00841 // Returns true if the unicharset has the concept of upper/lower case. 00842 bool script_has_upper_lower() const { 00843 return script_has_upper_lower_; 00844 } 00845 // Returns true if the unicharset has the concept of x-height. 00846 // script_has_xheight can be true even if script_has_upper_lower is not, 00847 // when the script has a sufficiently predominant top line with ascenders, 00848 // such as Devanagari and Thai. 00849 bool script_has_xheight() const { 00850 return script_has_xheight_; 00851 } 00852 00853 private: 00854 00855 struct UNICHAR_PROPERTIES { 00856 UNICHAR_PROPERTIES(); 00857 // Initializes all properties to sensible default values. 00858 void Init(); 00859 // Sets all ranges wide open. Initialization default in case there are 00860 // no useful values available. 00861 void SetRangesOpen(); 00862 // Sets all ranges to empty. Used before expanding with font-based data. 00863 void SetRangesEmpty(); 00864 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats 00865 // is emtpy. 00866 bool AnyRangeEmpty() const; 00867 // Expands the ranges with the ranges from the src properties. 00868 void ExpandRangesFrom(const UNICHAR_PROPERTIES& src); 00869 // Copies the properties from src into this. 00870 void CopyFrom(const UNICHAR_PROPERTIES& src); 00871 00872 bool isalpha; 00873 bool islower; 00874 bool isupper; 00875 bool isdigit; 00876 bool ispunctuation; 00877 bool isngram; 00878 bool enabled; 00879 // Possible limits of the top and bottom of the bounding box in 00880 // baseline-normalized coordinates, ie, where the baseline is 00881 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight 00882 // (See normalis.h for the definitions). 00883 uinT8 min_bottom; 00884 uinT8 max_bottom; 00885 uinT8 min_top; 00886 uinT8 max_top; 00887 // Statstics of the widths of bounding box, relative to the median advance. 00888 float width; 00889 float width_sd; 00890 // Stats of the x-bearing and advance, also relative to the median advance. 00891 float bearing; 00892 float bearing_sd; 00893 float advance; 00894 float advance_sd; 00895 int script_id; 00896 UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar 00897 Direction direction; // direction of this unichar 00898 // Mirror property is useful for reverse DAWG lookup for words in 00899 // right-to-left languages (e.g. "(word)" would be in 00900 // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string. 00901 // However, what we want in our DAWG is 00902 // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not 00903 // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'. 00904 UNICHAR_ID mirror; 00905 // A string of unichar_ids that represent the corresponding normed string. 00906 // For awkward characters like em-dash, this gives hyphen. 00907 // For ligatures, this gives the string of normal unichars. 00908 GenericVector<UNICHAR_ID> normed_ids; 00909 STRING normed; // normalized version of this unichar 00910 // Contains meta information about the fragment if a unichar represents 00911 // a fragment of a character, otherwise should be set to NULL. 00912 // It is assumed that character fragments are added to the unicharset 00913 // after the corresponding 'base' characters. 00914 CHAR_FRAGMENT *fragment; 00915 }; 00916 00917 struct UNICHAR_SLOT { 00918 char representation[UNICHAR_LEN + 1]; 00919 UNICHAR_PROPERTIES properties; 00920 }; 00921 00922 // Internal recursive version of encode_string above. 00923 // str is the start of the whole string. 00924 // str_index is the current position in str. 00925 // str_length is the length of str. 00926 // encoding is a working encoding of str. 00927 // lengths is a working set of lengths of each element of encoding. 00928 // best_total_length is the longest length of str that has been successfully 00929 // encoded so far. 00930 // On return: 00931 // best_encoding contains the encoding that used the longest part of str. 00932 // best_lengths (may be null) contains the lengths of best_encoding. 00933 void encode_string(const char* str, int str_index, int str_length, 00934 GenericVector<UNICHAR_ID>* encoding, 00935 GenericVector<char>* lengths, 00936 int* best_total_length, 00937 GenericVector<UNICHAR_ID>* best_encoding, 00938 GenericVector<char>* best_lengths) const; 00939 00940 // Gets the properties for a grapheme string, combining properties for 00941 // multiple characters in a meaningful way where possible. 00942 // Returns false if no valid match was found in the unicharset. 00943 // NOTE that script_id, mirror, and other_case refer to this unicharset on 00944 // return and will need redirecting if the target unicharset is different. 00945 bool GetStrProperties(const char* utf8_str, 00946 UNICHAR_PROPERTIES* props) const; 00947 00948 // Load ourselves from a "file" where our only interface to the file is 00949 // an implementation of fgets(). This is the parsing primitive accessed by 00950 // the public routines load_from_file() and load_from_inmemory_file(). 00951 bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb, 00952 bool skip_fragments); 00953 00954 UNICHAR_SLOT* unichars; 00955 UNICHARMAP ids; 00956 int size_used; 00957 int size_reserved; 00958 char** script_table; 00959 int script_table_size_used; 00960 int script_table_size_reserved; 00961 const char* null_script; 00962 // True if the unichars have their tops/bottoms set. 00963 bool top_bottom_set_; 00964 // True if the unicharset has significant upper/lower case chars. 00965 bool script_has_upper_lower_; 00966 // True if the unicharset has a significant mean-line with significant 00967 // ascenders above that. 00968 bool script_has_xheight_; 00969 00970 // A few convenient script name-to-id mapping without using hash. 00971 // These are initialized when unicharset file is loaded. Anything 00972 // missing from this list can be looked up using get_script_id_from_name. 00973 int null_sid_; 00974 int common_sid_; 00975 int latin_sid_; 00976 int cyrillic_sid_; 00977 int greek_sid_; 00978 int han_sid_; 00979 int hiragana_sid_; 00980 int katakana_sid_; 00981 // The most frequently occurring script in the charset. 00982 int default_sid_; 00983 }; 00984 00985 #endif // TESSERACT_CCUTIL_UNICHARSET_H__