tesseract 3.04.01

ccutil/unicharset.h

Go to the documentation of this file.
00001 
00002 // File:        unicharset.h
00003 // Description: Unicode character/ligature set class.
00004 // Author:      Thomas Kielbus
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
00021 #define TESSERACT_CCUTIL_UNICHARSET_H__
00022 
00023 #include "errcode.h"
00024 #include "genericvector.h"
00025 #include "helpers.h"
00026 #include "serialis.h"
00027 #include "strngs.h"
00028 #include "tesscallback.h"
00029 #include "unichar.h"
00030 #include "unicharmap.h"
00031 
// Special values of unichar_id. Every unicharset has these.
// Warning! Keep in sync with kSpecialUnicharCodes.
enum SpecialUnicharCodes {
  UNICHAR_SPACE = 0,
  UNICHAR_JOINED = 1,
  UNICHAR_BROKEN = 2,

  // Number of special codes; must stay the last enumerator.
  SPECIAL_UNICHAR_CODES_COUNT
};
00041 
00042 class CHAR_FRAGMENT {
00043  public:
00044   // Minimum number of characters used for fragment representation.
00045   static const int kMinLen = 6;
00046   // Maximum number of characters used for fragment representation.
00047   static const int kMaxLen = 3 + UNICHAR_LEN + 2;
00048   // Maximum number of fragments per character.
00049   static const int kMaxChunks = 5;
00050 
00051   // Setters and Getters.
00052   inline void set_all(const char *unichar, int pos, int total, bool natural) {
00053     set_unichar(unichar);
00054     set_pos(pos);
00055     set_total(total);
00056     set_natural(natural);
00057   }
00058   inline void set_unichar(const char *uch) {
00059     strncpy(this->unichar, uch, UNICHAR_LEN);
00060     this->unichar[UNICHAR_LEN] = '\0';
00061   }
00062   inline void set_pos(int p) { this->pos = p; }
00063   inline void set_total(int t) { this->total = t; }
00064   inline const char* get_unichar() const { return this->unichar; }
00065   inline int get_pos() const { return this->pos; }
00066   inline int get_total() const { return this->total; }
00067 
00068   // Returns the string that represents a fragment
00069   // with the given unichar, pos and total.
00070   static STRING to_string(const char *unichar, int pos, int total,
00071                           bool natural);
00072   // Returns the string that represents this fragment.
00073   STRING to_string() const {
00074     return to_string(unichar, pos, total, natural);
00075   }
00076 
00077   // Checks whether a fragment has the same unichar,
00078   // position and total as the given inputs.
00079   inline bool equals(const char *other_unichar,
00080                      int other_pos, int other_total) const {
00081     return (strcmp(this->unichar, other_unichar) == 0 &&
00082             this->pos == other_pos && this->total == other_total);
00083   }
00084   inline bool equals(const CHAR_FRAGMENT *other) const {
00085     return this->equals(other->get_unichar(),
00086                         other->get_pos(),
00087                         other->get_total());
00088   }
00089 
00090   // Checks whether a given fragment is a continuation of this fragment.
00091   // Assumes that the given fragment pointer is not NULL.
00092   inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
00093     return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
00094             this->total == fragment->get_total() &&
00095             this->pos == fragment->get_pos() + 1);
00096   }
00097 
00098   // Returns true if this fragment is a beginning fragment.
00099   inline bool is_beginning() const { return this->pos == 0; }
00100 
00101   // Returns true if this fragment is an ending fragment.
00102   inline bool is_ending() const { return this->pos == this->total-1; }
00103 
00104   // Returns true if the fragment was a separate component to begin with,
00105   // ie did not need chopping to be isolated, but may have been separated
00106   // out from a multi-outline blob.
00107   inline bool is_natural() const { return natural; }
00108   void set_natural(bool value) { natural = value; }
00109 
00110   // Parses the string to see whether it represents a character fragment
00111   // (rather than a regular character). If so, allocates memory for a new
00112   // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
00113   // information. Fragments are of the form:
00114   // |m|1|2, meaning chunk 1 of 2 of character m, or
00115   // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
00116   // to divide the parts, as they were already separate connected components.
00117   //
00118   // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
00119   // instance, otherwise (if the string does not represent a fragment or it
00120   // looks like it does, but parsing it as a fragment fails) returns NULL.
00121   //
00122   // Note: The caller is responsible for deallocating memory
00123   // associated with the returned pointer.
00124   static CHAR_FRAGMENT *parse_from_string(const char *str);
00125 
00126  private:
00127   char unichar[UNICHAR_LEN + 1];
00128   // True if the fragment was a separate component to begin with,
00129   // ie did not need chopping to be isolated, but may have been separated
00130   // out from a multi-outline blob.
00131   bool natural;
00132   inT16 pos;    // fragment position in the character
00133   inT16 total;  // total number of fragments in the character
00134 };
00135 
00136 // The UNICHARSET class is a utility class for Tesseract that holds the
00137 // set of characters that are used by the engine. Each character is identified
00138 // by a unique number, from 0 to (size - 1).
00139 class UNICHARSET {
00140  public:
00141   // Custom list of characters and their ligature forms (UTF8)
00142   // These map to unicode values in the private use area (PUC) and are supported
00143   // by only a few font families (eg. Wyld, Adobe Caslon Pro).
00144   static const char* kCustomLigatures[][2];
00145 
00146   // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
00147   static const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
00148 
// Character direction classes. The numeric values are those of the
// ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h),
// so they must not be renumbered.
enum Direction {
  U_LEFT_TO_RIGHT = 0,
  U_RIGHT_TO_LEFT = 1,
  U_EUROPEAN_NUMBER = 2,
  U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4,
  U_ARABIC_NUMBER = 5,
  U_COMMON_NUMBER_SEPARATOR = 6,
  U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8,
  U_WHITE_SPACE_NEUTRAL = 9,
  U_OTHER_NEUTRAL = 10,
  U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12,
  U_RIGHT_TO_LEFT_ARABIC = 13,
  U_RIGHT_TO_LEFT_EMBEDDING = 14,
  U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16,
  U_DIR_NON_SPACING_MARK = 17,
  U_BOUNDARY_NEUTRAL = 18,
  // Number of direction classes; must stay the last enumerator.
  U_CHAR_DIRECTION_COUNT
};
00172 
00173   // Create an empty UNICHARSET
00174   UNICHARSET();
00175 
00176   ~UNICHARSET();
00177 
00178   // Return the UNICHAR_ID of a given unichar representation within the
00179   // UNICHARSET.
00180   UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
00181 
00182   // Return the UNICHAR_ID of a given unichar representation within the
00183   // UNICHARSET. Only the first length characters from unichar_repr are used.
00184   UNICHAR_ID unichar_to_id(const char* const unichar_repr,
00185                                  int length) const;
00186 
00187   // Return the minimum number of bytes that matches a legal UNICHAR_ID,
00188   // while leaving the rest of the string encodable. Returns 0 if the
00189   // beginning of the string is not encodable.
00190   // WARNING: this function now encodes the whole string for precision.
00191   // Use encode_string in preference to repeatedly calling step.
00192   int step(const char* str) const;
00193 
00194   // Return whether the given UTF-8 string is encodable with this UNICHARSET.
00195   // If not encodable, write the first byte offset which cannot be converted
00196   // into the second (return) argument.
00197   bool encodable_string(const char *str, int *first_bad_position) const;
00198 
00199   // Encodes the given UTF-8 string with this UNICHARSET.
00200   // Any part of the string that cannot be encoded (because the utf8 can't
00201   // be broken up into pieces that are in the unicharset) then:
00202   // if give_up_on_failure, stops and returns a partial encoding,
00203   // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
00204   // Returns true if the encoding succeeds completely, false if there is at
00205   // least one failure.
00206   // If lengths is not NULL, then it is filled with the corresponding
00207   // byte length of each encoded UNICHAR_ID.
00208   // If encoded_length is not NULL then on return it contains the length of
00209   // str that was encoded. (if give_up_on_failure the location of the first
00210   // failure, otherwise strlen(str).)
00211   bool encode_string(const char* str, bool give_up_on_failure,
00212                      GenericVector<UNICHAR_ID>* encoding,
00213                      GenericVector<char>* lengths,
00214                      int* encoded_length) const;
00215 
00216   // Return the unichar representation corresponding to the given UNICHAR_ID
00217   // within the UNICHARSET.
00218   const char* id_to_unichar(UNICHAR_ID id) const;
00219 
00220   // Return the UTF8 representation corresponding to the given UNICHAR_ID after
00221   // resolving any private encodings internal to Tesseract. This method is
00222   // preferable to id_to_unichar for outputting text that will be visible to
00223   // external applications.
00224   const char* id_to_unichar_ext(UNICHAR_ID id) const;
00225 
00226   // Return a STRING that reformats the utf8 str into the str followed
00227   // by its hex unicodes.
00228   static STRING debug_utf8_str(const char* str);
00229 
00230   // Return a STRING containing debug information on the unichar, including
00231   // the id_to_unichar, its hex unicodes and the properties.
00232   STRING debug_str(UNICHAR_ID id) const;
00233   STRING debug_str(const char * unichar_repr) const {
00234     return debug_str(unichar_to_id(unichar_repr));
00235   }
00236 
00237   // Add a unichar representation to the set.
00238   void unichar_insert(const char* const unichar_repr);
00239 
00240   // Return true if the given unichar id exists within the set.
00241   // Relies on the fact that unichar ids are contiguous in the unicharset.
00242   bool contains_unichar_id(UNICHAR_ID unichar_id) const {
00243     return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
00244         unichar_id >= 0;
00245   }
00246 
00247   // Return true if the given unichar representation exists within the set.
00248   bool contains_unichar(const char* const unichar_repr) const;
00249   bool contains_unichar(const char* const unichar_repr, int length) const;
00250 
00251   // Return true if the given unichar representation corresponds to the given
00252   // UNICHAR_ID within the set.
00253   bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
00254 
00255   // Delete CHAR_FRAGMENTs stored in properties of unichars array.
00256   void delete_pointers_in_unichars() {
00257     for (int i = 0; i < size_used; ++i) {
00258       if (unichars[i].properties.fragment != NULL) {
00259         delete unichars[i].properties.fragment;
00260         unichars[i].properties.fragment = NULL;
00261       }
00262     }
00263   }
00264 
00265   // Clear the UNICHARSET (all the previous data is lost).
00266   void clear() {
00267     if (script_table != NULL) {
00268       for (int i = 0; i < script_table_size_used; ++i)
00269         delete[] script_table[i];
00270       delete[] script_table;
00271       script_table = NULL;
00272       script_table_size_used = 0;
00273     }
00274     if (unichars != NULL) {
00275       delete_pointers_in_unichars();
00276       delete[] unichars;
00277       unichars = NULL;
00278     }
00279     script_table_size_reserved = 0;
00280     size_reserved = 0;
00281     size_used = 0;
00282     ids.clear();
00283     top_bottom_set_ = false;
00284     script_has_upper_lower_ = false;
00285     script_has_xheight_ = false;
00286     null_sid_ = 0;
00287     common_sid_ = 0;
00288     latin_sid_ = 0;
00289     cyrillic_sid_ = 0;
00290     greek_sid_ = 0;
00291     han_sid_ = 0;
00292     hiragana_sid_ = 0;
00293     katakana_sid_ = 0;
00294   }
00295 
00296   // Return the size of the set (the number of different UNICHAR it holds).
00297   int size() const {
00298     return size_used;
00299   }
00300 
00301   // Reserve enough memory space for the given number of UNICHARS
00302   void reserve(int unichars_number);
00303 
00304   // Opens the file indicated by filename and saves unicharset to that file.
00305   // Returns true if the operation is successful.
00306   bool save_to_file(const char * const filename) const {
00307     FILE* file = fopen(filename, "w+b");
00308     if (file == NULL) return false;
00309     bool result = save_to_file(file);
00310     fclose(file);
00311     return result;
00312   }
00313 
00314   // Saves the content of the UNICHARSET to the given file.
00315   // Returns true if the operation is successful.
00316   bool save_to_file(FILE *file) const {
00317     STRING str;
00318     if (!save_to_string(&str)) return false;
00319     if (fwrite(&str[0], str.length(), 1, file) != 1) return false;
00320     return true;
00321   }
00322   bool save_to_file(tesseract::TFile *file) const {
00323     STRING str;
00324     if (!save_to_string(&str)) return false;
00325     if (file->FWrite(&str[0], str.length(), 1) != 1) return false;
00326     return true;
00327   }
00328 
00329   // Saves the content of the UNICHARSET to the given STRING.
00330   // Returns true if the operation is successful.
00331   bool save_to_string(STRING *str) const;
00332 
00333   // Load a unicharset from a unicharset file that has been loaded into
00334   // the given memory buffer.
00335   // Returns true if the operation is successful.
00336   bool load_from_inmemory_file(const char* const memory, int mem_size,
00337                                bool skip_fragments);
00338   // Returns true if the operation is successful.
00339   bool load_from_inmemory_file(const char* const memory, int mem_size) {
00340     return load_from_inmemory_file(memory, mem_size, false);
00341   }
00342 
00343   // Opens the file indicated by filename and loads the UNICHARSET
00344   // from the given file. The previous data is lost.
00345   // Returns true if the operation is successful.
00346   bool load_from_file(const char* const filename, bool skip_fragments) {
00347     FILE* file = fopen(filename, "rb");
00348     if (file == NULL) return false;
00349     bool result = load_from_file(file, skip_fragments);
00350     fclose(file);
00351     return result;
00352   }
00353   // returns true if the operation is successful.
00354   bool load_from_file(const char* const filename) {
00355     return load_from_file(filename, false);
00356   }
00357 
00358   // Loads the UNICHARSET from the given file. The previous data is lost.
00359   // Returns true if the operation is successful.
00360   bool load_from_file(FILE *file, bool skip_fragments);
00361   bool load_from_file(FILE *file) { return load_from_file(file, false); }
00362   bool load_from_file(tesseract::TFile *file, bool skip_fragments);
00363 
00364 
00365   // Sets up internal data after loading the file, based on the char
00366   // properties. Called from load_from_file, but also needs to be run
00367   // during set_unicharset_properties.
00368   void post_load_setup();
00369 
00370   // Returns true if right_to_left scripts are significant in the unicharset,
00371   // but without being so sensitive that "universal" unicharsets containing
00372   // characters from many scripts, like orientation and script detection,
00373   // look like they are right_to_left.
00374   bool major_right_to_left() const;
00375 
00376   // Set a whitelist and/or blacklist of characters to recognize.
00377   // An empty or NULL whitelist enables everything (minus any blacklist).
00378   // An empty or NULL blacklist disables nothing.
00379   // An empty or NULL unblacklist has no effect.
00380   // The blacklist overrides the whitelist.
00381   // The unblacklist overrides the blacklist.
00382   // Each list is a string of utf8 character strings. Boundaries between
00383   // unicharset units are worked out automatically, and characters not in
00384   // the unicharset are silently ignored.
00385   void set_black_and_whitelist(const char* blacklist, const char* whitelist,
00386                                const char* unblacklist);
00387 
00388   // Set the isalpha property of the given unichar to the given value.
00389   void set_isalpha(UNICHAR_ID unichar_id, bool value) {
00390     unichars[unichar_id].properties.isalpha = value;
00391   }
00392 
00393   // Set the islower property of the given unichar to the given value.
00394   void set_islower(UNICHAR_ID unichar_id, bool value) {
00395     unichars[unichar_id].properties.islower = value;
00396   }
00397 
00398   // Set the isupper property of the given unichar to the given value.
00399   void set_isupper(UNICHAR_ID unichar_id, bool value) {
00400     unichars[unichar_id].properties.isupper = value;
00401   }
00402 
00403   // Set the isdigit property of the given unichar to the given value.
00404   void set_isdigit(UNICHAR_ID unichar_id, bool value) {
00405     unichars[unichar_id].properties.isdigit = value;
00406   }
00407 
00408   // Set the ispunctuation property of the given unichar to the given value.
00409   void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
00410     unichars[unichar_id].properties.ispunctuation = value;
00411   }
00412 
00413   // Set the isngram property of the given unichar to the given value.
00414   void set_isngram(UNICHAR_ID unichar_id, bool value) {
00415     unichars[unichar_id].properties.isngram = value;
00416   }
00417 
00418   // Set the script name of the given unichar to the given value.
00419   // Value is copied and thus can be a temporary;
00420   void set_script(UNICHAR_ID unichar_id, const char* value) {
00421     unichars[unichar_id].properties.script_id = add_script(value);
00422   }
00423 
00424   // Set other_case unichar id in the properties for the given unichar id.
00425   void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
00426     unichars[unichar_id].properties.other_case = other_case;
00427   }
00428 
00429   // Set the direction property of the given unichar to the given value.
00430   void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
00431     unichars[unichar_id].properties.direction = value;
00432   }
00433 
00434   // Set mirror unichar id in the properties for the given unichar id.
00435   void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
00436     unichars[unichar_id].properties.mirror = mirror;
00437   }
00438 
00439   // Record normalized version of unichar with the given unichar_id.
00440   void set_normed(UNICHAR_ID unichar_id, const char* normed) {
00441     unichars[unichar_id].properties.normed = normed;
00442     unichars[unichar_id].properties.normed_ids.truncate(0);
00443   }
00444   // Sets the normed_ids vector from the normed string. normed_ids is not
00445   // stored in the file, and needs to be set when the UNICHARSET is loaded.
00446   void set_normed_ids(UNICHAR_ID unichar_id);
00447 
00448   // Return the isalpha property of the given unichar.
00449   bool get_isalpha(UNICHAR_ID unichar_id) const {
00450     if (INVALID_UNICHAR_ID == unichar_id) return false;
00451     ASSERT_HOST(contains_unichar_id(unichar_id));
00452     return unichars[unichar_id].properties.isalpha;
00453   }
00454 
00455   // Return the islower property of the given unichar.
00456   bool get_islower(UNICHAR_ID unichar_id) const {
00457     if (INVALID_UNICHAR_ID == unichar_id) return false;
00458     ASSERT_HOST(contains_unichar_id(unichar_id));
00459     return unichars[unichar_id].properties.islower;
00460   }
00461 
00462   // Return the isupper property of the given unichar.
00463   bool get_isupper(UNICHAR_ID unichar_id) const {
00464     if (INVALID_UNICHAR_ID == unichar_id) return false;
00465     ASSERT_HOST(contains_unichar_id(unichar_id));
00466     return unichars[unichar_id].properties.isupper;
00467   }
00468 
00469   // Return the isdigit property of the given unichar.
00470   bool get_isdigit(UNICHAR_ID unichar_id) const {
00471     if (INVALID_UNICHAR_ID == unichar_id) return false;
00472     ASSERT_HOST(contains_unichar_id(unichar_id));
00473     return unichars[unichar_id].properties.isdigit;
00474   }
00475 
00476   // Return the ispunctuation property of the given unichar.
00477   bool get_ispunctuation(UNICHAR_ID unichar_id) const {
00478     if (INVALID_UNICHAR_ID == unichar_id) return false;
00479     ASSERT_HOST(contains_unichar_id(unichar_id));
00480     return unichars[unichar_id].properties.ispunctuation;
00481   }
00482 
00483   // Return the isngram property of the given unichar.
00484   bool get_isngram(UNICHAR_ID unichar_id) const {
00485     if (INVALID_UNICHAR_ID == unichar_id) return false;
00486     ASSERT_HOST(contains_unichar_id(unichar_id));
00487     return unichars[unichar_id].properties.isngram;
00488   }
00489 
00490   // Returns whether the unichar id represents a unicode value in the private
00491   // use area.
00492   bool get_isprivate(UNICHAR_ID unichar_id) const;
00493 
00494   // Returns true if the ids have useful min/max top/bottom values.
00495   bool top_bottom_useful() const {
00496     return top_bottom_set_;
00497   }
00498   // Sets all ranges to empty, so they can be expanded to set the values.
00499   void set_ranges_empty();
00500   // Sets all the properties for this unicharset given a src_unicharset with
00501   // everything set. The unicharsets don't have to be the same, and graphemes
00502   // are correctly accounted for.
00503   void SetPropertiesFromOther(const UNICHARSET& src) {
00504     PartialSetPropertiesFromOther(0, src);
00505   }
00506   // Sets properties from Other, starting only at the given index.
00507   void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src);
00508   // Expands the tops and bottoms and widths for this unicharset given a
00509   // src_unicharset with ranges in it. The unicharsets don't have to be the
00510   // same, and graphemes are correctly accounted for.
00511   void ExpandRangesFromOther(const UNICHARSET& src);
00512   // Makes this a copy of src. Clears this completely first, so the automatic
00513   // ids will not be present in this if not in src.
00514   void CopyFrom(const UNICHARSET& src);
00515   // For each id in src, if it does not occur in this, add it, as in
00516   // SetPropertiesFromOther, otherwise expand the ranges, as in
00517   // ExpandRangesFromOther.
00518   void AppendOtherUnicharset(const UNICHARSET& src);
00519   // Returns true if the acceptable ranges of the tops of the characters do
00520   // not overlap, making their x-height calculations distinct.
00521   bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
00522   // Returns the min and max bottom and top of the given unichar in
00523   // baseline-normalized coordinates, ie, where the baseline is
00524   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
00525   // (See normalis.h for the definitions).
00526   void get_top_bottom(UNICHAR_ID unichar_id,
00527                       int* min_bottom, int* max_bottom,
00528                       int* min_top, int* max_top) const {
00529     if (INVALID_UNICHAR_ID == unichar_id) {
00530       *min_bottom = *min_top = 0;
00531       *max_bottom = *max_top = 256;  // kBlnCellHeight
00532       return;
00533     }
00534     ASSERT_HOST(contains_unichar_id(unichar_id));
00535     *min_bottom = unichars[unichar_id].properties.min_bottom;
00536     *max_bottom = unichars[unichar_id].properties.max_bottom;
00537     *min_top = unichars[unichar_id].properties.min_top;
00538     *max_top = unichars[unichar_id].properties.max_top;
00539   }
00540   void set_top_bottom(UNICHAR_ID unichar_id,
00541                       int min_bottom, int max_bottom,
00542                       int min_top, int max_top) {
00543     unichars[unichar_id].properties.min_bottom =
00544         static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
00545     unichars[unichar_id].properties.max_bottom =
00546         static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
00547     unichars[unichar_id].properties.min_top =
00548         static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
00549     unichars[unichar_id].properties.max_top =
00550         static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
00551   }
00552   // Returns the width stats (as mean, sd) of the given unichar relative to the
00553   // median advance of all characters in the character set.
00554   void get_width_stats(UNICHAR_ID unichar_id,
00555                        float* width, float* width_sd) const {
00556     if (INVALID_UNICHAR_ID == unichar_id) {
00557       *width = 0.0f;
00558       *width_sd = 0.0f;;
00559       return;
00560     }
00561     ASSERT_HOST(contains_unichar_id(unichar_id));
00562     *width = unichars[unichar_id].properties.width;
00563     *width_sd = unichars[unichar_id].properties.width_sd;
00564   }
00565   void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
00566     unichars[unichar_id].properties.width = width;
00567     unichars[unichar_id].properties.width_sd = width_sd;
00568   }
00569   // Returns the stats of the x-bearing (as mean, sd) of the given unichar
00570   // relative to the median advance of all characters in the character set.
00571   void get_bearing_stats(UNICHAR_ID unichar_id,
00572                          float* bearing, float* bearing_sd) const {
00573     if (INVALID_UNICHAR_ID == unichar_id) {
00574       *bearing = *bearing_sd = 0.0f;
00575       return;
00576     }
00577     ASSERT_HOST(contains_unichar_id(unichar_id));
00578     *bearing = unichars[unichar_id].properties.bearing;
00579     *bearing_sd = unichars[unichar_id].properties.bearing_sd;
00580   }
00581   void set_bearing_stats(UNICHAR_ID unichar_id,
00582                          float bearing, float bearing_sd) {
00583     unichars[unichar_id].properties.bearing = bearing;
00584     unichars[unichar_id].properties.bearing_sd = bearing_sd;
00585   }
00586   // Returns the stats of the x-advance of the given unichar (as mean, sd)
00587   // relative to the median advance of all characters in the character set.
00588   void get_advance_stats(UNICHAR_ID unichar_id,
00589                          float* advance, float* advance_sd) const {
00590     if (INVALID_UNICHAR_ID == unichar_id) {
00591       *advance = *advance_sd = 0;
00592       return;
00593     }
00594     ASSERT_HOST(contains_unichar_id(unichar_id));
00595     *advance = unichars[unichar_id].properties.advance;
00596     *advance_sd = unichars[unichar_id].properties.advance_sd;
00597   }
00598   void set_advance_stats(UNICHAR_ID unichar_id,
00599                          float advance, float advance_sd) {
00600     unichars[unichar_id].properties.advance = advance;
00601     unichars[unichar_id].properties.advance_sd = advance_sd;
00602   }
00603   // Returns true if the font metrics properties are empty.
00604   bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
00605     return unichars[unichar_id].properties.AnyRangeEmpty();
00606   }
00607 
00608   // Return the script name of the given unichar.
00609   // The returned pointer will always be the same for the same script, it's
00610   // managed by unicharset and thus MUST NOT be deleted
00611   int get_script(UNICHAR_ID unichar_id) const {
00612     if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
00613     ASSERT_HOST(contains_unichar_id(unichar_id));
00614     return unichars[unichar_id].properties.script_id;
00615   }
00616 
00617   // Return the character properties, eg. alpha/upper/lower/digit/punct,
00618   // as a bit field of unsigned int.
00619   unsigned int get_properties(UNICHAR_ID unichar_id) const;
00620 
00621   // Return the character property as a single char.  If a character has
00622   // multiple attributes, the main property is defined by the following order:
00623   //   upper_case : 'A'
00624   //   lower_case : 'a'
00625   //   alpha      : 'x'
00626   //   digit      : '0'
00627   //   punctuation: 'p'
00628   char get_chartype(UNICHAR_ID unichar_id) const;
00629 
00630   // Get other_case unichar id in the properties for the given unichar id.
00631   UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
00632     if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00633     ASSERT_HOST(contains_unichar_id(unichar_id));
00634     return unichars[unichar_id].properties.other_case;
00635   }
00636 
00637   // Returns the direction property of the given unichar.
00638   Direction get_direction(UNICHAR_ID unichar_id) const {
00639      if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
00640      ASSERT_HOST(contains_unichar_id(unichar_id));
00641      return unichars[unichar_id].properties.direction;
00642    }
00643 
00644   // Get mirror unichar id in the properties for the given unichar id.
00645   UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
00646     if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00647     ASSERT_HOST(contains_unichar_id(unichar_id));
00648     return unichars[unichar_id].properties.mirror;
00649   }
00650 
00651   // Returns UNICHAR_ID of the corresponding lower-case unichar.
00652   UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
00653     if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00654     ASSERT_HOST(contains_unichar_id(unichar_id));
00655     if (unichars[unichar_id].properties.islower) return unichar_id;
00656     return unichars[unichar_id].properties.other_case;
00657   }
00658 
00659   // Returns UNICHAR_ID of the corresponding upper-case unichar.
00660   UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
00661     if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00662     ASSERT_HOST(contains_unichar_id(unichar_id));
00663     if (unichars[unichar_id].properties.isupper) return unichar_id;
00664     return unichars[unichar_id].properties.other_case;
00665   }
00666 
00667   // Returns true if this UNICHARSET has the special codes in
00668   // SpecialUnicharCodes available. If false then there are normal unichars
00669   // at these codes and they should not be used.
00670   bool has_special_codes() const {
          // With no fragment stored at UNICHAR_BROKEN the answer is already
          // false and the string comparison below is never evaluated.
00671     if (get_fragment(UNICHAR_BROKEN) == NULL) return false;
          const char* const stored = id_to_unichar(UNICHAR_BROKEN);
00672     return strcmp(stored,
00673                   kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
00674   }
00675 
00676   // Returns true if there are any repeated unicodes in the normalized
00677   // text of any unichar-id in the unicharset.
00678   bool AnyRepeatedUnicodes() const;  // declaration only; no inline body in this header
00679 
00680   // Return a pointer to the CHAR_FRAGMENT class if the given
00681   // unichar id represents a character fragment.
00682   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
          // The invalid id never denotes a fragment.
00683     if (unichar_id == INVALID_UNICHAR_ID) {
            return NULL;
          }
00684     ASSERT_HOST(contains_unichar_id(unichar_id));
00685     return unichars[unichar_id].properties.fragment;
00686   }
00687 
00688   // Return the isalpha property of the given unichar representation.
00689   bool get_isalpha(const char* const unichar_repr) const {
          // Resolve the representation to an id, then query by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr);
00690     return get_isalpha(id);
00691   }
00692 
00693   // Return the islower property of the given unichar representation.
00694   bool get_islower(const char* const unichar_repr) const {
          // Resolve the representation to an id, then query by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr);
00695     return get_islower(id);
00696   }
00697 
00698   // Return the isupper property of the given unichar representation.
00699   bool get_isupper(const char* const unichar_repr) const {
          // Resolve the representation to an id, then query by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr);
00700     return get_isupper(id);
00701   }
00702 
00703   // Return the isdigit property of the given unichar representation.
00704   bool get_isdigit(const char* const unichar_repr) const {
          // Resolve the representation to an id, then query by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr);
00705     return get_isdigit(id);
00706   }
00707 
00708   // Return the ispunctuation property of the given unichar representation.
00709   bool get_ispunctuation(const char* const unichar_repr) const {
          // Resolve the representation to an id, then query by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr);
00710     return get_ispunctuation(id);
00711   }
00712 
00713   // Return the character properties, eg. alpha/upper/lower/digit/punct,
00714   // of the given unichar representation
00715   unsigned int get_properties(const char* const unichar_repr) const {
          // Resolve the representation to an id, then query by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr);
00716     return get_properties(id);
00717   }
00718 
00719   char get_chartype(const char* const unichar_repr) const {
          // Forwards to the get_chartype overload that takes the id obtained
          // for this representation.
          const UNICHAR_ID id = unichar_to_id(unichar_repr);
00720     return get_chartype(id);
00721   }
00722 
00723   // Return the script id of the given unichar representation.
00724   // The matching script name, available via get_script_from_script_id(), is
00725   // managed by unicharset and thus MUST NOT be deleted
00726   int get_script(const char* const unichar_repr) const {
          // Resolve the representation to an id, then query by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr);
00727     return get_script(id);
00728   }
00729 
00730   // Return a pointer to the CHAR_FRAGMENT class struct if the given
00731   // unichar representation represents a character fragment.
00732   const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
          // Null, empty and unknown representations all yield NULL, so the id
          // lookup at the end is reached only for strings that ids contains.
00733     if (unichar_repr == NULL) return NULL;
          if (unichar_repr[0] == '\0') return NULL;
00734     if (!ids.contains(unichar_repr)) return NULL;
00737     return get_fragment(unichar_to_id(unichar_repr));
00738   }
00739 
00740   // Return the isalpha property of the given unichar representation.
00741   // Only the first length characters from unichar_repr are used.
00742   bool get_isalpha(const char* const unichar_repr, int length) const {
          // The length-limited lookup yields the id; the property is then
          // queried by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
00744     return get_isalpha(id);
00745   }
00746 
00747   // Return the islower property of the given unichar representation.
00748   // Only the first length characters from unichar_repr are used.
00749   bool get_islower(const char* const unichar_repr, int length) const {
          // The length-limited lookup yields the id; the property is then
          // queried by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
00751     return get_islower(id);
00752   }
00753 
00754   // Return the isupper property of the given unichar representation.
00755   // Only the first length characters from unichar_repr are used.
00756   bool get_isupper(const char* const unichar_repr, int length) const {
          // The length-limited lookup yields the id; the property is then
          // queried by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
00758     return get_isupper(id);
00759   }
00760 
00761   // Return the isdigit property of the given unichar representation.
00762   // Only the first length characters from unichar_repr are used.
00763   bool get_isdigit(const char* const unichar_repr, int length) const {
          // The length-limited lookup yields the id; the property is then
          // queried by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
00765     return get_isdigit(id);
00766   }
00767 
00768   // Return the ispunctuation property of the given unichar representation.
00769   // Only the first length characters from unichar_repr are used.
00770   bool get_ispunctuation(const char* const unichar_repr, int length) const {
          // The length-limited lookup yields the id; the property is then
          // queried by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
00772     return get_ispunctuation(id);
00773   }
00774 
00775   // Returns normalized version of unichar with the given unichar_id.
00776   const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
          // When the special codes are present, UNICHAR_SPACE normalizes to a
          // literal space instead of its stored string.
00777     if (unichar_id == UNICHAR_SPACE && has_special_codes()) return " ";
          // Reject ids that are not in this unicharset instead of indexing
          // unichars out of bounds, matching the id-based getters of this class
          // that assert membership before reading a slot.
          ASSERT_HOST(contains_unichar_id(unichar_id));
          // The returned pointer refers to the string stored in the slot.
00778     return unichars[unichar_id].properties.normed.string();
00779   }
00780   // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
00781   // version of the given id. There may be more than one UNICHAR_ID in the
00782   // vector if unichar_id represents a ligature.
00783   const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
          // There is no fallback vector that could be returned by reference, so
          // an id that is not in this unicharset is rejected before unichars is
          // indexed, rather than reading outside the array.
          ASSERT_HOST(contains_unichar_id(unichar_id));
          // The reference aliases the vector stored in the unichar's slot.
00784     return unichars[unichar_id].properties.normed_ids;
00785   }
00786 
00787   // Return the script id of the given unichar representation.
00788   // Only the first length characters from unichar_repr are used.
00789   // The matching script name, available via get_script_from_script_id(), is
00790   // managed by unicharset and thus MUST NOT be deleted
00791   int get_script(const char* const unichar_repr, int length) const {
          // The length-limited lookup yields the id; the script is then queried
          // by that id.
          const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
00793     return get_script(id);
00794   }
00795 
00796   // Return the (current) number of scripts in the script table
00797   int get_script_table_size() const {
          // Reports the used count, not script_table_size_reserved.
00798     return script_table_size_used;
00799   }
00800 
00801   // Return the script string from its id
00802   const char* get_script_from_script_id(int id) const {
          // Ids outside [0, script_table_size_used) resolve to the null script.
00803     const bool in_table = 0 <= id && id < script_table_size_used;
00805     return in_table ? script_table[id] : null_script;
00806   }
00807 
00808   // Returns the id from the name of the script, or 0 if script is not found.
00809   // Note that this is an expensive operation since it involves iteratively
00810   // comparing strings in the script table.  To avoid dependency on STL, we
00811   // won't use a hash.  Instead, the calling function can use this to lookup
00812   // and save the ID for relevant scripts for fast comparisons later.
00813   int get_script_id_from_name(const char* script_name) const;  // declaration only; no inline body in this header
00814 
00815   // Return true if the given script is the null script
00816   bool is_null_script(const char* script) const {
          // Addresses are compared here, not the characters they point to.
00817     return null_script == script;
00818   }
00819 
00820   // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
00821   // then the returned id will be the same.
00822   // The script parameter is copied and thus can be a temporary.
00823   int add_script(const char* script);  // declaration only; non-const member, no inline body in this header
00824 
00825   // Return the enabled property of the given unichar.
00826   bool get_enabled(UNICHAR_ID unichar_id) const {
          // Reject ids that are not in this unicharset instead of indexing
          // unichars out of bounds, matching the id-based getters of this class
          // that assert membership before reading a slot.
          ASSERT_HOST(contains_unichar_id(unichar_id));
00827     return unichars[unichar_id].properties.enabled;
00828   }
00829 
00830 
00831   int null_sid() const { return null_sid_; }  // each accessor in this group returns its stored *_sid_ member
00832   int common_sid() const { return common_sid_; }
00833   int latin_sid() const { return latin_sid_; }
00834   int cyrillic_sid() const { return cyrillic_sid_; }
00835   int greek_sid() const { return greek_sid_; }
00836   int han_sid() const { return han_sid_; }
00837   int hiragana_sid() const { return hiragana_sid_; }
00838   int katakana_sid() const { return katakana_sid_; }
00839   int default_sid() const { return default_sid_; }  // the most frequently occurring script in the charset
00840 
00841   // Returns true if the unicharset has the concept of upper/lower case.
00842   bool script_has_upper_lower() const {
          // Plain read of the stored flag; nothing is recomputed here.
00843     return script_has_upper_lower_;
00844   }
00845   // Returns true if the unicharset has the concept of x-height.
00846   // script_has_xheight can be true even if script_has_upper_lower is not,
00847   // when the script has a sufficiently predominant top line with ascenders,
00848   // such as Devanagari and Thai.
00849   bool script_has_xheight() const {
          // Plain read of the stored flag; nothing is recomputed here.
00850     return script_has_xheight_;
00851   }
00852 
00853  private:
00854 
00855   struct UNICHAR_PROPERTIES {
          // Per-unichar record: character-class flags, bounding-box and spacing
          // statistics, script/case/mirror links and the normalized form.
00856     UNICHAR_PROPERTIES();
00857     // Initializes all properties to sensible default values.
00858     void Init();
00859     // Sets all ranges wide open. Initialization default in case there are
00860     // no useful values available.
00861     void SetRangesOpen();
00862     // Sets all ranges to empty. Used before expanding with font-based data.
00863     void SetRangesEmpty();
00864     // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
00865     // is empty.
00866     bool AnyRangeEmpty() const;
00867     // Expands the ranges with the ranges from the src properties.
00868     void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
00869     // Copies the properties from src into this.
00870     void CopyFrom(const UNICHAR_PROPERTIES& src);
00871 
          // Character-class flags for this unichar.
00872     bool  isalpha;
00873     bool  islower;
00874     bool  isupper;
00875     bool  isdigit;
00876     bool  ispunctuation;
00877     bool  isngram;
00878     bool  enabled;
00879     // Possible limits of the top and bottom of the bounding box in
00880     // baseline-normalized coordinates, ie, where the baseline is
00881     // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
00882     // (See normalis.h for the definitions).
00883     uinT8 min_bottom;
00884     uinT8 max_bottom;
00885     uinT8 min_top;
00886     uinT8 max_top;
00887     // Statistics of the widths of bounding box, relative to the median advance.
00888     float width;
00889     float width_sd;
00890     // Stats of the x-bearing and advance, also relative to the median advance.
00891     float bearing;
00892     float bearing_sd;
00893     float advance;
00894     float advance_sd;
          // NOTE(review): presumably an index into the enclosing class's
          // script_table -- confirm against the code that assigns it.
00895     int   script_id;
00896     UNICHAR_ID other_case;  // id of the corresponding upper/lower case unichar
00897     Direction direction;  // direction of this unichar
00898     // Mirror property is useful for reverse DAWG lookup for words in
00899     // right-to-left languages (e.g. "(word)" would be in
00900     // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string).
00901     // However, what we want in our DAWG is
00902     // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
00903     // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
00904     UNICHAR_ID mirror;
00905     // A string of unichar_ids that represent the corresponding normed string.
00906     // For awkward characters like em-dash, this gives hyphen.
00907     // For ligatures, this gives the string of normal unichars.
00908     GenericVector<UNICHAR_ID> normed_ids;
00909     STRING normed;  // normalized version of this unichar
00910     // Contains meta information about the fragment if a unichar represents
00911     // a fragment of a character, otherwise should be set to NULL.
00912     // It is assumed that character fragments are added to the unicharset
00913     // after the corresponding 'base' characters.
00914     CHAR_FRAGMENT *fragment;
00915   };
00916 
00917   struct UNICHAR_SLOT {
          // One element of the unichars array: the unichar's text together with
          // its properties record.
00918     char representation[UNICHAR_LEN + 1];  // UNICHAR_LEN chars plus one more, presumably for the terminating NUL
00919     UNICHAR_PROPERTIES properties;
00920   };
00921 
00922   // Internal recursive version of encode_string above.
00923   // str is the start of the whole string.
00924   // str_index is the current position in str.
00925   // str_length is the length of str.
00926   // encoding is a working encoding of str.
00927   // lengths is a working set of lengths of each element of encoding.
00928   // best_total_length is the longest length of str that has been successfully
00929   // encoded so far.
00930   // On return:
00931   // best_encoding contains the encoding that used the longest part of str.
00932   // best_lengths (may be null) contains the lengths of best_encoding.
00933   void encode_string(const char* str, int str_index, int str_length,
00934                      GenericVector<UNICHAR_ID>* encoding,  // working encoding of str
00935                      GenericVector<char>* lengths,  // working lengths, one per element of encoding
00936                      int* best_total_length,  // longest length of str encoded so far
00937                      GenericVector<UNICHAR_ID>* best_encoding,  // on return: encoding using the longest part of str
00938                      GenericVector<char>* best_lengths) const;  // on return: lengths of best_encoding; may be null
00939 
00940   // Gets the properties for a grapheme string, combining properties for
00941   // multiple characters in a meaningful way where possible.
00942   // Returns false if no valid match was found in the unicharset.
00943   // NOTE that script_id, mirror, and other_case refer to this unicharset on
00944   // return and will need redirecting if the target unicharset is different.
00945   bool GetStrProperties(const char* utf8_str,
00946                         UNICHAR_PROPERTIES* props) const;  // NOTE(review): props looks like the output record -- confirm in the definition
00947 
00948   // Load ourselves from a "file" where our only interface to the file is
00949   // an implementation of fgets().  This is the parsing primitive accessed by
00950   // the public routines load_from_file() and load_from_inmemory_file().
00951   bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,  // the fgets()-style line reader
00952                       bool skip_fragments);  // NOTE(review): presumably omits fragment unichars while loading -- confirm in the definition
00953 
00954   UNICHAR_SLOT* unichars;  // per-unichar slots, indexed by unichar_id
00955   UNICHARMAP ids;  // queried by unichar representation (see ids.contains())
          // NOTE(review): presumably the number of slots in use vs. allocated
          // in unichars -- confirm against the code that grows the array.
00956   int size_used;
00957   int size_reserved;
00958   char** script_table;  // script strings, indexed by script id
00959   int script_table_size_used;  // script ids at or above this value are outside the table
00960   int script_table_size_reserved;
00961   const char* null_script;  // handed back for script ids outside the table
00962   // True if the unichars have their tops/bottoms set.
00963   bool top_bottom_set_;
00964   // True if the unicharset has significant upper/lower case chars.
00965   bool script_has_upper_lower_;
00966   // True if the unicharset has a significant mean-line with significant
00967   // ascenders above that.
00968   bool script_has_xheight_;
00969 
00970   // A few convenient script name-to-id mapping without using hash.
00971   // These are initialized when unicharset file is loaded.  Anything
00972   // missing from this list can be looked up using get_script_id_from_name.
00973   int null_sid_;
00974   int common_sid_;
00975   int latin_sid_;
00976   int cyrillic_sid_;
00977   int greek_sid_;
00978   int han_sid_;
00979   int hiragana_sid_;
00980   int katakana_sid_;
00981   // The most frequently occurring script in the charset.
00982   int default_sid_;
00983 };
00984 
00985 #endif  // TESSERACT_CCUTIL_UNICHARSET_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines