tesseract 3.04.01

UNICHARSET Class Reference

#include <unicharset.h>

List of all members.

Classes

struct  UNICHAR_PROPERTIES
struct  UNICHAR_SLOT

Public Types

enum  Direction {
  U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT
}

Public Member Functions

 UNICHARSET ()
 ~UNICHARSET ()
UNICHAR_ID unichar_to_id (const char *const unichar_repr) const
UNICHAR_ID unichar_to_id (const char *const unichar_repr, int length) const
int step (const char *str) const
bool encodable_string (const char *str, int *first_bad_position) const
bool encode_string (const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
const char * id_to_unichar (UNICHAR_ID id) const
const char * id_to_unichar_ext (UNICHAR_ID id) const
STRING debug_str (UNICHAR_ID id) const
STRING debug_str (const char *unichar_repr) const
void unichar_insert (const char *const unichar_repr)
bool contains_unichar_id (UNICHAR_ID unichar_id) const
bool contains_unichar (const char *const unichar_repr) const
bool contains_unichar (const char *const unichar_repr, int length) const
bool eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const
void delete_pointers_in_unichars ()
void clear ()
int size () const
void reserve (int unichars_number)
bool save_to_file (const char *const filename) const
bool save_to_file (FILE *file) const
bool save_to_file (tesseract::TFile *file) const
bool save_to_string (STRING *str) const
bool load_from_inmemory_file (const char *const memory, int mem_size, bool skip_fragments)
bool load_from_inmemory_file (const char *const memory, int mem_size)
bool load_from_file (const char *const filename, bool skip_fragments)
bool load_from_file (const char *const filename)
bool load_from_file (FILE *file, bool skip_fragments)
bool load_from_file (FILE *file)
bool load_from_file (tesseract::TFile *file, bool skip_fragments)
void post_load_setup ()
bool major_right_to_left () const
void set_black_and_whitelist (const char *blacklist, const char *whitelist, const char *unblacklist)
void set_isalpha (UNICHAR_ID unichar_id, bool value)
void set_islower (UNICHAR_ID unichar_id, bool value)
void set_isupper (UNICHAR_ID unichar_id, bool value)
void set_isdigit (UNICHAR_ID unichar_id, bool value)
void set_ispunctuation (UNICHAR_ID unichar_id, bool value)
void set_isngram (UNICHAR_ID unichar_id, bool value)
void set_script (UNICHAR_ID unichar_id, const char *value)
void set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)
void set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)
void set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)
void set_normed (UNICHAR_ID unichar_id, const char *normed)
void set_normed_ids (UNICHAR_ID unichar_id)
bool get_isalpha (UNICHAR_ID unichar_id) const
bool get_islower (UNICHAR_ID unichar_id) const
bool get_isupper (UNICHAR_ID unichar_id) const
bool get_isdigit (UNICHAR_ID unichar_id) const
bool get_ispunctuation (UNICHAR_ID unichar_id) const
bool get_isngram (UNICHAR_ID unichar_id) const
bool get_isprivate (UNICHAR_ID unichar_id) const
bool top_bottom_useful () const
void set_ranges_empty ()
void SetPropertiesFromOther (const UNICHARSET &src)
void PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src)
void ExpandRangesFromOther (const UNICHARSET &src)
void CopyFrom (const UNICHARSET &src)
void AppendOtherUnicharset (const UNICHARSET &src)
bool SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const
void get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
void set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
void get_width_stats (UNICHAR_ID unichar_id, float *width, float *width_sd) const
void set_width_stats (UNICHAR_ID unichar_id, float width, float width_sd)
void get_bearing_stats (UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
void set_bearing_stats (UNICHAR_ID unichar_id, float bearing, float bearing_sd)
void get_advance_stats (UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
void set_advance_stats (UNICHAR_ID unichar_id, float advance, float advance_sd)
bool PropertiesIncomplete (UNICHAR_ID unichar_id) const
int get_script (UNICHAR_ID unichar_id) const
unsigned int get_properties (UNICHAR_ID unichar_id) const
char get_chartype (UNICHAR_ID unichar_id) const
UNICHAR_ID get_other_case (UNICHAR_ID unichar_id) const
Direction get_direction (UNICHAR_ID unichar_id) const
UNICHAR_ID get_mirror (UNICHAR_ID unichar_id) const
UNICHAR_ID to_lower (UNICHAR_ID unichar_id) const
UNICHAR_ID to_upper (UNICHAR_ID unichar_id) const
bool has_special_codes () const
bool AnyRepeatedUnicodes () const
const CHAR_FRAGMENT * get_fragment (UNICHAR_ID unichar_id) const
bool get_isalpha (const char *const unichar_repr) const
bool get_islower (const char *const unichar_repr) const
bool get_isupper (const char *const unichar_repr) const
bool get_isdigit (const char *const unichar_repr) const
bool get_ispunctuation (const char *const unichar_repr) const
unsigned int get_properties (const char *const unichar_repr) const
char get_chartype (const char *const unichar_repr) const
int get_script (const char *const unichar_repr) const
const CHAR_FRAGMENT * get_fragment (const char *const unichar_repr) const
bool get_isalpha (const char *const unichar_repr, int length) const
bool get_islower (const char *const unichar_repr, int length) const
bool get_isupper (const char *const unichar_repr, int length) const
bool get_isdigit (const char *const unichar_repr, int length) const
bool get_ispunctuation (const char *const unichar_repr, int length) const
const char * get_normed_unichar (UNICHAR_ID unichar_id) const
const GenericVector< UNICHAR_ID > & normed_ids (UNICHAR_ID unichar_id) const
int get_script (const char *const unichar_repr, int length) const
int get_script_table_size () const
const char * get_script_from_script_id (int id) const
int get_script_id_from_name (const char *script_name) const
bool is_null_script (const char *script) const
int add_script (const char *script)
bool get_enabled (UNICHAR_ID unichar_id) const
int null_sid () const
int common_sid () const
int latin_sid () const
int cyrillic_sid () const
int greek_sid () const
int han_sid () const
int hiragana_sid () const
int katakana_sid () const
int default_sid () const
bool script_has_upper_lower () const
bool script_has_xheight () const

Static Public Member Functions

static STRING debug_utf8_str (const char *str)

Static Public Attributes

static const char * kCustomLigatures [][2]
static const char * kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT]

Detailed Description

Definition at line 139 of file unicharset.h.


Member Enumeration Documentation

Enumerator:
U_LEFT_TO_RIGHT 
U_RIGHT_TO_LEFT 
U_EUROPEAN_NUMBER 
U_EUROPEAN_NUMBER_SEPARATOR 
U_EUROPEAN_NUMBER_TERMINATOR 
U_ARABIC_NUMBER 
U_COMMON_NUMBER_SEPARATOR 
U_BLOCK_SEPARATOR 
U_SEGMENT_SEPARATOR 
U_WHITE_SPACE_NEUTRAL 
U_OTHER_NEUTRAL 
U_LEFT_TO_RIGHT_EMBEDDING 
U_LEFT_TO_RIGHT_OVERRIDE 
U_RIGHT_TO_LEFT_ARABIC 
U_RIGHT_TO_LEFT_EMBEDDING 
U_RIGHT_TO_LEFT_OVERRIDE 
U_POP_DIRECTIONAL_FORMAT 
U_DIR_NON_SPACING_MARK 
U_BOUNDARY_NEUTRAL 
U_CHAR_DIRECTION_COUNT 

Definition at line 150 of file unicharset.h.


Constructor & Destructor Documentation

UNICHARSET::UNICHARSET ( )

Definition at line 159 of file unicharset.cpp.

                       :
    // Both tables start unallocated and both used-counts start at zero.
    unichars(NULL),
    ids(),
    size_used(0),
    size_reserved(0),
    script_table(NULL),
    script_table_size_used(0),
    null_script("NULL") {
  // clear() also assigns members the initializer list does not mention,
  // such as script_table_size_reserved, the boolean flags and the *_sid_ ids.
  clear();
  // Insert every entry of kSpecialUnicharCodes, in table order.
  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
    unichar_insert(kSpecialUnicharCodes[i]);
    // The UNICHAR_JOINED entry is additionally flagged as an ngram.
    if (i == UNICHAR_JOINED)
      set_isngram(i, true);
  }
}
UNICHARSET::~UNICHARSET ( )

Definition at line 175 of file unicharset.cpp.

                        {
  clear();
}

Member Function Documentation

int UNICHARSET::add_script ( const char *  script)

Definition at line 1002 of file unicharset.cpp.

                                             {
  // Reuse the existing id when this script name is already in the table.
  for (int sid = 0; sid < script_table_size_used; ++sid) {
    if (strcmp(script, script_table[sid]) == 0)
      return sid;
  }
  // First insertion: allocate the initial table of 8 slots.
  if (script_table_size_reserved == 0) {
    script_table_size_reserved = 8;
    script_table = new char*[script_table_size_reserved];
  }
  // Double the table once the next insertion would reach its capacity.
  if (script_table_size_used + 1 >= script_table_size_reserved) {
    const int grown_size = script_table_size_reserved * 2;
    char** grown_table = new char*[grown_size];
    memcpy(grown_table, script_table, script_table_size_reserved * sizeof(char*));
    delete[] script_table;
    script_table = grown_table;
    script_table_size_reserved = grown_size;
  }
  // Store a private copy of the name; its slot index is the returned id.
  const int new_sid = script_table_size_used;
  char* name_copy = new char[strlen(script) + 1];
  strcpy(name_copy, script);
  script_table[new_sid] = name_copy;
  ++script_table_size_used;
  return new_sid;
}
bool UNICHARSET::AnyRepeatedUnicodes ( ) const

Definition at line 986 of file unicharset.cpp.

                                           {
  // Skip the special codes when the set has them.
  const int first_id = has_special_codes() ? SPECIAL_UNICHAR_CODES_COUNT : 0;
  for (int id = first_id; id < size_used; ++id) {
    // Convert the normalized form to unicodes; ids that fail to convert
    // are ignored.
    GenericVector<int> codepoints;
    if (!UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &codepoints))
      continue;
    // Report the first pair of equal neighbouring unicodes.
    for (int pos = 1; pos < codepoints.size(); ++pos) {
      if (codepoints[pos] == codepoints[pos - 1]) return true;
    }
  }
  return false;
}
void UNICHARSET::AppendOtherUnicharset ( const UNICHARSET &  src)

Definition at line 439 of file unicharset.cpp.

                                                            {
  // Size of this set before anything from src is appended.
  const int first_new_id = size_used;
  for (int src_id = 0; src_id < src.size_used; ++src_id) {
    const UNICHAR_PROPERTIES& incoming = src.unichars[src_id].properties;
    const char* utf8 = src.id_to_unichar(src_id);
    const bool is_special = src_id < SPECIAL_UNICHAR_CODES_COUNT;
    if (!is_special && incoming.AnyRangeEmpty()) {
      // Only use fully valid entries: report the bad one and skip it.
      tprintf("Bad properties for index %d, char %s: "
              "%d,%d %d,%d %g,%g %g,%g %g,%g\n",
              src_id, utf8, incoming.min_bottom, incoming.max_bottom,
              incoming.min_top, incoming.max_top,
              incoming.width, incoming.width_sd,
              incoming.bearing, incoming.bearing_sd,
              incoming.advance, incoming.advance_sd);
      continue;
    }
    if (!contains_unichar(utf8)) {
      // New entry: it lands at the current end of the set, with its ranges
      // reset to empty.
      const int appended_id = size_used;
      unichar_insert(utf8);
      unichars[appended_id].properties.SetRangesEmpty();
    } else {
      // Already present: just expand the current ranges.
      const int existing_id = unichar_to_id(utf8);
      unichars[existing_id].properties.ExpandRangesFrom(incoming);
    }
  }
  // Set properties, including mirror and other_case, WITHOUT reordering
  // the unicharset.
  PartialSetPropertiesFromOther(first_new_id, src);
}
void UNICHARSET::clear ( ) [inline]

Definition at line 266 of file unicharset.h.

               {
    // Release the script-name table together with every name string in it.
    if (script_table != NULL) {
      for (int i = 0; i < script_table_size_used; ++i)
        delete[] script_table[i];
      delete[] script_table;
      script_table = NULL;
      script_table_size_used = 0;
    }
    // Release the unichar slots, deleting any fragment pointers first.
    if (unichars != NULL) {
      delete_pointers_in_unichars();
      delete[] unichars;
      unichars = NULL;
    }
    // Reset all bookkeeping to the empty state.
    script_table_size_reserved = 0;
    size_reserved = 0;
    size_used = 0;
    ids.clear();
    top_bottom_set_ = false;
    script_has_upper_lower_ = false;
    script_has_xheight_ = false;
    null_sid_ = 0;
    common_sid_ = 0;
    latin_sid_ = 0;
    cyrillic_sid_ = 0;
    greek_sid_ = 0;
    han_sid_ = 0;
    hiragana_sid_ = 0;
    katakana_sid_ = 0;
    // default_sid() returns this member; reset it with the other script ids
    // so it never holds a stale or uninitialized value after clear().
    default_sid_ = 0;
  }
int UNICHARSET::common_sid ( ) const [inline]

Definition at line 832 of file unicharset.h.

{ return common_sid_; }
bool UNICHARSET::contains_unichar ( const char *const  unichar_repr) const

Definition at line 644 of file unicharset.cpp.

                                                                      {
  return ids.contains(unichar_repr);
}
bool UNICHARSET::contains_unichar ( const char *const  unichar_repr,
int  length 
) const

Definition at line 648 of file unicharset.cpp.

                                                    {
  // A zero-length representation is never contained; otherwise defer to
  // the ids lookup.
  return length != 0 && ids.contains(unichar_repr, length);
}
bool UNICHARSET::contains_unichar_id ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 242 of file unicharset.h.

                                                        {
    return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
        unichar_id >= 0;
  }
void UNICHARSET::CopyFrom ( const UNICHARSET &  src)

Definition at line 423 of file unicharset.cpp.

                                               {
  // Start from an empty set, then re-insert every unichar of src in id
  // order so that ids match between the two sets.
  clear();
  for (int id = 0; id < src.size_used; ++id) {
    unichar_insert(src.id_to_unichar(id));
    unichars[id].properties.ExpandRangesFrom(src.unichars[id].properties);
  }
  // Set properties, including mirror and other_case, WITHOUT reordering
  // the unicharset.
  PartialSetPropertiesFromOther(0, src);
}
int UNICHARSET::cyrillic_sid ( ) const [inline]

Definition at line 834 of file unicharset.h.

{ return cyrillic_sid_; }
STRING UNICHARSET::debug_str ( const char *  unichar_repr) const [inline]

Definition at line 233 of file unicharset.h.

                                                    {
    return debug_str(unichar_to_id(unichar_repr));
  }
STRING UNICHARSET::debug_str ( UNICHAR_ID  id) const

Definition at line 318 of file unicharset.cpp.

                                                {
  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
  // Fragments provide their own string form.
  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
  if (fragment != NULL) return fragment->to_string();
  STRING text = debug_utf8_str(id_to_unichar(id));
  // Alpha tag: a for lower, A for upper, x if alpha but neither.
  if (get_isalpha(id)) {
    if (get_islower(id)) {
      text += "a";
    } else if (get_isupper(id)) {
      text += "A";
    } else {
      text += "x";
    }
  }
  // Digit tag.
  if (get_isdigit(id)) text += "0";
  // Punctuation tag.
  if (get_ispunctuation(id)) text += "p";
  return text;
}
STRING UNICHARSET::debug_utf8_str ( const char *  str) [static]

Definition at line 294 of file unicharset.cpp.

                                                 {
  // Builds "<str> [<hex> <hex> ... ]": the input text followed by one hex
  // code per UTF-8 step of the input.
  STRING result = str;
  result += " [";
  int step = 1;
  // Chop into unicodes and code each as hex.
  for (int i = 0; str[i] != '\0'; i += step) {
    // Large enough for the hex digits of one int plus the terminator.
    char hex[sizeof(int) * 2 + 1];
    step = UNICHAR::utf8_step(str + i);
    if (step == 0) {
      // No usable step was reported for this byte: advance one byte and
      // print its raw value. Convert through unsigned char so that bytes
      // >= 0x80 print as two hex digits instead of a sign-extended
      // "ffffffXX" on platforms where char is signed.
      step = 1;
      sprintf(hex, "%x", static_cast<unsigned char>(str[i]));
    } else {
      UNICHAR ch(str + i, step);
      sprintf(hex, "%x", ch.first_uni());
    }
    result += hex;
    result += " ";
  }
  result += "]";
  return result;
}
int UNICHARSET::default_sid ( ) const [inline]

Definition at line 839 of file unicharset.h.

{ return default_sid_; }
void UNICHARSET::delete_pointers_in_unichars ( ) [inline]

Definition at line 256 of file unicharset.h.

                                     {
    // Free the fragment owned by each used slot and leave the pointer
    // null. delete on a null pointer is a no-op, so no check is needed.
    for (int slot = 0; slot < size_used; ++slot) {
      delete unichars[slot].properties.fragment;
      unichars[slot].properties.fragment = NULL;
    }
  }
bool UNICHARSET::encodable_string ( const char *  str,
int *  first_bad_position 
) const

Definition at line 222 of file unicharset.cpp.

                                                                 {
  GenericVector<UNICHAR_ID> encoding;
  return encode_string(str, true, &encoding, NULL, first_bad_position);
}
bool UNICHARSET::encode_string ( const char *  str,
bool  give_up_on_failure,
GenericVector< UNICHAR_ID > *  encoding,
GenericVector< char > *  lengths,
int *  encoded_length 
) const

Definition at line 234 of file unicharset.cpp.

                                                          {
  // Encodes str into *encoding, optionally reporting per-id byte lengths
  // and the number of bytes consumed. Returns true only when no non-match
  // was hit. encoding must be non-null (it is dereferenced unconditionally);
  // lengths and encoded_length may be NULL.
  GenericVector<UNICHAR_ID> working_encoding;
  GenericVector<char> working_lengths;
  GenericVector<char> best_lengths;
  encoding->truncate(0);  // Just in case str is empty.
  int str_length = strlen(str);
  int str_pos = 0;
  bool perfect = true;
  while (str_pos < str_length) {
    // The multi-argument overload receives &str_pos, encoding and
    // &best_lengths as outputs; presumably it stores the best encoding found
    // from the current position -- confirm against that overload.
    encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
                  &str_pos, encoding, &best_lengths);
    if (str_pos < str_length) {
      // This is a non-match. Skip one utf-8 character.
      perfect = false;
      if (give_up_on_failure) break;
      int step = UNICHAR::utf8_step(str + str_pos);
      // Always advance by at least one byte so the loop makes progress.
      if (step == 0) step = 1;
      // Record the skipped bytes as a single INVALID_UNICHAR_ID entry.
      encoding->push_back(INVALID_UNICHAR_ID);
      best_lengths.push_back(step);
      str_pos += step;
      // Restart the working state from everything accepted so far.
      working_encoding = *encoding;
      working_lengths = best_lengths;
    }
  }
  // Optional outputs.
  if (lengths != NULL) *lengths = best_lengths;
  if (encoded_length != NULL) *encoded_length = str_pos;
  return perfect;
}
bool UNICHARSET::eq ( UNICHAR_ID  unichar_id,
const char *const  unichar_repr 
) const

Definition at line 656 of file unicharset.cpp.

                                                          {
  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
}
void UNICHARSET::ExpandRangesFromOther ( const UNICHARSET &  src)

Definition at line 410 of file unicharset.cpp.

                                                            {
  // For each unichar of this set, look its string up in src and, when src
  // supplies properties for it, expand just the ranges from them.
  for (int id = 0; id < size_used; ++id) {
    const char* utf8 = id_to_unichar(id);
    UNICHAR_PROPERTIES src_props;
    if (!src.GetStrProperties(utf8, &src_props))
      continue;
    unichars[id].properties.ExpandRangesFrom(src_props);
  }
}
void UNICHARSET::get_advance_stats ( UNICHAR_ID  unichar_id,
float *  advance,
float *  advance_sd 
) const [inline]

Definition at line 588 of file unicharset.h.

                                                                  {
    // Writes the stored advance and advance_sd of unichar_id to the
    // outputs; INVALID_UNICHAR_ID yields zero for both.
    if (unichar_id == INVALID_UNICHAR_ID) {
      *advance_sd = 0;
      *advance = 0;
      return;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
    *advance = props.advance;
    *advance_sd = props.advance_sd;
  }
void UNICHARSET::get_bearing_stats ( UNICHAR_ID  unichar_id,
float *  bearing,
float *  bearing_sd 
) const [inline]

Definition at line 571 of file unicharset.h.

                                                                  {
    // Writes the stored bearing and bearing_sd of unichar_id to the
    // outputs; INVALID_UNICHAR_ID yields zero for both.
    if (unichar_id == INVALID_UNICHAR_ID) {
      *bearing_sd = 0.0f;
      *bearing = 0.0f;
      return;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
    *bearing = props.bearing;
    *bearing_sd = props.bearing_sd;
  }
char UNICHARSET::get_chartype ( UNICHAR_ID  unichar_id) const

Definition at line 603 of file unicharset.cpp.

                                                 {
  // One-character class tag; the first matching test wins, in this order:
  // upper 'A', lower 'a', other alpha 'x', digit '0', punctuation 'p'.
  // 0 means none of them matched.
  char tag = 0;
  if (get_isupper(id)) {
    tag = 'A';
  } else if (get_islower(id)) {
    tag = 'a';
  } else if (get_isalpha(id)) {
    tag = 'x';
  } else if (get_isdigit(id)) {
    tag = '0';
  } else if (get_ispunctuation(id)) {
    tag = 'p';
  }
  return tag;
}
char UNICHARSET::get_chartype ( const char *const  unichar_repr) const [inline]

Definition at line 719 of file unicharset.h.

                                                          {
    return get_chartype(unichar_to_id(unichar_repr));
  }
Direction UNICHARSET::get_direction ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 638 of file unicharset.h.

                                                       {
     if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
     ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.direction;
   }
bool UNICHARSET::get_enabled ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 826 of file unicharset.h.

                                                {
    // Returns the enabled flag stored for unichar_id.
    // Assert the id is in range before indexing, as the other per-id
    // getters of this class do, instead of reading outside the array.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.enabled;
  }
const CHAR_FRAGMENT* UNICHARSET::get_fragment ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 682 of file unicharset.h.

                                                                 {
    if (INVALID_UNICHAR_ID == unichar_id) return NULL;
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.fragment;
  }
const CHAR_FRAGMENT* UNICHARSET::get_fragment ( const char *const  unichar_repr) const [inline]

Definition at line 732 of file unicharset.h.

                                                                          {
    // Only a non-null, non-empty representation that is present in ids can
    // have a fragment; everything else yields NULL.
    if (unichar_repr != NULL && unichar_repr[0] != '\0' &&
        ids.contains(unichar_repr)) {
      return get_fragment(unichar_to_id(unichar_repr));
    }
    return NULL;
  }
bool UNICHARSET::get_isalpha ( const char *const  unichar_repr) const [inline]

Definition at line 689 of file unicharset.h.

                                                         {
    return get_isalpha(unichar_to_id(unichar_repr));
  }
bool UNICHARSET::get_isalpha ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 449 of file unicharset.h.

                                                {
    if (INVALID_UNICHAR_ID == unichar_id) return false;
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.isalpha;
  }
bool UNICHARSET::get_isalpha ( const char *const  unichar_repr,
int  length 
) const [inline]

Definition at line 742 of file unicharset.h.

                                 {
    return get_isalpha(unichar_to_id(unichar_repr, length));
  }
bool UNICHARSET::get_isdigit ( const char *const  unichar_repr) const [inline]

Definition at line 704 of file unicharset.h.

                                                         {
    return get_isdigit(unichar_to_id(unichar_repr));
  }
bool UNICHARSET::get_isdigit ( const char *const  unichar_repr,
int  length 
) const [inline]

Definition at line 763 of file unicharset.h.

                                 {
    return get_isdigit(unichar_to_id(unichar_repr, length));
  }
bool UNICHARSET::get_isdigit ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 470 of file unicharset.h.

                                                {
    if (INVALID_UNICHAR_ID == unichar_id) return false;
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.isdigit;
  }
bool UNICHARSET::get_islower ( const char *const  unichar_repr) const [inline]

Definition at line 694 of file unicharset.h.

                                                         {
    return get_islower(unichar_to_id(unichar_repr));
  }
bool UNICHARSET::get_islower ( const char *const  unichar_repr,
int  length 
) const [inline]

Definition at line 749 of file unicharset.h.

                                 {
    return get_islower(unichar_to_id(unichar_repr, length));
  }
bool UNICHARSET::get_islower ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 456 of file unicharset.h.

                                                {
    if (INVALID_UNICHAR_ID == unichar_id) return false;
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.islower;
  }
bool UNICHARSET::get_isngram ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 484 of file unicharset.h.

                                                {
    if (INVALID_UNICHAR_ID == unichar_id) return false;
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.isngram;
  }
bool UNICHARSET::get_isprivate ( UNICHAR_ID  unichar_id) const

Definition at line 363 of file unicharset.cpp.

                                                          {
  // True when the first unicode of the unichar lies in 0xE000..0xF8FF
  // inclusive.
  UNICHAR uc(id_to_unichar(unichar_id), -1);
  const int first_code = uc.first_uni();
  if (first_code < 0xE000) return false;
  return first_code <= 0xF8FF;
}
bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr) const [inline]

Definition at line 709 of file unicharset.h.

                                                               {
    return get_ispunctuation(unichar_to_id(unichar_repr));
  }
bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr,
int  length 
) const [inline]

Definition at line 770 of file unicharset.h.

                                            {
    return get_ispunctuation(unichar_to_id(unichar_repr, length));
  }
bool UNICHARSET::get_ispunctuation ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 477 of file unicharset.h.

                                                      {
    if (INVALID_UNICHAR_ID == unichar_id) return false;
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.ispunctuation;
  }
bool UNICHARSET::get_isupper ( const char *const  unichar_repr,
int  length 
) const [inline]

Definition at line 756 of file unicharset.h.

                                 {
    return get_isupper(unichar_to_id(unichar_repr, length));
  }
bool UNICHARSET::get_isupper ( const char *const  unichar_repr) const [inline]

Definition at line 699 of file unicharset.h.

                                                         {
    return get_isupper(unichar_to_id(unichar_repr));
  }
bool UNICHARSET::get_isupper ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 463 of file unicharset.h.

                                                {
    if (INVALID_UNICHAR_ID == unichar_id) return false;
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.isupper;
  }
UNICHAR_ID UNICHARSET::get_mirror ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 645 of file unicharset.h.

                                                     {
    if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.mirror;
  }
const char* UNICHARSET::get_normed_unichar ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 776 of file unicharset.h.

                                                              {
    if (unichar_id == UNICHAR_SPACE && has_special_codes()) return " ";
    return unichars[unichar_id].properties.normed.string();
  }
UNICHAR_ID UNICHARSET::get_other_case ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 631 of file unicharset.h.

                                                         {
    if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.other_case;
  }
unsigned int UNICHARSET::get_properties ( UNICHAR_ID  unichar_id) const

Definition at line 588 of file unicharset.cpp.

                                                           {
  // OR together one mask bit per character-class test that holds for id.
  unsigned int mask = 0;
  if (get_isalpha(id)) mask |= ISALPHA_MASK;
  if (get_islower(id)) mask |= ISLOWER_MASK;
  if (get_isupper(id)) mask |= ISUPPER_MASK;
  if (get_isdigit(id)) mask |= ISDIGIT_MASK;
  if (get_ispunctuation(id)) mask |= ISPUNCTUATION_MASK;
  return mask;
}
unsigned int UNICHARSET::get_properties ( const char *const  unichar_repr) const [inline]

Definition at line 715 of file unicharset.h.

                                                                    {
    return get_properties(unichar_to_id(unichar_repr));
  }
int UNICHARSET::get_script ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 611 of file unicharset.h.

                                              {
    if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
    ASSERT_HOST(contains_unichar_id(unichar_id));
    return unichars[unichar_id].properties.script_id;
  }
int UNICHARSET::get_script ( const char *const  unichar_repr) const [inline]

Definition at line 726 of file unicharset.h.

                                                       {
    return get_script(unichar_to_id(unichar_repr));
  }
int UNICHARSET::get_script ( const char *const  unichar_repr,
int  length 
) const [inline]

Definition at line 791 of file unicharset.h.

                                   {
    return get_script(unichar_to_id(unichar_repr, length));
  }
const char* UNICHARSET::get_script_from_script_id ( int  id) const [inline]

Definition at line 802 of file unicharset.h.

                                                      {
    // Ids inside the used part of the table map to their stored name;
    // anything else maps to null_script.
    if (id >= 0 && id < script_table_size_used)
      return script_table[id];
    return null_script;
  }
int UNICHARSET::get_script_id_from_name ( const char *  script_name) const

Definition at line 1080 of file unicharset.cpp.

                                                                     {
  for (int i = 0; i < script_table_size_used; ++i) {
    if (strcmp(script_name, script_table[i]) == 0)
      return i;
  }
  return 0;  // 0 is always the null_script
}
int UNICHARSET::get_script_table_size ( ) const [inline]

Definition at line 797 of file unicharset.h.

                                    {
    return script_table_size_used;
  }
void UNICHARSET::get_top_bottom ( UNICHAR_ID  unichar_id,
int *  min_bottom,
int *  max_bottom,
int *  min_top,
int *  max_top 
) const [inline]

Definition at line 526 of file unicharset.h.

                                                        {
    // Writes the stored bottom/top ranges of unichar_id to the outputs;
    // INVALID_UNICHAR_ID yields the range 0..256 for both.
    if (unichar_id == INVALID_UNICHAR_ID) {
      *min_top = 0;
      *min_bottom = 0;
      *max_top = 256;  // kBlnCellHeight
      *max_bottom = 256;  // kBlnCellHeight
      return;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
    *min_bottom = props.min_bottom;
    *max_bottom = props.max_bottom;
    *min_top = props.min_top;
    *max_top = props.max_top;
  }
void UNICHARSET::get_width_stats ( UNICHAR_ID  unichar_id,
float *  width,
float *  width_sd 
) const [inline]

Definition at line 554 of file unicharset.h.

                                                            {
    // Writes the stored width and width_sd of unichar_id to the outputs;
    // INVALID_UNICHAR_ID yields zero for both.
    if (unichar_id == INVALID_UNICHAR_ID) {
      *width = 0.0f;
      *width_sd = 0.0f;
      return;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
    *width = props.width;
    *width_sd = props.width_sd;
  }
int UNICHARSET::greek_sid ( ) const [inline]

Definition at line 835 of file unicharset.h.

{ return greek_sid_; }
int UNICHARSET::han_sid ( ) const [inline]

Definition at line 836 of file unicharset.h.

{ return han_sid_; }
bool UNICHARSET::has_special_codes ( ) const [inline]

Definition at line 670 of file unicharset.h.

int UNICHARSET::hiragana_sid ( ) const [inline]

Definition at line 837 of file unicharset.h.

{ return hiragana_sid_; }
const char * UNICHARSET::id_to_unichar ( UNICHAR_ID  id) const

Definition at line 266 of file unicharset.cpp.

                                                         {
  if (id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR;
  }
  ASSERT_HOST(id < this->size());
  return unichars[id].representation;
}
const char * UNICHARSET::id_to_unichar_ext ( UNICHAR_ID  id) const

Definition at line 274 of file unicharset.cpp.

                                                             {
  if (id == INVALID_UNICHAR_ID) return INVALID_UNICHAR;
  ASSERT_HOST(id < this->size());
  // Anything that is not a private encoding is returned as stored.
  if (!get_isprivate(id)) return unichars[id].representation;
  // Private encoding: return column 0 of the kCustomLigatures row whose
  // column 1 equals the stored string, if there is one.
  const char* stored = id_to_unichar(id);
  for (int row = 0; kCustomLigatures[row][0] != NULL; ++row) {
    if (strcmp(stored, kCustomLigatures[row][1]) == 0)
      return kCustomLigatures[row][0];
  }
  // No table entry matched: return the stored representation.
  return unichars[id].representation;
}
bool UNICHARSET::is_null_script ( const char *  script) const [inline]

Definition at line 816 of file unicharset.h.

                                                {
    // True when script compares equal to null_script using ==.
    // NOTE(review): if null_script is a plain const char*, this is a pointer
    // identity test rather than a string comparison -- confirm against the
    // declaration of null_script.
    return script == null_script;
  }
int UNICHARSET::katakana_sid ( ) const [inline]

Definition at line 838 of file unicharset.h.

// Returns katakana_sid_, which post_load_setup() assigns from
// get_script_id_from_name("Katakana").
{ return katakana_sid_; }
int UNICHARSET::latin_sid ( ) const [inline]

Definition at line 833 of file unicharset.h.

// Returns latin_sid_, which post_load_setup() assigns from
// get_script_id_from_name("Latin").
{ return latin_sid_; }
bool UNICHARSET::load_from_file ( FILE *  file,
bool  skip_fragments 
)

Definition at line 744 of file unicharset.cpp.

                                                               {
  // Wraps the open FILE in a LocalFilePointer, exposes its fgets through a
  // callback and hands that callback to load_via_fgets().
  LocalFilePointer wrapped_file(file);
  TessResultCallback2<char *, char *, int> *line_reader =
      NewPermanentTessCallback(&wrapped_file, &LocalFilePointer::fgets);
  const bool loaded = load_via_fgets(line_reader, skip_fragments);
  // The callback was created here, so it is released here as well.
  delete line_reader;
  return loaded;
}
bool UNICHARSET::load_from_file ( const char *const  filename,
bool  skip_fragments 
) [inline]

Definition at line 346 of file unicharset.h.

                                                                       {
    // Opens filename for binary reading, delegates to the FILE* overload and
    // closes the file again. Returns false if the file cannot be opened.
    FILE* stream = fopen(filename, "rb");
    if (stream != NULL) {
      const bool loaded = load_from_file(stream, skip_fragments);
      fclose(stream);
      return loaded;
    }
    return false;
  }
bool UNICHARSET::load_from_file ( const char *const  filename) [inline]

Definition at line 354 of file unicharset.h.

                                                  {
    // Convenience overload: loads from the named file with skip_fragments off.
    const bool skip_fragments = false;
    return load_from_file(filename, skip_fragments);
  }
bool UNICHARSET::load_from_file ( FILE *  file) [inline]

Definition at line 361 of file unicharset.h.

{
  // Convenience overload: loads from an open FILE with skip_fragments off.
  const bool skip_fragments = false;
  return load_from_file(file, skip_fragments);
}
bool UNICHARSET::load_from_file ( tesseract::TFile file,
bool  skip_fragments 
)

Definition at line 753 of file unicharset.cpp.

                                                                         {
  TessResultCallback2<char *, char *, int> *fgets_cb =
      NewPermanentTessCallback(file, &tesseract::TFile::FGets);
  bool success = load_via_fgets(fgets_cb, skip_fragments);
  delete fgets_cb;
  return success;
}
bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size 
) [inline]

Definition at line 339 of file unicharset.h.

                                                                       {
    // Convenience overload: loads from the memory buffer with skip_fragments
    // off.
    const bool skip_fragments = false;
    return load_from_inmemory_file(memory, mem_size, skip_fragments);
  }
bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size,
bool  skip_fragments 
)

Definition at line 724 of file unicharset.cpp.

                                                           {
  // Wraps the buffer in an InMemoryFilePointer, exposes its fgets through a
  // callback and hands that callback to load_via_fgets().
  InMemoryFilePointer buffer_reader(memory, mem_size);
  TessResultCallback2<char *, char *, int> *line_reader =
      NewPermanentTessCallback(&buffer_reader, &InMemoryFilePointer::fgets);
  const bool loaded = load_via_fgets(line_reader, skip_fragments);
  // The callback was created here, so it is released here as well.
  delete line_reader;
  return loaded;
}
bool UNICHARSET::major_right_to_left ( ) const

Definition at line 931 of file unicharset.cpp.

                                           {
  // Tallies unichars by direction and reports whether right-to-left ones
  // (including Arabic letters and Arabic numbers) strictly outnumber
  // left-to-right ones. All other directions are ignored.
  int left_to_right = 0;
  int right_to_left = 0;
  for (int id = 0; id < size_used; ++id) {
    switch (get_direction(id)) {
      case UNICHARSET::U_LEFT_TO_RIGHT:
        ++left_to_right;
        break;
      case UNICHARSET::U_RIGHT_TO_LEFT:
      case UNICHARSET::U_RIGHT_TO_LEFT_ARABIC:
      case UNICHARSET::U_ARABIC_NUMBER:
        ++right_to_left;
        break;
      default:
        break;
    }
  }
  return right_to_left > left_to_right;
}
const GenericVector<UNICHAR_ID>& UNICHARSET::normed_ids ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 783 of file unicharset.h.

                                                                           {
    // Returns a reference to the normed_ids vector stored for unichar_id (the
    // vector that set_normed_ids() fills). unichar_id is used as an array
    // index without any validity check.
    return unichars[unichar_id].properties.normed_ids;
  }
int UNICHARSET::null_sid ( ) const [inline]

Definition at line 831 of file unicharset.h.

// Returns null_sid_, which post_load_setup() assigns from
// get_script_id_from_name(null_script) and asserts to be 0.
{ return null_sid_; }
void UNICHARSET::PartialSetPropertiesFromOther ( int  start_index,
const UNICHARSET src 
)

Definition at line 380 of file unicharset.cpp.

                                                                      {
  for (int ch = start_index; ch < size_used; ++ch) {
    const char* utf8 = id_to_unichar(ch);
    UNICHAR_PROPERTIES properties;
    if (src.GetStrProperties(utf8, &properties)) {
      // Setup the script_id, other_case, and mirror properly.
      const char* script = src.get_script_from_script_id(properties.script_id);
      properties.script_id = add_script(script);
      const char* other_case = src.id_to_unichar(properties.other_case);
      if (contains_unichar(other_case)) {
        properties.other_case = unichar_to_id(other_case);
      } else {
        properties.other_case = ch;
      }
      const char* mirror_str = src.id_to_unichar(properties.mirror);
      if (contains_unichar(mirror_str)) {
        properties.mirror = unichar_to_id(mirror_str);
      } else {
        properties.mirror = ch;
      }
      unichars[ch].properties.CopyFrom(properties);
      set_normed_ids(ch);
    }
  }
}
void UNICHARSET::post_load_setup ( )

Definition at line 867 of file unicharset.cpp.

                                 {
  // Recomputes the values derived from the per-unichar properties:
  // top_bottom_set_, the normed ids of every unichar, script_has_upper_lower_,
  // script_has_xheight_, the cached script ids and default_sid_.

  // Number of alpha chars with the case property minus those without,
  // in order to determine that half the alpha chars have case.
  int net_case_alphas = 0;
  int x_height_alphas = 0;
  int cap_height_alphas = 0;
  top_bottom_set_ = false;
  for (UNICHAR_ID id = 0; id < size_used; ++id) {
    int min_bottom = 0;
    int max_bottom = MAX_UINT8;
    int min_top = 0;
    int max_top = MAX_UINT8;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    // A single unichar with a non-zero min_top is enough to set the flag.
    if (min_top > 0)
      top_bottom_set_ = true;
    if (get_isalpha(id)) {
      if (get_islower(id) || get_isupper(id))
        ++net_case_alphas;
      else
        --net_case_alphas;
      // Count alphas whose whole top range lies below kMeanlineThreshold
      // separately from those whose whole top range lies above it; ranges
      // that straddle the threshold are counted in neither.
      if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
        ++x_height_alphas;
      else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
        ++cap_height_alphas;
    }
    set_normed_ids(id);
  }

  // More cased than uncased alphas means the script has upper/lower case.
  script_has_upper_lower_ = net_case_alphas > 0;
  // The script has an x-height if it has case, or if each of the two height
  // groups is large enough relative to the other.
  script_has_xheight_ = script_has_upper_lower_ ||
      (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
       cap_height_alphas > x_height_alphas * kMinCapHeightFraction);

  // Cache the ids of the scripts that have dedicated accessors.
  null_sid_ = get_script_id_from_name(null_script);
  ASSERT_HOST(null_sid_ == 0);
  common_sid_ = get_script_id_from_name("Common");
  latin_sid_ = get_script_id_from_name("Latin");
  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
  greek_sid_ = get_script_id_from_name("Greek");
  han_sid_ = get_script_id_from_name("Han");
  hiragana_sid_ = get_script_id_from_name("Hiragana");
  katakana_sid_ = get_script_id_from_name("Katakana");

  // Compute default script. Use the highest-counting alpha script, that is
  // not the common script, as that still contains some "alphas".
  int* script_counts = new int[script_table_size_used];
  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
  for (int id = 0; id < size_used; ++id) {
    if (get_isalpha(id)) {
      ++script_counts[get_script(id)];
    }
  }
  // Start from script 0 and move only on a strictly larger count, so ties
  // keep the lower script id.
  default_sid_ = 0;
  for (int s = 1; s < script_table_size_used; ++s) {
    if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
      default_sid_ = s;
  }
  delete [] script_counts;
}
bool UNICHARSET::PropertiesIncomplete ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 604 of file unicharset.h.

                                                         {
    // Returns whatever AnyRangeEmpty() reports for the properties stored for
    // unichar_id. unichar_id is used as an array index without any validity
    // check.
    return unichars[unichar_id].properties.AnyRangeEmpty();
  }
void UNICHARSET::reserve ( int  unichars_number)

Definition at line 179 of file unicharset.cpp.

                                            {
  if (unichars_number > size_reserved) {
    UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
    for (int i = 0; i < size_used; ++i)
      unichars_new[i] = unichars[i];
    for (int j = size_used; j < unichars_number; ++j) {
      unichars_new[j].properties.script_id = add_script(null_script);
    }
    delete[] unichars;
    unichars = unichars_new;
    size_reserved = unichars_number;
  }
}
bool UNICHARSET::save_to_file ( const char *const  filename) const [inline]

Definition at line 306 of file unicharset.h.

                                                       {
    // Creates (or truncates) filename in binary mode, writes the unicharset
    // through the FILE* overload and closes the file. Returns false if the
    // file cannot be opened, the write fails, or the close fails.
    FILE* file = fopen(filename, "w+b");
    if (file == NULL) return false;
    bool result = save_to_file(file);
    // fclose flushes any buffered output, so a failure here means the data
    // may not have reached the file even though the write call succeeded.
    if (fclose(file) != 0) result = false;
    return result;
  }
bool UNICHARSET::save_to_file ( FILE *  file) const [inline]

Definition at line 316 of file unicharset.h.

                                      {
    // Serializes with save_to_string() and writes the result to file as a
    // single fwrite item; succeeds only if both steps succeed.
    STRING serialized;
    return save_to_string(&serialized) &&
           fwrite(&serialized[0], serialized.length(), 1, file) == 1;
  }
bool UNICHARSET::save_to_file ( tesseract::TFile file) const [inline]

Definition at line 322 of file unicharset.h.

                                                {
    STRING str;
    if (!save_to_string(&str)) return false;
    if (file->FWrite(&str[0], str.length(), 1) != 1) return false;
    return true;
  }
bool UNICHARSET::save_to_string ( STRING str) const

Definition at line 661 of file unicharset.cpp.

                                                 {
  const int kFileBufSize = 1024;
  char buffer[kFileBufSize + 1];
  snprintf(buffer, kFileBufSize, "%d\n", this->size());
  *str = buffer;
  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
    int min_bottom, max_bottom, min_top, max_top;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    float width, width_sd;
    get_width_stats(id, &width, &width_sd);
    float bearing, bearing_sd;
    get_bearing_stats(id, &bearing, &bearing_sd);
    float advance, advance_sd;
    get_advance_stats(id, &advance, &advance_sd);
    unsigned int properties = this->get_properties(id);
    if (strcmp(this->id_to_unichar(id), " ") == 0) {
      snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
              this->get_script_from_script_id(this->get_script(id)),
              this->get_other_case(id));
    } else {
      snprintf(buffer, kFileBufSize,
              "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
              this->id_to_unichar(id), properties,
              min_bottom, max_bottom, min_top, max_top, width, width_sd,
              bearing, bearing_sd, advance, advance_sd,
              this->get_script_from_script_id(this->get_script(id)),
              this->get_other_case(id), this->get_direction(id),
              this->get_mirror(id), this->get_normed_unichar(id),
              this->debug_str(id).string());
    }
    *str += buffer;
  }
  return true;
}
bool UNICHARSET::script_has_upper_lower ( ) const [inline]

Definition at line 842 of file unicharset.h.

                                      {
    // Returns the flag computed by post_load_setup(): true when more alpha
    // unichars carry a lower/upper case property than lack one.
    return script_has_upper_lower_;
  }
bool UNICHARSET::script_has_xheight ( ) const [inline]

Definition at line 849 of file unicharset.h.

                                  {
    // Returns the flag computed by post_load_setup(): true when the script has
    // upper/lower case, or when its x-height and cap-height alpha counts each
    // pass the kMinXHeightFraction / kMinCapHeightFraction comparison.
    return script_has_xheight_;
  }
void UNICHARSET::set_advance_stats ( UNICHAR_ID  unichar_id,
float  advance,
float  advance_sd 
) [inline]

Definition at line 598 of file unicharset.h.

                                                          {
    unichars[unichar_id].properties.advance = advance;
    unichars[unichar_id].properties.advance_sd = advance_sd;
  }
void UNICHARSET::set_bearing_stats ( UNICHAR_ID  unichar_id,
float  bearing,
float  bearing_sd 
) [inline]

Definition at line 581 of file unicharset.h.

                                                          {
    unichars[unichar_id].properties.bearing = bearing;
    unichars[unichar_id].properties.bearing_sd = bearing_sd;
  }
void UNICHARSET::set_black_and_whitelist ( const char *  blacklist,
const char *  whitelist,
const char *  unblacklist 
)

Definition at line 948 of file unicharset.cpp.

                                                                  {
  // Sets the enabled flag of every unichar from the three lists, applied in
  // the order whitelist, blacklist, unblacklist. A NULL or empty list is
  // skipped; entries that encode to INVALID_UNICHAR_ID are ignored.

  // Without a whitelist everything starts enabled; with one, everything
  // starts disabled and only the whitelisted unichars are switched on.
  const bool enable_by_default =
      (whitelist == NULL) || (whitelist[0] == '\0');
  for (int id = 0; id < size_used; ++id) {
    unichars[id].properties.enabled = enable_by_default;
  }
  if (!enable_by_default) {
    GenericVector<UNICHAR_ID> allowed;
    encode_string(whitelist, false, &allowed, NULL, NULL);
    for (int k = 0; k < allowed.size(); ++k) {
      const UNICHAR_ID id = allowed[k];
      if (id == INVALID_UNICHAR_ID) continue;
      unichars[id].properties.enabled = true;
    }
  }
  // The blacklist switches its unichars off.
  if (blacklist != NULL && blacklist[0] != '\0') {
    GenericVector<UNICHAR_ID> denied;
    encode_string(blacklist, false, &denied, NULL, NULL);
    for (int k = 0; k < denied.size(); ++k) {
      const UNICHAR_ID id = denied[k];
      if (id == INVALID_UNICHAR_ID) continue;
      unichars[id].properties.enabled = false;
    }
  }
  // The unblacklist is applied last and switches its unichars back on.
  if (unblacklist != NULL && unblacklist[0] != '\0') {
    GenericVector<UNICHAR_ID> restored;
    encode_string(unblacklist, false, &restored, NULL, NULL);
    for (int k = 0; k < restored.size(); ++k) {
      const UNICHAR_ID id = restored[k];
      if (id == INVALID_UNICHAR_ID) continue;
      unichars[id].properties.enabled = true;
    }
  }
}
void UNICHARSET::set_direction ( UNICHAR_ID  unichar_id,
UNICHARSET::Direction  value 
) [inline]

Definition at line 430 of file unicharset.h.

                                                                       {
    unichars[unichar_id].properties.direction = value;
  }
void UNICHARSET::set_isalpha ( UNICHAR_ID  unichar_id,
bool  value 
) [inline]

Definition at line 389 of file unicharset.h.

                                                      {
    unichars[unichar_id].properties.isalpha = value;
  }
void UNICHARSET::set_isdigit ( UNICHAR_ID  unichar_id,
bool  value 
) [inline]

Definition at line 404 of file unicharset.h.

                                                      {
    unichars[unichar_id].properties.isdigit = value;
  }
void UNICHARSET::set_islower ( UNICHAR_ID  unichar_id,
bool  value 
) [inline]

Definition at line 394 of file unicharset.h.

                                                      {
    unichars[unichar_id].properties.islower = value;
  }
void UNICHARSET::set_isngram ( UNICHAR_ID  unichar_id,
bool  value 
) [inline]

Definition at line 414 of file unicharset.h.

                                                      {
    unichars[unichar_id].properties.isngram = value;
  }
void UNICHARSET::set_ispunctuation ( UNICHAR_ID  unichar_id,
bool  value 
) [inline]

Definition at line 409 of file unicharset.h.

                                                            {
    unichars[unichar_id].properties.ispunctuation = value;
  }
void UNICHARSET::set_isupper ( UNICHAR_ID  unichar_id,
bool  value 
) [inline]

Definition at line 399 of file unicharset.h.

                                                      {
    unichars[unichar_id].properties.isupper = value;
  }
void UNICHARSET::set_mirror ( UNICHAR_ID  unichar_id,
UNICHAR_ID  mirror 
) [inline]

Definition at line 435 of file unicharset.h.

                                                            {
    unichars[unichar_id].properties.mirror = mirror;
  }
void UNICHARSET::set_normed ( UNICHAR_ID  unichar_id,
const char *  normed 
) [inline]

Definition at line 440 of file unicharset.h.

                                                             {
    unichars[unichar_id].properties.normed = normed;
    unichars[unichar_id].properties.normed_ids.truncate(0);
  }
void UNICHARSET::set_normed_ids ( UNICHAR_ID  unichar_id)

Definition at line 348 of file unicharset.cpp.

                                                     {
  unichars[unichar_id].properties.normed_ids.truncate(0);
  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
    unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
  } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
                            true, &unichars[unichar_id].properties.normed_ids,
                            NULL, NULL)) {
    unichars[unichar_id].properties.normed_ids.truncate(0);
    unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
  }
}
void UNICHARSET::set_other_case ( UNICHAR_ID  unichar_id,
UNICHAR_ID  other_case 
) [inline]

Definition at line 425 of file unicharset.h.

                                                                    {
    unichars[unichar_id].properties.other_case = other_case;
  }
void UNICHARSET::set_ranges_empty ( )

Definition at line 371 of file unicharset.cpp.

                                  {
  // Calls SetRangesEmpty() on the properties of every unichar in use, in
  // ascending id order.
  int id = 0;
  while (id < size_used) {
    unichars[id].properties.SetRangesEmpty();
    ++id;
  }
}
void UNICHARSET::set_script ( UNICHAR_ID  unichar_id,
const char *  value 
) [inline]

Definition at line 420 of file unicharset.h.

                                                            {
    // Passes the script name to add_script() and stores the id it returns as
    // the script_id of unichar_id.
    unichars[unichar_id].properties.script_id = add_script(value);
  }
void UNICHARSET::set_top_bottom ( UNICHAR_ID  unichar_id,
int  min_bottom,
int  max_bottom,
int  min_top,
int  max_top 
) [inline]

Definition at line 540 of file unicharset.h.

                                                {
    unichars[unichar_id].properties.min_bottom =
        static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
    unichars[unichar_id].properties.max_bottom =
        static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
    unichars[unichar_id].properties.min_top =
        static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
    unichars[unichar_id].properties.max_top =
        static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
  }
void UNICHARSET::set_width_stats ( UNICHAR_ID  unichar_id,
float  width,
float  width_sd 
) [inline]

Definition at line 565 of file unicharset.h.

                                                                           {
    unichars[unichar_id].properties.width = width;
    unichars[unichar_id].properties.width_sd = width_sd;
  }
void UNICHARSET::SetPropertiesFromOther ( const UNICHARSET &  src) [inline]

Definition at line 503 of file unicharset.h.

int UNICHARSET::size ( ) const [inline]

Definition at line 297 of file unicharset.h.

                   {
    // Returns size_used, the counter unichar_insert() increments for each
    // unichar it adds.
    return size_used;
  }
bool UNICHARSET::SizesDistinct ( UNICHAR_ID  id1,
UNICHAR_ID  id2 
) const

Definition at line 472 of file unicharset.cpp.

                                                                   {
  int overlap = MIN(unichars[id1].properties.max_top,
                    unichars[id2].properties.max_top) -
                MAX(unichars[id1].properties.min_top,
                    unichars[id2].properties.min_top);
  return overlap <= 0;
}
int UNICHARSET::step ( const char *  str) const

Definition at line 211 of file unicharset.cpp.

                                          {
  // Encodes str and returns the length recorded for its first unichar, or 0
  // when nothing was encoded or the first id is INVALID_UNICHAR_ID.
  GenericVector<UNICHAR_ID> encoded_ids;
  GenericVector<char> piece_lengths;
  encode_string(str, true, &encoded_ids, &piece_lengths, NULL);
  const bool first_is_valid =
      !encoded_ids.empty() && encoded_ids[0] != INVALID_UNICHAR_ID;
  if (!first_is_valid) {
    return 0;
  }
  return piece_lengths[0];
}
UNICHAR_ID UNICHARSET::to_lower ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 652 of file unicharset.h.

                                                   {
    // Returns unichar_id itself when it is flagged islower, otherwise its
    // stored other_case. INVALID_UNICHAR_ID is passed through unchanged.
    if (unichar_id == INVALID_UNICHAR_ID) {
      return INVALID_UNICHAR_ID;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const UNICHAR_SLOT& slot = unichars[unichar_id];
    if (!slot.properties.islower) {
      return slot.properties.other_case;
    }
    return unichar_id;
  }
UNICHAR_ID UNICHARSET::to_upper ( UNICHAR_ID  unichar_id) const [inline]

Definition at line 660 of file unicharset.h.

                                                   {
    // Returns unichar_id itself when it is flagged isupper, otherwise its
    // stored other_case. INVALID_UNICHAR_ID is passed through unchanged.
    if (unichar_id == INVALID_UNICHAR_ID) {
      return INVALID_UNICHAR_ID;
    }
    ASSERT_HOST(contains_unichar_id(unichar_id));
    const UNICHAR_SLOT& slot = unichars[unichar_id];
    if (!slot.properties.isupper) {
      return slot.properties.other_case;
    }
    return unichar_id;
  }
bool UNICHARSET::top_bottom_useful ( ) const [inline]

Definition at line 495 of file unicharset.h.

                                 {
    // Returns the flag computed by post_load_setup(): true when at least one
    // unichar has a min_top greater than 0.
    return top_bottom_set_;
  }
void UNICHARSET::unichar_insert ( const char *const  unichar_repr)

Definition at line 612 of file unicharset.cpp.

                                                              {
  // Appends unichar_repr as a new unichar. Nothing happens if it is already
  // present; if it is longer than UNICHAR_LEN a message is printed to stderr
  // and it is skipped.
  if (ids.contains(unichar_repr)) {
    return;
  }
  if (strlen(unichar_repr) > UNICHAR_LEN) {
    fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
            static_cast<int>(strlen(unichar_repr)), unichar_repr);
    return;
  }
  // Out of reserved slots: start with 8, afterwards double.
  if (size_used == size_reserved) {
    reserve(size_used == 0 ? 8 : 2 * size_used);
  }

  const int new_id = size_used;
  strcpy(unichars[new_id].representation, unichar_repr);
  this->set_script(new_id, null_script);
  // A representation that parses as a character fragment gets the parsed
  // CHAR_FRAGMENT attached (NULL otherwise) and, when the base unichar is
  // already in the set, inherits that unichar's script.
  CHAR_FRAGMENT *fragment = CHAR_FRAGMENT::parse_from_string(unichar_repr);
  this->unichars[new_id].properties.fragment = fragment;
  if (fragment != NULL && this->contains_unichar(fragment->get_unichar())) {
    this->unichars[new_id].properties.script_id =
        this->get_script(fragment->get_unichar());
  }
  this->unichars[new_id].properties.enabled = true;
  ids.insert(unichar_repr, new_id);
  ++size_used;
}
UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr,
int  length 
) const

Definition at line 199 of file unicharset.cpp.

                                                       {
  // Looks up the first length bytes of unichar_repr and returns the matching
  // id, or INVALID_UNICHAR_ID when there is none. length must lie in
  // [1, UNICHAR_LEN].
  assert(length > 0 && length <= UNICHAR_LEN);
  if (!ids.contains(unichar_repr, length)) {
    return INVALID_UNICHAR_ID;
  }
  return ids.unichar_to_id(unichar_repr, length);
}
UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr) const

Definition at line 194 of file unicharset.cpp.

                                                              {
  // Looks up unichar_repr and returns the matching id, or INVALID_UNICHAR_ID
  // when there is none.
  if (!ids.contains(unichar_repr)) {
    return INVALID_UNICHAR_ID;
  }
  return ids.unichar_to_id(unichar_repr);
}

Member Data Documentation

const char * UNICHARSET::kCustomLigatures [static]
Initial value:
 {
  // Each row is {text returned, stored code}: id_to_unichar_ext() compares a
  // unichar against the second column and returns the first on a match.
  {"ct", "\uE003"},  
  {"ſh", "\uE006"},  
  {"ſi", "\uE007"},  
  {"ſl", "\uE008"},  
  {"ſſ", "\uE009"},  
  // Terminator: the lookup loop stops at the first row whose first column is
  // NULL.
  {NULL, NULL}
}

Definition at line 144 of file unicharset.h.

const char * UNICHARSET::kSpecialUnicharCodes [static]
Initial value:
 {
    // NOTE(review): presumably one representation string per special unichar
    // id, starting with the space unichar -- confirm against the enum that
    // defines UNICHAR_SPACE.
    " ",
    "Joined",
    "|Broken|0|1"
}

Definition at line 147 of file unicharset.h.


The documentation for this class was generated from the following files:
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines