|
tesseract 3.04.01
|
#include <unicharset.h>
Classes | |
| struct | UNICHAR_PROPERTIES |
| struct | UNICHAR_SLOT |
Public Types | |
| enum | Direction { U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3, U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7, U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11, U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15, U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT } |
Public Member Functions | |
| UNICHARSET () | |
| ~UNICHARSET () | |
| UNICHAR_ID | unichar_to_id (const char *const unichar_repr) const |
| UNICHAR_ID | unichar_to_id (const char *const unichar_repr, int length) const |
| int | step (const char *str) const |
| bool | encodable_string (const char *str, int *first_bad_position) const |
| bool | encode_string (const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const |
| const char * | id_to_unichar (UNICHAR_ID id) const |
| const char * | id_to_unichar_ext (UNICHAR_ID id) const |
| STRING | debug_str (UNICHAR_ID id) const |
| STRING | debug_str (const char *unichar_repr) const |
| void | unichar_insert (const char *const unichar_repr) |
| bool | contains_unichar_id (UNICHAR_ID unichar_id) const |
| bool | contains_unichar (const char *const unichar_repr) const |
| bool | contains_unichar (const char *const unichar_repr, int length) const |
| bool | eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const |
| void | delete_pointers_in_unichars () |
| void | clear () |
| int | size () const |
| void | reserve (int unichars_number) |
| bool | save_to_file (const char *const filename) const |
| bool | save_to_file (FILE *file) const |
| bool | save_to_file (tesseract::TFile *file) const |
| bool | save_to_string (STRING *str) const |
| bool | load_from_inmemory_file (const char *const memory, int mem_size, bool skip_fragments) |
| bool | load_from_inmemory_file (const char *const memory, int mem_size) |
| bool | load_from_file (const char *const filename, bool skip_fragments) |
| bool | load_from_file (const char *const filename) |
| bool | load_from_file (FILE *file, bool skip_fragments) |
| bool | load_from_file (FILE *file) |
| bool | load_from_file (tesseract::TFile *file, bool skip_fragments) |
| void | post_load_setup () |
| bool | major_right_to_left () const |
| void | set_black_and_whitelist (const char *blacklist, const char *whitelist, const char *unblacklist) |
| void | set_isalpha (UNICHAR_ID unichar_id, bool value) |
| void | set_islower (UNICHAR_ID unichar_id, bool value) |
| void | set_isupper (UNICHAR_ID unichar_id, bool value) |
| void | set_isdigit (UNICHAR_ID unichar_id, bool value) |
| void | set_ispunctuation (UNICHAR_ID unichar_id, bool value) |
| void | set_isngram (UNICHAR_ID unichar_id, bool value) |
| void | set_script (UNICHAR_ID unichar_id, const char *value) |
| void | set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case) |
| void | set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value) |
| void | set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror) |
| void | set_normed (UNICHAR_ID unichar_id, const char *normed) |
| void | set_normed_ids (UNICHAR_ID unichar_id) |
| bool | get_isalpha (UNICHAR_ID unichar_id) const |
| bool | get_islower (UNICHAR_ID unichar_id) const |
| bool | get_isupper (UNICHAR_ID unichar_id) const |
| bool | get_isdigit (UNICHAR_ID unichar_id) const |
| bool | get_ispunctuation (UNICHAR_ID unichar_id) const |
| bool | get_isngram (UNICHAR_ID unichar_id) const |
| bool | get_isprivate (UNICHAR_ID unichar_id) const |
| bool | top_bottom_useful () const |
| void | set_ranges_empty () |
| void | SetPropertiesFromOther (const UNICHARSET &src) |
| void | PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src) |
| void | ExpandRangesFromOther (const UNICHARSET &src) |
| void | CopyFrom (const UNICHARSET &src) |
| void | AppendOtherUnicharset (const UNICHARSET &src) |
| bool | SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const |
| void | get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const |
| void | set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top) |
| void | get_width_stats (UNICHAR_ID unichar_id, float *width, float *width_sd) const |
| void | set_width_stats (UNICHAR_ID unichar_id, float width, float width_sd) |
| void | get_bearing_stats (UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const |
| void | set_bearing_stats (UNICHAR_ID unichar_id, float bearing, float bearing_sd) |
| void | get_advance_stats (UNICHAR_ID unichar_id, float *advance, float *advance_sd) const |
| void | set_advance_stats (UNICHAR_ID unichar_id, float advance, float advance_sd) |
| bool | PropertiesIncomplete (UNICHAR_ID unichar_id) const |
| int | get_script (UNICHAR_ID unichar_id) const |
| unsigned int | get_properties (UNICHAR_ID unichar_id) const |
| char | get_chartype (UNICHAR_ID unichar_id) const |
| UNICHAR_ID | get_other_case (UNICHAR_ID unichar_id) const |
| Direction | get_direction (UNICHAR_ID unichar_id) const |
| UNICHAR_ID | get_mirror (UNICHAR_ID unichar_id) const |
| UNICHAR_ID | to_lower (UNICHAR_ID unichar_id) const |
| UNICHAR_ID | to_upper (UNICHAR_ID unichar_id) const |
| bool | has_special_codes () const |
| bool | AnyRepeatedUnicodes () const |
| const CHAR_FRAGMENT * | get_fragment (UNICHAR_ID unichar_id) const |
| bool | get_isalpha (const char *const unichar_repr) const |
| bool | get_islower (const char *const unichar_repr) const |
| bool | get_isupper (const char *const unichar_repr) const |
| bool | get_isdigit (const char *const unichar_repr) const |
| bool | get_ispunctuation (const char *const unichar_repr) const |
| unsigned int | get_properties (const char *const unichar_repr) const |
| char | get_chartype (const char *const unichar_repr) const |
| int | get_script (const char *const unichar_repr) const |
| const CHAR_FRAGMENT * | get_fragment (const char *const unichar_repr) const |
| bool | get_isalpha (const char *const unichar_repr, int length) const |
| bool | get_islower (const char *const unichar_repr, int length) const |
| bool | get_isupper (const char *const unichar_repr, int length) const |
| bool | get_isdigit (const char *const unichar_repr, int length) const |
| bool | get_ispunctuation (const char *const unichar_repr, int length) const |
| const char * | get_normed_unichar (UNICHAR_ID unichar_id) const |
| const GenericVector< UNICHAR_ID > & | normed_ids (UNICHAR_ID unichar_id) const |
| int | get_script (const char *const unichar_repr, int length) const |
| int | get_script_table_size () const |
| const char * | get_script_from_script_id (int id) const |
| int | get_script_id_from_name (const char *script_name) const |
| bool | is_null_script (const char *script) const |
| int | add_script (const char *script) |
| bool | get_enabled (UNICHAR_ID unichar_id) const |
| int | null_sid () const |
| int | common_sid () const |
| int | latin_sid () const |
| int | cyrillic_sid () const |
| int | greek_sid () const |
| int | han_sid () const |
| int | hiragana_sid () const |
| int | katakana_sid () const |
| int | default_sid () const |
| bool | script_has_upper_lower () const |
| bool | script_has_xheight () const |
Static Public Member Functions | |
| static STRING | debug_utf8_str (const char *str) |
Static Public Attributes | |
| static const char * | kCustomLigatures [][2] |
| static const char * | kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT] |
Definition at line 139 of file unicharset.h.
Definition at line 150 of file unicharset.h.
{
// Character direction classes stored per unichar (see get_direction /
// set_direction). The values are explicit and contiguous from 0.
// NOTE(review): names and values look like ICU's UCharDirection -- confirm.
U_LEFT_TO_RIGHT = 0,
U_RIGHT_TO_LEFT = 1,
U_EUROPEAN_NUMBER = 2,
U_EUROPEAN_NUMBER_SEPARATOR = 3,
U_EUROPEAN_NUMBER_TERMINATOR = 4,
U_ARABIC_NUMBER = 5,
U_COMMON_NUMBER_SEPARATOR = 6,
U_BLOCK_SEPARATOR = 7,
U_SEGMENT_SEPARATOR = 8,
U_WHITE_SPACE_NEUTRAL = 9,
U_OTHER_NEUTRAL = 10,
U_LEFT_TO_RIGHT_EMBEDDING = 11,
U_LEFT_TO_RIGHT_OVERRIDE = 12,
U_RIGHT_TO_LEFT_ARABIC = 13,
U_RIGHT_TO_LEFT_EMBEDDING = 14,
U_RIGHT_TO_LEFT_OVERRIDE = 15,
U_POP_DIRECTIONAL_FORMAT = 16,
U_DIR_NON_SPACING_MARK = 17,
U_BOUNDARY_NEUTRAL = 18,
// Not a direction: one past the last class, i.e. the number of classes (19).
U_CHAR_DIRECTION_COUNT
};
| UNICHARSET::UNICHARSET | ( | ) |
Definition at line 159 of file unicharset.cpp.
: unichars(NULL),
  ids(),
  size_used(0),
  size_reserved(0),
  script_table(NULL),
  script_table_size_used(0),
  null_script("NULL") {
  // clear() puts the remaining counters, flags and script ids into their
  // empty state (nothing is allocated yet, so nothing is freed).
  clear();
  // Seed the set with the special unichar codes, in table order.
  for (int code = 0; code < SPECIAL_UNICHAR_CODES_COUNT; ++code) {
    unichar_insert(kSpecialUnicharCodes[code]);
    if (code != UNICHAR_JOINED) continue;
    // The "joined" special code is flagged as an ngram.
    set_isngram(code, true);
  }
}
| UNICHARSET::~UNICHARSET | ( | ) |
Definition at line 175 of file unicharset.cpp.
{
// All owned storage (script table, unichar array, fragments) is released by
// clear().
clear();
}
| int UNICHARSET::add_script | ( | const char * | script | ) |
Definition at line 1002 of file unicharset.cpp.
{
  // Returns the id of `script`, registering it first if it is not yet known.
  // An already-registered name keeps its existing id.
  int existing = 0;
  while (existing < script_table_size_used) {
    if (strcmp(script, script_table[existing]) == 0) return existing;
    ++existing;
  }
  // First registration: allocate the initial table of 8 slots.
  if (script_table_size_reserved == 0) {
    script_table_size_reserved = 8;
    script_table = new char*[script_table_size_reserved];
  }
  // Double the table once the new entry would reach the reserved size.
  if (script_table_size_used + 1 >= script_table_size_reserved) {
    const int doubled = script_table_size_reserved * 2;
    char** grown = new char*[doubled];
    memcpy(grown, script_table, script_table_size_reserved * sizeof(char*));
    delete[] script_table;
    script_table = grown;
    script_table_size_reserved = doubled;
  }
  // Store a private copy of the name; its index is the new script id.
  const int new_id = script_table_size_used;
  script_table[new_id] = new char[strlen(script) + 1];
  strcpy(script_table[new_id], script);
  ++script_table_size_used;
  return new_id;
}
| bool UNICHARSET::AnyRepeatedUnicodes | ( | ) | const |
Definition at line 986 of file unicharset.cpp.
{
  // Returns true if any unichar's normalized form contains the same unicode
  // twice in a row. The special codes are skipped when they are present.
  const int first_id = has_special_codes() ? SPECIAL_UNICHAR_CODES_COUNT : 0;
  for (int id = first_id; id < size_used; ++id) {
    GenericVector<int> codepoints;
    if (!UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &codepoints)) continue;
    // Compare each code point with its predecessor.
    for (int pos = 1; pos < codepoints.size(); ++pos) {
      if (codepoints[pos] == codepoints[pos - 1]) return true;
    }
  }
  return false;
}
| void UNICHARSET::AppendOtherUnicharset | ( | const UNICHARSET & | src | ) |
Definition at line 439 of file unicharset.cpp.
{
  // Merges src into this set: unichars already present get their ranges
  // expanded, new ones are appended. Non-special src entries with an empty
  // range are reported and skipped.
  const int first_new_id = size_used;
  for (int ch = 0; ch < src.size_used; ++ch) {
    const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
    const char* utf8 = src.id_to_unichar(ch);
    const bool is_special = ch < SPECIAL_UNICHAR_CODES_COUNT;
    if (!is_special && src_props.AnyRangeEmpty()) {
      // Only use fully valid entries.
      tprintf("Bad properties for index %d, char %s: "
              "%d,%d %d,%d %g,%g %g,%g %g,%g\n",
              ch, utf8, src_props.min_bottom, src_props.max_bottom,
              src_props.min_top, src_props.max_top,
              src_props.width, src_props.width_sd,
              src_props.bearing, src_props.bearing_sd,
              src_props.advance, src_props.advance_sd);
      continue;
    }
    if (!contains_unichar(utf8)) {
      // New unichar: it lands at the current end of the array.
      const int new_id = size_used;
      unichar_insert(utf8);
      unichars[new_id].properties.SetRangesEmpty();
    } else {
      // Known unichar: just widen its ranges.
      const int known_id = unichar_to_id(utf8);
      unichars[known_id].properties.ExpandRangesFrom(src_props);
    }
  }
  // Set properties, including mirror and other_case, WITHOUT reordering
  // the unicharset. Only the ids that existed before this call are skipped.
  PartialSetPropertiesFromOther(first_new_id, src);
}
| void UNICHARSET::clear | ( | ) | [inline] |
Definition at line 266 of file unicharset.h.
{
  // Resets the unicharset to an empty state: frees the script table and the
  // unichar array, then zeroes all counters, flags and cached script ids.
  if (script_table != NULL) {
    // Each script name is an individually allocated string.
    for (int i = 0; i < script_table_size_used; ++i)
      delete[] script_table[i];
    delete[] script_table;
    script_table = NULL;
    script_table_size_used = 0;
  }
  if (unichars != NULL) {
    // Fragments are owned through raw pointers and must go first.
    delete_pointers_in_unichars();
    delete[] unichars;
    unichars = NULL;
  }
  script_table_size_reserved = 0;
  size_reserved = 0;
  size_used = 0;
  ids.clear();
  top_bottom_set_ = false;
  script_has_upper_lower_ = false;
  script_has_xheight_ = false;
  null_sid_ = 0;
  common_sid_ = 0;
  latin_sid_ = 0;
  cyrillic_sid_ = 0;
  greek_sid_ = 0;
  han_sid_ = 0;
  hiragana_sid_ = 0;
  katakana_sid_ = 0;
  // default_sid_ is not set by the constructor's initializer list, so reset
  // it here with the other script ids; otherwise default_sid() could return
  // an indeterminate value on a fresh or cleared set.
  default_sid_ = 0;
}
| int UNICHARSET::common_sid | ( | ) | const [inline] |
Definition at line 832 of file unicharset.h.
{ return common_sid_; }  // Plain accessor for the cached common_sid_ member.
| bool UNICHARSET::contains_unichar | ( | const char *const | unichar_repr | ) | const |
Definition at line 644 of file unicharset.cpp.
{
// Membership test on the string-to-id map; unichar_repr is used whole.
return ids.contains(unichar_repr);
}
| bool UNICHARSET::contains_unichar | ( | const char *const | unichar_repr, |
| int | length | ||
| ) | const |
Definition at line 648 of file unicharset.cpp.
{
  // A zero-length prefix is never a member; otherwise look up the first
  // `length` bytes of unichar_repr in the string-to-id map.
  return length != 0 && ids.contains(unichar_repr, length);
}
| bool UNICHARSET::contains_unichar_id | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 242 of file unicharset.h.
{
  // True only for ids that index an existing entry: not the invalid marker
  // and within [0, size_used).
  if (unichar_id == INVALID_UNICHAR_ID) return false;
  return unichar_id >= 0 && unichar_id < size_used;
}
| void UNICHARSET::CopyFrom | ( | const UNICHARSET & | src | ) |
Definition at line 423 of file unicharset.cpp.
{
  // Replaces the contents of this set with a copy of src, keeping src's
  // unichar order so that ids are identical in both sets.
  //
  // Self-copy must be a no-op: clear() below would otherwise empty src as
  // well and the data would be lost.
  if (&src == this) return;
  clear();
  for (int ch = 0; ch < src.size_used; ++ch) {
    const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
    const char* utf8 = src.id_to_unichar(ch);
    // Inserting in src order gives the new entry the same id, ch.
    unichar_insert(utf8);
    unichars[ch].properties.ExpandRangesFrom(src_props);
  }
  // Set properties, including mirror and other_case, WITHOUT reordering
  // the unicharset.
  PartialSetPropertiesFromOther(0, src);
}
| int UNICHARSET::cyrillic_sid | ( | ) | const [inline] |
Definition at line 834 of file unicharset.h.
{ return cyrillic_sid_; }  // Plain accessor for the cached cyrillic_sid_ member.
| STRING UNICHARSET::debug_str | ( | const char * | unichar_repr | ) | const [inline] |
Definition at line 233 of file unicharset.h.
{
  // Resolve the string to its id, then describe that id.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return debug_str(id);
}
| STRING UNICHARSET::debug_str | ( | UNICHAR_ID | id | ) | const |
Definition at line 318 of file unicharset.cpp.
{
  // Builds a human-readable description of a unichar: its UTF-8 text and
  // hex code points, followed by one-letter property markers.
  if (INVALID_UNICHAR_ID == id) {
    // id_to_unichar() supplies the text used for the invalid id.
    return STRING(id_to_unichar(id));
  }
  // Fragments describe themselves.
  if (const CHAR_FRAGMENT* frag = this->get_fragment(id)) {
    return frag->to_string();
  }
  STRING description = debug_utf8_str(id_to_unichar(id));
  // Alphabetic: "a" for lower case, "A" for upper case, "x" for neither.
  if (get_isalpha(id)) {
    description += get_islower(id) ? "a" : (get_isupper(id) ? "A" : "x");
  }
  // "0" marks a digit.
  if (get_isdigit(id)) description += "0";
  // "p" marks a punctuation symbol.
  if (get_ispunctuation(id)) description += "p";
  return description;
}
| STRING UNICHARSET::debug_utf8_str | ( | const char * | str | ) | [static] |
Definition at line 294 of file unicharset.cpp.
{
  // Returns str followed by " [" + the hex value of each code point (each
  // followed by a space) + "]". Bytes that do not start a valid UTF-8
  // sequence are dumped individually as hex.
  STRING result = str;
  result += " [";
  int step = 1;
  // Chop into unicodes and code each as hex.
  for (int i = 0; str[i] != '\0'; i += step) {
    // Large enough for the 8 hex digits of a 32-bit value plus the NUL.
    char hex[sizeof(int) * 2 + 1];
    step = UNICHAR::utf8_step(str + i);
    if (step == 0) {
      // Invalid lead byte: consume exactly one byte. Go through
      // unsigned char so a byte >= 0x80 prints as e.g. "80" rather than the
      // sign-extended "ffffff80" on platforms where char is signed.
      step = 1;
      sprintf(hex, "%x",
              static_cast<unsigned int>(static_cast<unsigned char>(str[i])));
    } else {
      UNICHAR ch(str + i, step);
      sprintf(hex, "%x", ch.first_uni());
    }
    result += hex;
    result += " ";
  }
  result += "]";
  return result;
}
| int UNICHARSET::default_sid | ( | ) | const [inline] |
Definition at line 839 of file unicharset.h.
{ return default_sid_; }  // Plain accessor for the cached default_sid_ member.
| void UNICHARSET::delete_pointers_in_unichars | ( | ) | [inline] |
Definition at line 256 of file unicharset.h.
{
  // Frees the fragment owned by every used entry and nulls the pointer so it
  // cannot be freed twice.
  for (int idx = 0; idx < size_used; ++idx) {
    UNICHAR_PROPERTIES& props = unichars[idx].properties;
    delete props.fragment;  // deleting a null pointer is a no-op
    props.fragment = NULL;
  }
}
| bool UNICHARSET::encodable_string | ( | const char * | str, |
| int * | first_bad_position | ||
| ) | const |
Definition at line 222 of file unicharset.cpp.
{
  // Runs encode_string in give-up-on-failure mode into a scratch vector;
  // only the success flag and the stopping position (written through
  // first_bad_position by encode_string) are of interest.
  GenericVector<UNICHAR_ID> scratch;
  const bool encodable =
      encode_string(str, true, &scratch, NULL, first_bad_position);
  return encodable;
}
| bool UNICHARSET::encode_string | ( | const char * | str, |
| bool | give_up_on_failure, | ||
| GenericVector< UNICHAR_ID > * | encoding, | ||
| GenericVector< char > * | lengths, | ||
| int * | encoded_length | ||
| ) | const |
Definition at line 234 of file unicharset.cpp.
{
// Encodes str into *encoding. Returns true only if nothing had to be skipped.
// lengths and encoded_length are optional outputs (may be NULL).
// Scratch state handed to the internal encode_string overload on each pass.
GenericVector<UNICHAR_ID> working_encoding;
GenericVector<char> working_lengths;
GenericVector<char> best_lengths;
encoding->truncate(0); // Just in case str is empty.
int str_length = strlen(str);
int str_pos = 0;
// Stays true only while every part of str has been matched.
bool perfect = true;
while (str_pos < str_length) {
// The 8-argument overload (not shown here) updates str_pos, *encoding and
// best_lengths; presumably it advances as far as it can encode -- confirm.
encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
&str_pos, encoding, &best_lengths);
if (str_pos < str_length) {
// This is a non-match. Skip one utf-8 character.
perfect = false;
if (give_up_on_failure) break;
int step = UNICHAR::utf8_step(str + str_pos);
// A step of 0 is replaced by 1 so the loop always makes progress.
if (step == 0) step = 1;
// Record the skipped span as INVALID_UNICHAR_ID together with its length.
encoding->push_back(INVALID_UNICHAR_ID);
best_lengths.push_back(step);
str_pos += step;
// Restart the working state from the results accumulated so far.
working_encoding = *encoding;
working_lengths = best_lengths;
}
}
if (lengths != NULL) *lengths = best_lengths;
// On early exit this is the position at which encoding stopped.
if (encoded_length != NULL) *encoded_length = str_pos;
return perfect;
}
| bool UNICHARSET::eq | ( | UNICHAR_ID | unichar_id, |
| const char *const | unichar_repr | ||
| ) | const |
Definition at line 656 of file unicharset.cpp.
{
  // Byte-wise comparison of the stored string for unichar_id with
  // unichar_repr.
  const char* stored = this->id_to_unichar(unichar_id);
  return 0 == strcmp(stored, unichar_repr);
}
| void UNICHARSET::ExpandRangesFromOther | ( | const UNICHARSET & | src | ) |
Definition at line 410 of file unicharset.cpp.
{
  // For every unichar of this set whose string src can supply properties
  // for, widen this set's ranges with src's; nothing else is copied.
  for (int id = 0; id < size_used; ++id) {
    const char* repr = id_to_unichar(id);
    UNICHAR_PROPERTIES src_props;
    if (!src.GetStrProperties(repr, &src_props)) continue;
    unichars[id].properties.ExpandRangesFrom(src_props);
  }
}
| void UNICHARSET::get_advance_stats | ( | UNICHAR_ID | unichar_id, |
| float * | advance, | ||
| float * | advance_sd | ||
| ) | const [inline] |
Definition at line 588 of file unicharset.h.
{
  // Writes the stored advance and advance_sd of unichar_id to the outputs.
  // The invalid id yields zeros.
  if (unichar_id == INVALID_UNICHAR_ID) {
    *advance_sd = 0;
    *advance = 0;
    return;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  *advance = props.advance;
  *advance_sd = props.advance_sd;
}
| void UNICHARSET::get_bearing_stats | ( | UNICHAR_ID | unichar_id, |
| float * | bearing, | ||
| float * | bearing_sd | ||
| ) | const [inline] |
Definition at line 571 of file unicharset.h.
{
  // Writes the stored bearing and bearing_sd of unichar_id to the outputs.
  // The invalid id yields zeros.
  if (unichar_id == INVALID_UNICHAR_ID) {
    *bearing_sd = 0.0f;
    *bearing = 0.0f;
    return;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  *bearing = props.bearing;
  *bearing_sd = props.bearing_sd;
}
| char UNICHARSET::get_chartype | ( | UNICHAR_ID | unichar_id | ) | const |
Definition at line 603 of file unicharset.cpp.
{
  // Maps the property flags to one character: 'A' upper, 'a' lower,
  // 'x' other alpha, '0' digit, 'p' punctuation, 0 for none of these.
  // Tests run in that order, so case wins over the generic alpha class.
  char kind = 0;
  if (this->get_isupper(id)) {
    kind = 'A';
  } else if (this->get_islower(id)) {
    kind = 'a';
  } else if (this->get_isalpha(id)) {
    kind = 'x';
  } else if (this->get_isdigit(id)) {
    kind = '0';
  } else if (this->get_ispunctuation(id)) {
    kind = 'p';
  }
  return kind;
}
| char UNICHARSET::get_chartype | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 719 of file unicharset.h.
{
  // String overload: resolve the id, then classify it.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_chartype(id);
}
| Direction UNICHARSET::get_direction | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 638 of file unicharset.h.
{
  // The invalid id is reported as U_OTHER_NEUTRAL.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return UNICHARSET::U_OTHER_NEUTRAL;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.direction;
}
| bool UNICHARSET::get_enabled | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 826 of file unicharset.h.
{
  // Returns the stored `enabled` flag of unichar_id.
  // Guard the array access like the sibling getters do; without it an
  // out-of-range id (including INVALID_UNICHAR_ID) reads outside `unichars`.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.enabled;
}
| const CHAR_FRAGMENT* UNICHARSET::get_fragment | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 682 of file unicharset.h.
{
  // The invalid id has no fragment.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return NULL;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.fragment;
}
| const CHAR_FRAGMENT* UNICHARSET::get_fragment | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 732 of file unicharset.h.
{
  // NULL, empty and unknown strings have no fragment.
  const bool is_blank = unichar_repr == NULL || unichar_repr[0] == '\0';
  if (is_blank || !ids.contains(unichar_repr)) {
    return NULL;
  }
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_fragment(id);
}
| bool UNICHARSET::get_isalpha | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 689 of file unicharset.h.
{
  // String overload: resolve the id, then query its alpha flag.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_isalpha(id);
}
| bool UNICHARSET::get_isalpha | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 449 of file unicharset.h.
{
  // The invalid id is never alphabetic.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.isalpha;
}
| bool UNICHARSET::get_isalpha | ( | const char *const | unichar_repr, |
| int | length | ||
| ) | const [inline] |
Definition at line 742 of file unicharset.h.
{
  // Prefix overload: resolve the first `length` bytes to an id, then query.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_isalpha(id);
}
| bool UNICHARSET::get_isdigit | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 704 of file unicharset.h.
{
  // String overload: resolve the id, then query its digit flag.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_isdigit(id);
}
| bool UNICHARSET::get_isdigit | ( | const char *const | unichar_repr, |
| int | length | ||
| ) | const [inline] |
Definition at line 763 of file unicharset.h.
{
  // Prefix overload: resolve the first `length` bytes to an id, then query.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_isdigit(id);
}
| bool UNICHARSET::get_isdigit | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 470 of file unicharset.h.
{
  // The invalid id is never a digit.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.isdigit;
}
| bool UNICHARSET::get_islower | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 694 of file unicharset.h.
{
  // String overload: resolve the id, then query its lower-case flag.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_islower(id);
}
| bool UNICHARSET::get_islower | ( | const char *const | unichar_repr, |
| int | length | ||
| ) | const [inline] |
Definition at line 749 of file unicharset.h.
{
  // Prefix overload: resolve the first `length` bytes to an id, then query.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_islower(id);
}
| bool UNICHARSET::get_islower | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 456 of file unicharset.h.
{
  // The invalid id is never lower case.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.islower;
}
| bool UNICHARSET::get_isngram | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 484 of file unicharset.h.
{
  // The invalid id is never an ngram.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.isngram;
}
| bool UNICHARSET::get_isprivate | ( | UNICHAR_ID | unichar_id | ) | const |
Definition at line 363 of file unicharset.cpp.
{
  // Decode the first code point of this unichar's UTF-8 string and test it
  // against U+E000..U+F8FF (the BMP Private Use Area).
  UNICHAR decoded(id_to_unichar(unichar_id), -1);
  const int code_point = decoded.first_uni();
  if (code_point < 0xE000) return false;
  return code_point <= 0xF8FF;
}
| bool UNICHARSET::get_ispunctuation | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 709 of file unicharset.h.
{
  // String overload: resolve the id, then query its punctuation flag.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_ispunctuation(id);
}
| bool UNICHARSET::get_ispunctuation | ( | const char *const | unichar_repr, |
| int | length | ||
| ) | const [inline] |
Definition at line 770 of file unicharset.h.
{
  // Prefix overload: resolve the first `length` bytes to an id, then query.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_ispunctuation(id);
}
| bool UNICHARSET::get_ispunctuation | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 477 of file unicharset.h.
{
  // The invalid id is never punctuation.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.ispunctuation;
}
| bool UNICHARSET::get_isupper | ( | const char *const | unichar_repr, |
| int | length | ||
| ) | const [inline] |
Definition at line 756 of file unicharset.h.
{
  // Prefix overload: resolve the first `length` bytes to an id, then query.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_isupper(id);
}
| bool UNICHARSET::get_isupper | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 699 of file unicharset.h.
{
  // String overload: resolve the id, then query its upper-case flag.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_isupper(id);
}
| bool UNICHARSET::get_isupper | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 463 of file unicharset.h.
{
  // The invalid id is never upper case.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.isupper;
}
| UNICHAR_ID UNICHARSET::get_mirror | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 645 of file unicharset.h.
{
  // The invalid id mirrors to itself (the invalid id).
  if (unichar_id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR_ID;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.mirror;
}
| const char* UNICHARSET::get_normed_unichar | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 776 of file unicharset.h.
{
  // When the special codes are present, the space code always normalizes to
  // a literal single space; everything else uses its stored normed string.
  const bool is_special_space =
      unichar_id == UNICHAR_SPACE && has_special_codes();
  if (is_special_space) {
    return " ";
  }
  return unichars[unichar_id].properties.normed.string();
}
| UNICHAR_ID UNICHARSET::get_other_case | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 631 of file unicharset.h.
{
  // The invalid id has no other case; it maps to the invalid id.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR_ID;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.other_case;
}
| unsigned int UNICHARSET::get_properties | ( | UNICHAR_ID | unichar_id | ) | const |
Definition at line 588 of file unicharset.cpp.
{
  // Packs the five boolean character-class flags of `id` into a bit mask.
  unsigned int mask = 0;
  if (this->get_isalpha(id)) mask |= ISALPHA_MASK;
  if (this->get_islower(id)) mask |= ISLOWER_MASK;
  if (this->get_isupper(id)) mask |= ISUPPER_MASK;
  if (this->get_isdigit(id)) mask |= ISDIGIT_MASK;
  if (this->get_ispunctuation(id)) mask |= ISPUNCTUATION_MASK;
  return mask;
}
| unsigned int UNICHARSET::get_properties | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 715 of file unicharset.h.
{
  // String overload: resolve the id, then build its property mask.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_properties(id);
}
| int UNICHARSET::get_script | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 611 of file unicharset.h.
{
  // The invalid id is reported with the null script id.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return null_sid_;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.script_id;
}
| int UNICHARSET::get_script | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 726 of file unicharset.h.
{
  // String overload: resolve the id, then look up its script id.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_script(id);
}
| int UNICHARSET::get_script | ( | const char *const | unichar_repr, |
| int | length | ||
| ) | const [inline] |
Definition at line 791 of file unicharset.h.
{
  // Prefix overload: resolve the first `length` bytes to an id, then query.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_script(id);
}
| const char* UNICHARSET::get_script_from_script_id | ( | int | id | ) | const [inline] |
Definition at line 802 of file unicharset.h.
{
  // Out-of-range ids map to the null_script name instead of indexing the
  // table.
  const bool in_range = id >= 0 && id < script_table_size_used;
  return in_range ? script_table[id] : null_script;
}
| int UNICHARSET::get_script_id_from_name | ( | const char * | script_name | ) | const |
Definition at line 1080 of file unicharset.cpp.
{
  // Linear search of the script table by exact name.
  int sid = 0;
  while (sid < script_table_size_used) {
    if (strcmp(script_name, script_table[sid]) == 0) return sid;
    ++sid;
  }
  return 0;  // 0 is always the null_script
}
| int UNICHARSET::get_script_table_size | ( | ) | const [inline] |
Definition at line 797 of file unicharset.h.
{
// Number of script names currently registered (not the reserved capacity).
return script_table_size_used;
}
| void UNICHARSET::get_top_bottom | ( | UNICHAR_ID | unichar_id, |
| int * | min_bottom, | ||
| int * | max_bottom, | ||
| int * | min_top, | ||
| int * | max_top | ||
| ) | const [inline] |
Definition at line 526 of file unicharset.h.
{
  // Writes the stored top/bottom ranges of unichar_id to the outputs.
  // The invalid id gets the widest range, 0..256.
  if (unichar_id == INVALID_UNICHAR_ID) {
    *min_top = 0;
    *min_bottom = 0;
    *max_top = 256;  // kBlnCellHeight
    *max_bottom = 256;
    return;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  *min_bottom = props.min_bottom;
  *max_bottom = props.max_bottom;
  *min_top = props.min_top;
  *max_top = props.max_top;
}
| void UNICHARSET::get_width_stats | ( | UNICHAR_ID | unichar_id, |
| float * | width, | ||
| float * | width_sd | ||
| ) | const [inline] |
Definition at line 554 of file unicharset.h.
{
  // Writes the stored width and width_sd of unichar_id to the outputs.
  // The invalid id yields zeros.
  if (unichar_id == INVALID_UNICHAR_ID) {
    *width = 0.0f;
    *width_sd = 0.0f;
    return;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  *width = props.width;
  *width_sd = props.width_sd;
}
| int UNICHARSET::greek_sid | ( | ) | const [inline] |
Definition at line 835 of file unicharset.h.
{ return greek_sid_; }  // Plain accessor for the cached greek_sid_ member.
| int UNICHARSET::han_sid | ( | ) | const [inline] |
Definition at line 836 of file unicharset.h.
{ return han_sid_; }  // Plain accessor for the cached han_sid_ member.
| bool UNICHARSET::has_special_codes | ( | ) | const [inline] |
Definition at line 670 of file unicharset.h.
{
  // The special codes are considered present when the UNICHAR_BROKEN slot
  // holds a fragment AND its stored string is the special code for that slot.
  if (get_fragment(UNICHAR_BROKEN) == NULL) {
    return false;
  }
  const char* stored = id_to_unichar(UNICHAR_BROKEN);
  return strcmp(stored, kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
}
| int UNICHARSET::hiragana_sid | ( | ) | const [inline] |
Definition at line 837 of file unicharset.h.
{ return hiragana_sid_; }  // Plain accessor for the cached hiragana_sid_ member.
| const char * UNICHARSET::id_to_unichar | ( | UNICHAR_ID | id | ) | const |
Definition at line 266 of file unicharset.cpp.
{
  // Returns the stored UTF-8 representation of `id`, or INVALID_UNICHAR for
  // INVALID_UNICHAR_ID. Any other out-of-range id trips the assertion.
  if (id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR;
  }
  // Check the lower bound too: a negative id other than INVALID_UNICHAR_ID
  // would otherwise index before the start of `unichars`.
  ASSERT_HOST(id >= 0 && id < this->size());
  return unichars[id].representation;
}
| const char * UNICHARSET::id_to_unichar_ext | ( | UNICHAR_ID | id | ) | const |
Definition at line 274 of file unicharset.cpp.
{
  // Like id_to_unichar(), but a private-use unichar whose string equals the
  // second column of a kCustomLigatures row is returned as that row's first
  // column instead.
  if (id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR;
  }
  // Check the lower bound too: a negative id other than INVALID_UNICHAR_ID
  // would otherwise index before the start of `unichars`.
  ASSERT_HOST(id >= 0 && id < this->size());
  // Resolve from the kCustomLigatures table if this is a private encoding.
  if (get_isprivate(id)) {
    const char* ch = id_to_unichar(id);
    // The table ends with a row whose first entry is NULL.
    for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
      if (!strcmp(ch, kCustomLigatures[i][1])) {
        return kCustomLigatures[i][0];
      }
    }
  }
  // Otherwise return the stored representation.
  return unichars[id].representation;
}
| bool UNICHARSET::is_null_script | ( | const char * | script | ) | const [inline] |
Definition at line 816 of file unicharset.h.
{
// Pointer identity, not string equality: true only for the null_script
// pointer itself (e.g. as returned by get_script_from_script_id for an
// out-of-range id).
// NOTE(review): a different string with the same text compares false --
// confirm this is intended before switching to strcmp.
return script == null_script;
}
| int UNICHARSET::katakana_sid | ( | ) | const [inline] |
Definition at line 838 of file unicharset.h.
{ return katakana_sid_; }  // Plain accessor for the cached katakana_sid_ member.
| int UNICHARSET::latin_sid | ( | ) | const [inline] |
Definition at line 833 of file unicharset.h.
{ return latin_sid_; }  // Plain accessor for the cached latin_sid_ member.
| bool UNICHARSET::load_from_file | ( | FILE * | file, |
| bool | skip_fragments | ||
| ) |
Definition at line 744 of file unicharset.cpp.
{
  // Adapts the FILE* to the fgets-style callback that load_via_fgets
  // consumes. The callback is permanent, so it is deleted explicitly.
  LocalFilePointer wrapped(file);
  TessResultCallback2<char *, char *, int> *reader =
      NewPermanentTessCallback(&wrapped, &LocalFilePointer::fgets);
  const bool loaded = load_via_fgets(reader, skip_fragments);
  delete reader;
  return loaded;
}
| bool UNICHARSET::load_from_file | ( | const char *const | filename, |
| bool | skip_fragments | ||
| ) | [inline] |
Definition at line 346 of file unicharset.h.
{
  // Opens `filename` for binary reading, loads from it and closes it again.
  // Returns false if the file cannot be opened or loading fails.
  FILE* fp = fopen(filename, "rb");
  if (!fp) {
    return false;
  }
  const bool loaded = load_from_file(fp, skip_fragments);
  fclose(fp);
  return loaded;
}
| bool UNICHARSET::load_from_file | ( | const char *const | filename | ) | [inline] |
Definition at line 354 of file unicharset.h.
{
  // Convenience overload: load by name without skipping fragments.
  const bool skip_fragments = false;
  return load_from_file(filename, skip_fragments);
}
| bool UNICHARSET::load_from_file | ( | FILE * | file | ) | [inline] |
Definition at line 361 of file unicharset.h.
{ return load_from_file(file, false); }  // Same as the two-argument overload with skip_fragments == false.
| bool UNICHARSET::load_from_file | ( | tesseract::TFile * | file, |
| bool | skip_fragments | ||
| ) |
Definition at line 753 of file unicharset.cpp.
{
  // Wraps TFile::FGets in the callback that load_via_fgets consumes. The
  // callback is permanent, so it is deleted explicitly.
  TessResultCallback2<char *, char *, int> *reader =
      NewPermanentTessCallback(file, &tesseract::TFile::FGets);
  const bool loaded = load_via_fgets(reader, skip_fragments);
  delete reader;
  return loaded;
}
| bool UNICHARSET::load_from_inmemory_file | ( | const char *const | memory, |
| int | mem_size | ||
| ) | [inline] |
Definition at line 339 of file unicharset.h.
{
  // Convenience overload: load from memory without skipping fragments.
  const bool skip_fragments = false;
  return load_from_inmemory_file(memory, mem_size, skip_fragments);
}
| bool UNICHARSET::load_from_inmemory_file | ( | const char *const | memory, |
| int | mem_size, | ||
| bool | skip_fragments | ||
| ) |
Definition at line 724 of file unicharset.cpp.
{
  // Presents the memory buffer through the fgets-style callback that
  // load_via_fgets consumes. The callback is permanent, so it is deleted
  // explicitly.
  InMemoryFilePointer buffer(memory, mem_size);
  TessResultCallback2<char *, char *, int> *reader =
      NewPermanentTessCallback(&buffer, &InMemoryFilePointer::fgets);
  const bool loaded = load_via_fgets(reader, skip_fragments);
  delete reader;
  return loaded;
}
| bool UNICHARSET::major_right_to_left | ( | ) | const |
Definition at line 931 of file unicharset.cpp.
{
  // True when strictly more unichars have a right-to-left class (RTL,
  // RTL-Arabic or Arabic number) than the left-to-right class. Other
  // classes are not counted on either side.
  int ltr = 0;
  int rtl = 0;
  for (int id = 0; id < size_used; ++id) {
    switch (get_direction(id)) {
      case UNICHARSET::U_LEFT_TO_RIGHT:
        ++ltr;
        break;
      case UNICHARSET::U_RIGHT_TO_LEFT:
      case UNICHARSET::U_RIGHT_TO_LEFT_ARABIC:
      case UNICHARSET::U_ARABIC_NUMBER:
        ++rtl;
        break;
      default:
        break;
    }
  }
  return rtl > ltr;
}
| const GenericVector<UNICHAR_ID>& UNICHARSET::normed_ids | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 783 of file unicharset.h.
{
  // Returns a reference to the normed_ids vector stored in the properties of
  // unichar_id. No range check is performed on unichar_id.
  const UNICHAR_SLOT& slot = unichars[unichar_id];
  return slot.properties.normed_ids;
}
| int UNICHARSET::null_sid | ( | ) | const [inline] |
Definition at line 831 of file unicharset.h.
{
  // Accessor for null_sid_.
  return this->null_sid_;
}
| void UNICHARSET::PartialSetPropertiesFromOther | ( | int | start_index, |
| const UNICHARSET & | src | ||
| ) |
Definition at line 380 of file unicharset.cpp.
{
  // For every unichar from start_index onwards, looks up the properties that
  // src holds for the same UTF-8 string and copies them in. Unichars that
  // src has no properties for are left untouched.
  for (int id = start_index; id < size_used; ++id) {
    UNICHAR_PROPERTIES props;
    if (!src.GetStrProperties(id_to_unichar(id), &props))
      continue;
    // script_id, other_case and mirror are ids in src's numbering, so each
    // is translated through its string form into this set's numbering.
    props.script_id = add_script(src.get_script_from_script_id(props.script_id));
    // When this set lacks the referenced unichar, fall back to the unichar
    // itself.
    const char* other_case_str = src.id_to_unichar(props.other_case);
    props.other_case =
        contains_unichar(other_case_str) ? unichar_to_id(other_case_str) : id;
    const char* mirror_str = src.id_to_unichar(props.mirror);
    props.mirror =
        contains_unichar(mirror_str) ? unichar_to_id(mirror_str) : id;
    unichars[id].properties.CopyFrom(props);
    set_normed_ids(id);
  }
}
| void UNICHARSET::post_load_setup | ( | ) |
Definition at line 867 of file unicharset.cpp.
{
  // Recomputes the derived, set-wide state from the per-unichar properties:
  // top_bottom_set_, script_has_upper_lower_, script_has_xheight_, the cached
  // script ids, default_sid_, and each unichar's normed_ids.

  // Alphas that are upper or lower case, minus alphas that are neither.
  int case_balance = 0;
  // Alphas whose min_top and max_top both lie below / both lie above
  // kMeanlineThreshold.
  int low_top_alphas = 0;
  int high_top_alphas = 0;
  top_bottom_set_ = false;
  for (UNICHAR_ID id = 0; id < size_used; ++id) {
    int min_bottom = 0;
    int max_bottom = MAX_UINT8;
    int min_top = 0;
    int max_top = MAX_UINT8;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    if (min_top > 0) {
      top_bottom_set_ = true;
    }
    if (get_isalpha(id)) {
      const bool has_case = get_islower(id) || get_isupper(id);
      case_balance += has_case ? 1 : -1;
      const bool tops_below =
          min_top < kMeanlineThreshold && max_top < kMeanlineThreshold;
      const bool tops_above =
          min_top > kMeanlineThreshold && max_top > kMeanlineThreshold;
      if (tops_below) {
        ++low_top_alphas;
      } else if (tops_above) {
        ++high_top_alphas;
      }
    }
    set_normed_ids(id);
  }
  script_has_upper_lower_ = case_balance > 0;
  script_has_xheight_ = script_has_upper_lower_ ||
      (low_top_alphas > high_top_alphas * kMinXHeightFraction &&
       high_top_alphas > low_top_alphas * kMinCapHeightFraction);

  // Cache the ids of the scripts that are looked up by name.
  null_sid_ = get_script_id_from_name(null_script);
  ASSERT_HOST(null_sid_ == 0);
  common_sid_ = get_script_id_from_name("Common");
  latin_sid_ = get_script_id_from_name("Latin");
  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
  greek_sid_ = get_script_id_from_name("Greek");
  han_sid_ = get_script_id_from_name("Han");
  hiragana_sid_ = get_script_id_from_name("Hiragana");
  katakana_sid_ = get_script_id_from_name("Katakana");

  // The default script is the one with the most alpha unichars, excluding
  // the common script, as that still contains some "alphas".
  int* alphas_per_script = new int[script_table_size_used];
  for (int s = 0; s < script_table_size_used; ++s) {
    alphas_per_script[s] = 0;
  }
  for (int id = 0; id < size_used; ++id) {
    if (get_isalpha(id)) {
      ++alphas_per_script[get_script(id)];
    }
  }
  int best_sid = 0;
  for (int s = 1; s < script_table_size_used; ++s) {
    if (alphas_per_script[s] > alphas_per_script[best_sid] &&
        s != common_sid_) {
      best_sid = s;
    }
  }
  default_sid_ = best_sid;
  delete [] alphas_per_script;
}
| bool UNICHARSET::PropertiesIncomplete | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 604 of file unicharset.h.
{
  // Reports the result of AnyRangeEmpty() on the properties of unichar_id.
  const UNICHAR_SLOT& slot = unichars[unichar_id];
  return slot.properties.AnyRangeEmpty();
}
| void UNICHARSET::reserve | ( | int | unichars_number | ) |
Definition at line 179 of file unicharset.cpp.
{
  // Grows the slot array to hold unichars_number entries. Does nothing when
  // the current reservation is already at least that large.
  if (unichars_number <= size_reserved)
    return;
  UNICHAR_SLOT* grown = new UNICHAR_SLOT[unichars_number];
  int slot = 0;
  // Carry over the slots that are in use...
  for (; slot < size_used; ++slot)
    grown[slot] = unichars[slot];
  // ...and give every remaining slot the null script.
  for (; slot < unichars_number; ++slot)
    grown[slot].properties.script_id = add_script(null_script);
  delete[] unichars;
  unichars = grown;
  size_reserved = unichars_number;
}
| bool UNICHARSET::save_to_file | ( | const char *const | filename | ) | const [inline] |
Definition at line 306 of file unicharset.h.
{
  // Writes the unicharset to the named file, opened with mode "w+b".
  // Returns false if the file cannot be opened, if the FILE* overload
  // reports failure, or if closing the file fails.
  FILE* file = fopen(filename, "w+b");
  if (file == NULL) return false;
  const bool written = save_to_file(file);
  // fclose() flushes buffered output, so a failure here means the data may
  // not have reached the file even though the write itself succeeded.
  // The file is closed unconditionally so the handle never leaks.
  const bool closed = fclose(file) == 0;
  return written && closed;
}
| bool UNICHARSET::save_to_file | ( | FILE * | file | ) | const [inline] |
Definition at line 316 of file unicharset.h.
{
  // Serializes the set with save_to_string and writes the result to file as
  // one fwrite item. Returns true only if both steps succeed.
  STRING serialized;
  const bool ok =
      save_to_string(&serialized) &&
      fwrite(&serialized[0], serialized.length(), 1, file) == 1;
  return ok;
}
| bool UNICHARSET::save_to_file | ( | tesseract::TFile * | file | ) | const [inline] |
Definition at line 322 of file unicharset.h.
{
  // Serializes the set with save_to_string and writes the result to the
  // TFile as one FWrite item. Returns true only if both steps succeed.
  STRING serialized;
  const bool ok =
      save_to_string(&serialized) &&
      file->FWrite(&serialized[0], serialized.length(), 1) == 1;
  return ok;
}
| bool UNICHARSET::save_to_string | ( | STRING * | str | ) | const |
Definition at line 661 of file unicharset.cpp.
{
  // Renders the whole set as text into *str: first a line holding the number
  // of unichars, then one line per unichar. Always returns true.
  const int kFileBufSize = 1024;
  char line[kFileBufSize + 1];
  const int count = size();
  snprintf(line, kFileBufSize, "%d\n", count);
  *str = line;
  for (UNICHAR_ID id = 0; id < count; ++id) {
    int min_bottom, max_bottom, min_top, max_top;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    float width, width_sd;
    get_width_stats(id, &width, &width_sd);
    float bearing, bearing_sd;
    get_bearing_stats(id, &bearing, &bearing_sd);
    float advance, advance_sd;
    get_advance_stats(id, &advance, &advance_sd);
    const unsigned int properties = get_properties(id);
    const char* script_name = get_script_from_script_id(get_script(id));
    const bool is_space = strcmp(id_to_unichar(id), " ") == 0;
    if (is_space) {
      // The " " unichar is written under the name "NULL", in a short form
      // carrying only the properties, script and other-case fields.
      snprintf(line, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
               script_name, get_other_case(id));
    } else {
      // Full form: representation, properties, the ten metric fields,
      // script, other case, direction, mirror, normed form, then a tab and
      // a "# " comment holding the debug string.
      snprintf(line, kFileBufSize,
               "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
               id_to_unichar(id), properties,
               min_bottom, max_bottom, min_top, max_top,
               width, width_sd, bearing, bearing_sd, advance, advance_sd,
               script_name,
               get_other_case(id), get_direction(id), get_mirror(id),
               get_normed_unichar(id),
               debug_str(id).string());
    }
    *str += line;
  }
  return true;
}
| bool UNICHARSET::script_has_upper_lower | ( | ) | const [inline] |
Definition at line 842 of file unicharset.h.
{
  // Accessor for script_has_upper_lower_.
  return this->script_has_upper_lower_;
}
| bool UNICHARSET::script_has_xheight | ( | ) | const [inline] |
Definition at line 849 of file unicharset.h.
{
  // Accessor for script_has_xheight_.
  return this->script_has_xheight_;
}
| void UNICHARSET::set_advance_stats | ( | UNICHAR_ID | unichar_id, |
| float | advance, | ||
| float | advance_sd | ||
| ) | [inline] |
Definition at line 598 of file unicharset.h.
{
  // Stores advance and advance_sd in the properties of unichar_id.
  // No range check is performed on unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.advance = advance;
  slot.properties.advance_sd = advance_sd;
}
| void UNICHARSET::set_bearing_stats | ( | UNICHAR_ID | unichar_id, |
| float | bearing, | ||
| float | bearing_sd | ||
| ) | [inline] |
Definition at line 581 of file unicharset.h.
{
  // Stores bearing and bearing_sd in the properties of unichar_id.
  // No range check is performed on unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.bearing = bearing;
  slot.properties.bearing_sd = bearing_sd;
}
| void UNICHARSET::set_black_and_whitelist | ( | const char * | blacklist, |
| const char * | whitelist, | ||
| const char * | unblacklist | ||
| ) |
Definition at line 948 of file unicharset.cpp.
{
  // Recomputes the enabled flag of every unichar from the three lists.
  // A NULL or empty list is skipped. With no whitelist, everything starts
  // enabled; with a whitelist, everything starts disabled.
  const bool no_whitelist = whitelist == NULL || whitelist[0] == '\0';
  for (int id = 0; id < size_used; ++id)
    unichars[id].properties.enabled = no_whitelist;

  // The lists are applied in this order, so later ones override earlier
  // ones: whitelist enables, blacklist disables, unblacklist re-enables.
  const char* const lists[] = {whitelist, blacklist, unblacklist};
  const bool enable[] = {true, false, true};
  for (int pass = 0; pass < 3; ++pass) {
    const char* list = lists[pass];
    if (list == NULL || list[0] == '\0')
      continue;
    GenericVector<UNICHAR_ID> encoding;
    encode_string(list, false, &encoding, NULL, NULL);
    for (int i = 0; i < encoding.size(); ++i) {
      // Parts of the list that could not be encoded are ignored.
      if (encoding[i] != INVALID_UNICHAR_ID)
        unichars[encoding[i]].properties.enabled = enable[pass];
    }
  }
}
| void UNICHARSET::set_direction | ( | UNICHAR_ID | unichar_id, |
| UNICHARSET::Direction | value | ||
| ) | [inline] |
Definition at line 430 of file unicharset.h.
{
  // Stores value as the direction property of unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.direction = value;
}
| void UNICHARSET::set_isalpha | ( | UNICHAR_ID | unichar_id, |
| bool | value | ||
| ) | [inline] |
Definition at line 389 of file unicharset.h.
{
  // Stores value as the isalpha property of unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.isalpha = value;
}
| void UNICHARSET::set_isdigit | ( | UNICHAR_ID | unichar_id, |
| bool | value | ||
| ) | [inline] |
Definition at line 404 of file unicharset.h.
{
  // Stores value as the isdigit property of unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.isdigit = value;
}
| void UNICHARSET::set_islower | ( | UNICHAR_ID | unichar_id, |
| bool | value | ||
| ) | [inline] |
Definition at line 394 of file unicharset.h.
{
  // Stores value as the islower property of unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.islower = value;
}
| void UNICHARSET::set_isngram | ( | UNICHAR_ID | unichar_id, |
| bool | value | ||
| ) | [inline] |
Definition at line 414 of file unicharset.h.
{
  // Stores value as the isngram property of unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.isngram = value;
}
| void UNICHARSET::set_ispunctuation | ( | UNICHAR_ID | unichar_id, |
| bool | value | ||
| ) | [inline] |
Definition at line 409 of file unicharset.h.
{
  // Stores value as the ispunctuation property of unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.ispunctuation = value;
}
| void UNICHARSET::set_isupper | ( | UNICHAR_ID | unichar_id, |
| bool | value | ||
| ) | [inline] |
Definition at line 399 of file unicharset.h.
{
  // Stores value as the isupper property of unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.isupper = value;
}
| void UNICHARSET::set_mirror | ( | UNICHAR_ID | unichar_id, |
| UNICHAR_ID | mirror | ||
| ) | [inline] |
Definition at line 435 of file unicharset.h.
{
  // Stores mirror as the mirror property of unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.mirror = mirror;
}
| void UNICHARSET::set_normed | ( | UNICHAR_ID | unichar_id, |
| const char * | normed | ||
| ) | [inline] |
Definition at line 440 of file unicharset.h.
{
  // Stores normed as the normed string of unichar_id and empties the
  // normed_ids vector held alongside it.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.normed = normed;
  slot.properties.normed_ids.truncate(0);
}
| void UNICHARSET::set_normed_ids | ( | UNICHAR_ID | unichar_id | ) |
Definition at line 348 of file unicharset.cpp.
{
  // Rebuilds the normed_ids vector of unichar_id from its normed string.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.normed_ids.truncate(0);

  // The space unichar maps to itself without going through encode_string.
  const bool is_space =
      unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ';
  if (is_space) {
    slot.properties.normed_ids.push_back(UNICHAR_SPACE);
    return;
  }

  // Otherwise encode the normed string into normed_ids.
  if (encode_string(slot.properties.normed.string(), true,
                    &slot.properties.normed_ids, NULL, NULL)) {
    return;
  }

  // Encoding failed: discard whatever was written and fall back to the
  // unichar's own id.
  slot.properties.normed_ids.truncate(0);
  slot.properties.normed_ids.push_back(unichar_id);
}
| void UNICHARSET::set_other_case | ( | UNICHAR_ID | unichar_id, |
| UNICHAR_ID | other_case | ||
| ) | [inline] |
Definition at line 425 of file unicharset.h.
{
  // Stores other_case as the other_case property of unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.other_case = other_case;
}
| void UNICHARSET::set_ranges_empty | ( | ) |
Definition at line 371 of file unicharset.cpp.
{
  // Calls SetRangesEmpty() on the properties of every unichar in use.
  int id = 0;
  while (id < size_used) {
    unichars[id].properties.SetRangesEmpty();
    ++id;
  }
}
| void UNICHARSET::set_script | ( | UNICHAR_ID | unichar_id, |
| const char * | value | ||
| ) | [inline] |
Definition at line 420 of file unicharset.h.
{
  // Passes the script name through add_script and stores the id it returns
  // as the script_id property of unichar_id.
  this->unichars[unichar_id].properties.script_id = this->add_script(value);
}
| void UNICHARSET::set_top_bottom | ( | UNICHAR_ID | unichar_id, |
| int | min_bottom, | ||
| int | max_bottom, | ||
| int | min_top, | ||
| int | max_top | ||
| ) | [inline] |
Definition at line 540 of file unicharset.h.
{
  // Stores the four bounds in the properties of unichar_id. Each value is
  // first clipped to [0, MAX_UINT8] and then narrowed to uinT8.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.min_bottom =
      static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
  slot.properties.max_bottom =
      static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
  slot.properties.min_top =
      static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
  slot.properties.max_top =
      static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
}
| void UNICHARSET::set_width_stats | ( | UNICHAR_ID | unichar_id, |
| float | width, | ||
| float | width_sd | ||
| ) | [inline] |
Definition at line 565 of file unicharset.h.
{
  // Stores width and width_sd in the properties of unichar_id.
  // No range check is performed on unichar_id.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  slot.properties.width = width;
  slot.properties.width_sd = width_sd;
}
| void UNICHARSET::SetPropertiesFromOther | ( | const UNICHARSET & | src | ) | [inline] |
Definition at line 503 of file unicharset.h.
{
  // Copies properties from src for every unichar, i.e. the partial version
  // started at index 0.
  const int first_index = 0;
  PartialSetPropertiesFromOther(first_index, src);
}
| int UNICHARSET::size | ( | ) | const [inline] |
Definition at line 297 of file unicharset.h.
{
  // Accessor for size_used.
  return this->size_used;
}
| bool UNICHARSET::SizesDistinct | ( | UNICHAR_ID | id1, |
| UNICHAR_ID | id2 | ||
| ) | const |
Definition at line 472 of file unicharset.cpp.
| int UNICHARSET::step | ( | const char * | str | ) | const |
Definition at line 211 of file unicharset.cpp.
{
  // Encodes str and returns the length recorded for its first unichar, or 0
  // when nothing was encoded or the first id is INVALID_UNICHAR_ID.
  GenericVector<UNICHAR_ID> encoded_ids;
  GenericVector<char> encoded_lengths;
  encode_string(str, true, &encoded_ids, &encoded_lengths, NULL);
  const bool first_is_valid =
      !encoded_ids.empty() && encoded_ids[0] != INVALID_UNICHAR_ID;
  if (!first_is_valid) {
    return 0;
  }
  return encoded_lengths[0];
}
| UNICHAR_ID UNICHARSET::to_lower | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 652 of file unicharset.h.
{
  // INVALID_UNICHAR_ID passes straight through. A unichar flagged islower is
  // returned unchanged; anything else yields its other_case property.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR_ID;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_SLOT& slot = unichars[unichar_id];
  return slot.properties.islower ? unichar_id : slot.properties.other_case;
}
| UNICHAR_ID UNICHARSET::to_upper | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 660 of file unicharset.h.
{
  // INVALID_UNICHAR_ID passes straight through. A unichar flagged isupper is
  // returned unchanged; anything else yields its other_case property.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR_ID;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_SLOT& slot = unichars[unichar_id];
  return slot.properties.isupper ? unichar_id : slot.properties.other_case;
}
| bool UNICHARSET::top_bottom_useful | ( | ) | const [inline] |
Definition at line 495 of file unicharset.h.
{
  // Accessor for top_bottom_set_.
  return this->top_bottom_set_;
}
| void UNICHARSET::unichar_insert | ( | const char *const | unichar_repr | ) |
Definition at line 612 of file unicharset.cpp.
{
  // Appends unichar_repr as a new unichar. Does nothing if it is already
  // present. A representation longer than UNICHAR_LEN is reported on stderr
  // and not inserted.
  if (ids.contains(unichar_repr)) {
    return;
  }
  const size_t repr_len = strlen(unichar_repr);
  if (repr_len > UNICHAR_LEN) {
    fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
            static_cast<int>(repr_len), unichar_repr);
    return;
  }
  // Out of room: start at 8 slots, afterwards double the capacity.
  if (size_used == size_reserved) {
    reserve(size_used == 0 ? 8 : 2 * size_used);
  }
  strcpy(unichars[size_used].representation, unichar_repr);
  set_script(size_used, null_script);
  // If unichar_repr describes a character fragment, parse_from_string
  // returns a CHAR_FRAGMENT holding the parsed information (NULL otherwise),
  // which is stored as the fragment property. A fragment takes the script
  // of its base unichar when that unichar is already in the set.
  CHAR_FRAGMENT *fragment = CHAR_FRAGMENT::parse_from_string(unichar_repr);
  unichars[size_used].properties.fragment = fragment;
  if (fragment != NULL && contains_unichar(fragment->get_unichar())) {
    unichars[size_used].properties.script_id =
        get_script(fragment->get_unichar());
  }
  unichars[size_used].properties.enabled = true;
  ids.insert(unichar_repr, size_used);
  ++size_used;
}
| UNICHAR_ID UNICHARSET::unichar_to_id | ( | const char *const | unichar_repr, |
| int | length | ||
| ) | const |
Definition at line 199 of file unicharset.cpp.
{
  // Looks up the first length bytes of unichar_repr. Returns the matching
  // id, or INVALID_UNICHAR_ID when the set has no such unichar.
  assert(length > 0 && length <= UNICHAR_LEN);
  if (!ids.contains(unichar_repr, length)) {
    return INVALID_UNICHAR_ID;
  }
  return ids.unichar_to_id(unichar_repr, length);
}
| UNICHAR_ID UNICHARSET::unichar_to_id | ( | const char *const | unichar_repr | ) | const |
Definition at line 194 of file unicharset.cpp.
{
  // Looks up unichar_repr. Returns the matching id, or INVALID_UNICHAR_ID
  // when the set has no such unichar.
  if (!ids.contains(unichar_repr)) {
    return INVALID_UNICHAR_ID;
  }
  return ids.unichar_to_id(unichar_repr);
}
const char * UNICHARSET::kCustomLigatures [static] |
{
// Each entry pairs a two-letter sequence with a single code point in the
// range U+E003..U+E009, which lies in the Unicode Private Use Area.
// NOTE(review): presumably the code point stands in for the ligature of the
// two letters -- confirm against the code that consumes this table.
{"ct", "\uE003"},
{"ſh", "\uE006"},
{"ſi", "\uE007"},
{"ſl", "\uE008"},
{"ſſ", "\uE009"},
// NOTE(review): the {NULL, NULL} pair looks like an end-of-table sentinel --
// confirm against the code that iterates this table.
{NULL, NULL}
}
Definition at line 144 of file unicharset.h.
const char * UNICHARSET::kSpecialUnicharCodes [static] |
{
// String representations of the special unichars.
// NOTE(review): the order presumably matches the special unichar id
// constants (UNICHAR_SPACE first) -- confirm against their declaration.
// The " " representation is the one save_to_string writes out as "NULL".
" ",
"Joined",
"|Broken|0|1"
}
Definition at line 147 of file unicharset.h.