22 #include "config_auto.h" 57 static const
char kPermuterTypeNoPerm[] = "None";
58 static const
char kPermuterTypePuncPerm[] = "Punctuation";
59 static const
char kPermuterTypeTopPerm[] = "Top Choice";
60 static const
char kPermuterTypeLowerPerm[] = "Top Lower Case";
61 static const
char kPermuterTypeUpperPerm[] = "Top Upper Case";
62 static const
char kPermuterTypeNgramPerm[] = "Ngram";
63 static const
char kPermuterTypeNumberPerm[] = "Number";
64 static const
char kPermuterTypeUserPatPerm[] = "User Pattern";
65 static const
char kPermuterTypeSysDawgPerm[] = "System Dictionary";
66 static const
char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
67 static const
char kPermuterTypeUserDawgPerm[] = "User Dictionary";
68 static const
char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
69 static const
char kPermuterTypeCompoundPerm[] = "Compound";
71 static const
char * const kPermuterTypeNames[] = {
73 kPermuterTypePuncPerm,
75 kPermuterTypeLowerPerm,
76 kPermuterTypeUpperPerm,
77 kPermuterTypeNgramPerm,
78 kPermuterTypeNumberPerm,
79 kPermuterTypeUserPatPerm,
80 kPermuterTypeSysDawgPerm,
81 kPermuterTypeDocDawgPerm,
82 kPermuterTypeUserDawgPerm,
83 kPermuterTypeFreqDawgPerm,
84 kPermuterTypeCompoundPerm
100 unichar_id_ = src_unichar_id;
101 rating_ = src_rating;
102 certainty_ = src_cert;
105 script_id_ = src_script_id;
124 matrix_cell_ = other.matrix_cell_;
125 min_xheight_ = other.min_xheight_;
126 max_xheight_ = other.max_xheight_;
128 classifier_ = other.classifier_;
129 fonts_ = other.fonts_;
141 matrix_cell_ = other.matrix_cell_;
142 min_xheight_ = other.min_xheight_;
143 max_xheight_ = other.max_xheight_;
145 classifier_ = other.classifier_;
146 fonts_ = other.fonts_;
157 tprintf(
"Baseline diff %g for %d v %d\n",
158 baseline_diff, unichar_id_, other.unichar_id_);
164 double denominator =
ClipToRange(std::min(this_range, other_range),
168 overlap /= denominator;
170 tprintf(
"PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
171 unichar_id_, other.unichar_id_, baseline_diff,
172 this_range, other_range, denominator, overlap);
181 BLOB_CHOICE_LIST* bc_list) {
183 BLOB_CHOICE_IT choice_it(bc_list);
184 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
185 choice_it.forward()) {
195 return kPermuterTypeNames[permuter];
201 switch (script_pos) {
220 : unicharset_(&unicharset){
224 if (unicharset.
encode_string(cleaned.c_str(),
true, &encoding, &lengths,
226 lengths.push_back(
'\0');
227 STRING src_lengths = &lengths[0];
246 const char *src_lengths,
249 uint8_t src_permuter) {
250 int src_string_len = strlen(src_string);
251 if (src_string_len == 0) {
254 this->
init(src_lengths ? strlen(src_lengths): src_string_len);
257 for (
int i = 0; i < length_; ++i) {
258 int unichar_length = src_lengths ? src_lengths[i] : 1;
260 unicharset_->
unichar_to_id(src_string+offset, unichar_length);
262 certainties_[i] = src_certainty;
263 offset += unichar_length;
266 adjust_factor_ = 1.0f;
267 rating_ = src_rating;
268 certainty_ = src_certainty;
269 permuter_ = src_permuter;
270 dangerous_ambig_found_ =
false;
277 delete[] unichar_ids_;
278 delete[] script_pos_;
280 delete[] certainties_;
284 return kPermuterTypeNames[permuter_];
292 BLOB_CHOICE_LIST* result = ratings->
get(coord.
col, coord.
row);
293 if (result ==
nullptr) {
294 result =
new BLOB_CHOICE_LIST;
295 ratings->
put(coord.
col, coord.
row, result);
304 for (
int i = 0; i < index; ++i)
306 int row = col + state_[index] - 1;
314 unichar_ids_[index] = blob_choice->
unichar_id();
316 state_[index] = blob_count;
317 certainties_[index] = blob_choice->
certainty();
327 for (
int i = 0; i < length_; ++i) {
328 if (unichar_ids_[i] == unichar_id) {
345 for (
int i = 0; i < num; ++i) {
347 state_[start - 1] += state_[start + i];
348 else if (start + num < length_)
349 state_[start + num] += state_[start + i];
351 for (
int i = start; i + num < length_; ++i) {
352 unichar_ids_[i] = unichar_ids_[i + num];
353 script_pos_[i] = script_pos_[i + num];
354 state_[i] = state_[i + num];
355 certainties_[i] = certainties_[i + num];
366 for (
int i = 0; i < length_ / 2; ++i) {
368 unichar_ids_[i] = unicharset_->
get_mirror(unichar_ids_[length_-1-i]);
369 unichar_ids_[length_-1-i] = unicharset_->
get_mirror(tmp_id);
371 if (length_ % 2 != 0) {
372 unichar_ids_[length_/2] = unicharset_->
get_mirror(unichar_ids_[length_/2]);
386 while (*start <
length() &&
405 while (start < end &&
417 if (end < start) { end = start; }
419 for (
int i = start; i < end; i++) {
421 unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
433 for (i = 0; i < length_; ++i) {
450 STRING *word_lengths_str)
const {
452 if (word_lengths_str !=
nullptr) *word_lengths_str =
"";
453 for (
int i = 0; i < length_; ++i) {
456 if (word_lengths_str !=
nullptr) {
457 *word_lengths_str += strlen(ch);
471 if (length_ == reserved_) {
487 while (reserved_ < length_ + second.
length()) {
491 for (
int i = 0; i < second.
length(); ++i) {
492 unichar_ids_[length_ + i] = other_unichar_ids[i];
493 state_[length_ + i] = second.state_[i];
494 certainties_[length_ + i] = second.certainties_[i];
497 length_ += second.
length();
498 if (second.adjust_factor_ > adjust_factor_)
499 adjust_factor_ = second.adjust_factor_;
500 rating_ += second.
rating();
503 if (second.dangerous_ambig_found_)
504 dangerous_ambig_found_ =
true;
522 while (reserved_ < source.
length()) {
526 unicharset_ = source.unicharset_;
528 for (
int i = 0; i < source.
length(); ++i) {
529 unichar_ids_[i] = other_unichar_ids[i];
530 state_[i] = source.state_[i];
531 certainties_[i] = source.certainties_[i];
534 length_ = source.
length();
535 adjust_factor_ = source.adjust_factor_;
536 rating_ = source.
rating();
541 dangerous_ambig_found_ = source.dangerous_ambig_found_;
552 for (
int i = 0; i < length_; ++i)
558 int position_counts[4] = { 0, 0, 0, 0 };
561 for (
int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
565 if (state_ !=
nullptr) {
566 for (
int i = 1; i < state_[blob_index]; ++i) {
568 tblob = word->
blobs[chunk_index];
577 position_counts[script_pos_[blob_index]]++;
584 tprintf(
"Most characters of %s are subscript or superscript.\n" 585 "That seems wrong, so I'll assume we got the baseline wrong\n",
588 for (
int i = 0; i < length_; i++) {
591 position_counts[sp]--;
602 for (
int blob_index = 0; blob_index < length_; ++blob_index) {
608 chunk_index += state_ !=
nullptr ? state_[blob_index] : 1;
616 if (positions != script_pos_) {
617 delete [] script_pos_;
619 memcpy(script_pos_, positions,
sizeof(positions[0]) * length);
624 for (
int i = 0; i < length_; ++i)
625 script_pos_[i] = position;
631 const TBOX& blob_box,
634 int top = blob_box.
top();
635 int bottom = blob_box.
bottom();
636 int min_bottom, max_bottom, min_top, max_top;
638 &min_bottom, &max_bottom,
646 }
else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
648 }
else if (bottom > sup_thresh_bot) {
654 tprintf(
"%s Character %s[bot:%d top: %d] " 655 "bot_range[%d,%d] top_range[%d, %d] " 656 "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
659 min_bottom, max_bottom, min_top, max_top,
660 sub_thresh_bot, sub_thresh_top,
669 int *sid =
new int[max_script];
671 for (x = 0; x < max_script; x++) sid[x] = 0;
672 for (x = 0; x < length_; ++x) {
690 for (x = 1; x < max_script; x++)
691 if (sid[x] >= sid[max_sid]) max_sid = x;
692 if (sid[max_sid] < length_ / 2)
700 int total_chunks = 0;
701 for (
int i = 0; i < length_; ++i) {
702 total_chunks += state_[i];
703 if (total_chunks > blob_position) {
712 int total_chunks = 0;
713 for (
int i = 0; i < length_; ++i) {
714 total_chunks += state_[i];
726 for (
int i = 0; i < length_; ++i) {
729 tprintf(
" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
730 rating_, certainty_, adjust_factor_, permuter_,
731 min_x_height_, max_x_height_, dangerous_ambig_found_);
733 for (
int i = 0; i < length_; ++i) {
737 for (
int i = 0; i < length_; ++i) {
741 for (
int i = 0; i < length_; ++i) {
745 for (
int i = 0; i < length_; ++i) {
746 tprintf(
"\t%.3f", certainties_[i]);
754 for (
int i = 0; i < length_; ++i)
762 #ifndef GRAPHICS_DISABLED 764 const int kNumColors = 6;
768 bool already_done = prev_drawn_state.
size() == length_;
769 if (!already_done) prev_drawn_state.
init_to_size(length_, 0);
770 for (
int i = 0; i < length_; ++i) {
771 if (prev_drawn_state[i] != state_[i]) {
772 already_done =
false;
774 prev_drawn_state[i] = state_[i];
776 if (already_done || word->
blobs.
empty())
return;
779 if (segm_window ==
nullptr) {
780 segm_window =
new ScrollView(
"Segmentation", 5, 10, 500, 256,
781 2000.0, 256.0,
true);
783 segm_window->
Clear();
788 for (
int c = 0; c < length_; ++c) {
791 for (
int i = 0; i < state_[c]; ++i, ++blob_index) {
794 blob->
plot(segm_window, color, color);
808 if (word2.
unicharset() != uchset)
return false;
813 if (w1end - w1start != w2end - w2start)
return false;
814 for (
int i = 0; i < w1end - w1start; i++) {
834 BLOB_CHOICE_LIST *ratings,
836 if (ratings->length() == 0) {
844 c_it.set_to_list(ratings);
845 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
846 c_it.data()->print(¤t_unicharset);
847 if (!c_it.at_last())
tprintf(
"\n");
UNICHAR_ID unichar_id() const
float max_xheight() const
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
const UNICHAR_ID * unichar_ids() const
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
void operator=(const ELIST_LINK &)
void UpdateStateForSplit(int blob_position)
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
GenericVector< TBLOB * > blobs
void ZoomToRectangle(int x1, int y1, int x2, int y2)
WERD_CHOICE & operator+=(const WERD_CHOICE &second)
TBOX bounding_box() const
const STRING & unichar_string() const
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
bool contains_unichar_id(UNICHAR_ID unichar_id) const
const int kMinSubscriptOffset
float min_x_height() const
MATRIX_COORD MatrixCoord(int index) const
static std::string CleanupString(const char *utf8_str)
Direction get_direction(UNICHAR_ID unichar_id) const
const char * id_to_unichar(UNICHAR_ID id) const
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
WERD_CHOICE & operator=(const WERD_CHOICE &source)
const double kMaxOverlapDenominator
void punct_stripped(int *start_core, int *end_core) const
void init_to_size(int size, const T &t)
const double kMinXHeightMatch
int GetTopScriptID() const
const char * id_to_unichar_ext(UNICHAR_ID id) const
float min_xheight() const
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
bool get_isdigit(UNICHAR_ID unichar_id) const
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
void DisplaySegmentation(TWERD *word)
const int kBlnBaselineOffset
#define ELISTIZE(CLASSNAME)
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
int16_t fontinfo_id() const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
const int kMinSuperscriptOffset
char window_wait(ScrollView *win)
void put(ICOORD pos, const T &thing)
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
const char * string() const
DLLSYM void tprintf(const char *format,...)
const int kMaxDropCapBottom
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
void make_bad()
Set the fields in this choice to be default (bad) values.
WERD_CHOICE(const UNICHARSET *unicharset)
const char * permuter_name() const
int get_script_table_size() const
bool has_rtl_unichar_id() const
void GetNonSuperscriptSpan(int *start, int *end) const
UNICHAR_ID unichar_id(int index) const
const char * ScriptPosToString(enum ScriptPos script_pos)
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
int16_t fontinfo_id2() const
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
void remove_unichar_ids(int index, int num)
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
tesseract::ScriptPos BlobPosition(int index) const
float max_x_height() const
void reverse_and_mirror_unichar_ids()
int get_script(UNICHAR_ID unichar_id) const
int TotalOfStates() const
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
WERD_CHOICE shallow_copy(int start, int end) const
void print_state(const char *msg) const
const double kMaxBaselineDrift
const UNICHARSET * unicharset() const
void SetAllScriptPositions(tesseract::ScriptPos position)