74 int16_t err_count = 0;
99 *accepted_match_count = 0;
127 int expected_outline_count;
129 if (
STRING (outlines_odd).contains (c))
131 else if (
STRING (outlines_2).contains (c))
132 expected_outline_count = 2;
134 expected_outline_count = 1;
135 return abs (outline_count - expected_outline_count);
139 bool good_quality_doc) {
140 if ((tessedit_good_quality_unrej && good_quality_doc))
141 unrej_good_quality_words(page_res_it);
142 doc_and_block_rejection(page_res_it, good_quality_doc);
143 if (unlv_tilde_crunching) {
144 tilde_crunch(page_res_it);
145 tilde_delete(page_res_it);
168 while (page_res_it.
word () !=
nullptr) {
169 check_debug_pt (page_res_it.
word (), 100);
171 word = page_res_it.
word ();
173 if (word->
reject_map[i].accept_if_good_quality ())
181 quality_rowrej_pc)) {
182 word = page_res_it.
word ();
184 (tessedit_unrej_any_wd ||
185 acceptable_word_string(*word->
uch_set,
189 unrej_good_chs(word, page_res_it.
row ()->
row);
195 current_row = page_res_it.
row ();
196 while ((page_res_it.
word () !=
nullptr) &&
197 (page_res_it.
row () == current_row))
200 check_debug_pt (page_res_it.
word (), 110);
205 current_block =
nullptr;
206 current_row =
nullptr;
207 while (page_res_it.
word () !=
nullptr) {
208 if (current_block != page_res_it.
block ()) {
209 current_block = page_res_it.
block ();
213 if (current_row != page_res_it.
row ()) {
214 current_row = page_res_it.
row ();
234 bool good_quality_doc) {
235 int16_t block_no = 0;
241 bool prev_word_rejected;
242 int16_t char_quality = 0;
243 int16_t accepted_char_quality;
248 if (tessedit_debug_doc_rejection) {
249 tprintf(
"REJECT ALL #chars: %d #Rejects: %d; \n",
254 if (tessedit_debug_doc_rejection) {
255 tprintf(
"NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
264 while ((word = page_res_it.
word()) !=
nullptr) {
265 current_block = page_res_it.
block();
269 tessedit_reject_block_percent) {
270 if (tessedit_debug_block_rejection) {
271 tprintf(
"REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
275 prev_word_rejected =
false;
276 while ((word = page_res_it.
word()) !=
nullptr &&
277 (page_res_it.
block() == current_block)) {
278 if (tessedit_preserve_blk_rej_perfect_wds) {
281 if (rej_word && tessedit_dont_blkrej_good_wds &&
283 acceptable_word_string(
288 word_char_quality(word, page_res_it.
row()->
row,
290 &accepted_char_quality);
302 if (tessedit_use_reject_spaces &&
303 prev_word_rejected &&
309 prev_word_rejected = rej_word;
313 if (tessedit_debug_block_rejection) {
314 tprintf(
"NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
321 while (page_res_it.
word() !=
nullptr &&
322 page_res_it.
block() == current_block) {
323 current_row = page_res_it.
row();
332 tessedit_reject_row_percent &&
335 tessedit_whole_wd_rej_row_percent) {
336 if (tessedit_debug_block_rejection) {
337 tprintf(
"REJECTING ROW %d #chars: %d; #Rejects: %d\n",
341 prev_word_rejected =
false;
342 while ((word = page_res_it.
word()) !=
nullptr &&
343 page_res_it.
row () == current_row) {
345 if (!tessedit_row_rej_good_docs && good_quality_doc) {
348 tessedit_good_doc_still_rowrej_wd;
349 }
else if (tessedit_preserve_row_rej_perfect_wds) {
353 if (rej_word && tessedit_dont_rowrej_good_wds &&
355 acceptable_word_string(*word->
uch_set,
359 word_char_quality(word, page_res_it.
row()->
row,
361 &accepted_char_quality);
373 if (tessedit_use_reject_spaces &&
374 prev_word_rejected &&
380 prev_word_rejected = rej_word;
384 if (tessedit_debug_block_rejection) {
385 tprintf(
"NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
388 while (page_res_it.
word() !=
nullptr &&
389 page_res_it.
row() == current_row)
408 while (page_res_it.
word () !=
nullptr) {
421 bool prev_potential_marked =
false;
422 bool found_terrible_word =
false;
426 while (page_res_it.
word() !=
nullptr) {
428 if (pb !=
nullptr && !pb->
IsText()) {
432 word = page_res_it.
word();
434 if (crunch_early_convert_bad_unlv_chs)
435 convert_bad_unlv_chs(word);
437 if (crunch_early_merge_tess_fails)
441 found_terrible_word =
false;
443 prev_potential_marked =
false;
446 ok_dict_word = safe_dict_word(word);
447 garbage_level = garbage_word(word, ok_dict_word);
450 (terrible_word_crunch (word, garbage_level))) {
451 if (crunch_debug > 0) {
452 tprintf (
"T CRUNCHING: \"%s\"\n",
456 if (prev_potential_marked) {
458 if (crunch_debug > 0) {
459 tprintf (
"P1 CRUNCHING: \"%s\"\n",
465 prev_potential_marked =
false;
467 found_terrible_word =
true;
470 (potential_word_crunch (word,
471 garbage_level, ok_dict_word))) {
472 if (found_terrible_word) {
473 if (crunch_debug > 0) {
474 tprintf (
"P2 CRUNCHING: \"%s\"\n",
479 else if (!prev_potential_marked) {
480 copy_it = page_res_it;
481 prev_potential_marked =
true;
482 if (crunch_debug > 1) {
483 tprintf (
"P3 CRUNCHING: \"%s\"\n",
489 found_terrible_word =
false;
491 prev_potential_marked =
false;
492 if (crunch_debug > 2) {
493 tprintf (
"NO CRUNCH: \"%s\"\n",
515 if (adjusted_len > crunch_rating_max)
516 adjusted_len = crunch_rating_max;
519 if (rating_per_ch > crunch_terrible_rating)
521 else if (crunch_terrible_garbage && (garbage_level ==
G_TERRIBLE))
524 (garbage_level !=
G_OK))
526 else if ((rating_per_ch > crunch_poor_garbage_rate) &&
527 (garbage_level !=
G_OK))
530 if (crunch_mode > 0) {
531 if (crunch_debug > 2) {
532 tprintf (
"Terrible_word_crunch (%d) on \"%s\"\n",
548 bool word_crunchable;
549 int poor_indicator_count = 0;
551 word_crunchable = !crunch_leave_accept_strings ||
553 (acceptable_word_string(*word->
uch_set,
558 if (adjusted_len > 10)
562 if (rating_per_ch > crunch_pot_poor_rate) {
563 if (crunch_debug > 2) {
564 tprintf(
"Potential poor rating on \"%s\"\n",
567 poor_indicator_count++;
570 if (word_crunchable &&
572 if (crunch_debug > 2) {
573 tprintf(
"Potential poor cert on \"%s\"\n",
576 poor_indicator_count++;
579 if (garbage_level !=
G_OK) {
580 if (crunch_debug > 2) {
581 tprintf(
"Potential garbage on \"%s\"\n",
584 poor_indicator_count++;
586 return poor_indicator_count >= crunch_pot_indicators;
592 bool deleting_from_bol =
false;
593 bool marked_delete_point =
false;
594 int16_t debug_delete_mode;
596 int16_t x_debug_delete_mode;
600 while (page_res_it.
word() !=
nullptr) {
601 word = page_res_it.
word();
603 delete_mode = word_deletable (word, debug_delete_mode);
606 if (crunch_debug > 0) {
607 tprintf (
"BOL CRUNCH DELETING(%d): \"%s\"\n",
612 deleting_from_bol =
true;
614 if (marked_delete_point) {
616 x_delete_mode = word_deletable (copy_it.
word (),
617 x_debug_delete_mode);
618 if (crunch_debug > 0) {
619 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
627 if (crunch_debug > 0) {
628 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
633 deleting_from_bol =
false;
634 marked_delete_point =
false;
637 if (!marked_delete_point) {
638 copy_it = page_res_it;
639 marked_delete_point =
true;
644 deleting_from_bol =
false;
646 marked_delete_point =
false;
652 if (!crunch_early_merge_tess_fails)
694 int isolated_digits = 0;
695 int isolated_alphas = 0;
696 int bad_char_count = 0;
701 int alpha_repetition_count = 0;
702 int longest_alpha_repetition_count = 0;
703 int longest_lower_run_len = 0;
704 int lower_string_count = 0;
705 int longest_upper_run_len = 0;
706 int upper_string_count = 0;
707 int total_alpha_count = 0;
708 int total_digit_count = 0;
710 for (; *str !=
'\0'; str += *(lengths++)) {
715 case SUBSEQUENT_UPPER:
717 state = SUBSEQUENT_UPPER;
718 upper_string_count++;
719 if (longest_upper_run_len < upper_string_count)
720 longest_upper_run_len = upper_string_count;
722 alpha_repetition_count++;
723 if (longest_alpha_repetition_count < alpha_repetition_count) {
724 longest_alpha_repetition_count = alpha_repetition_count;
729 alpha_repetition_count = 1;
738 alpha_repetition_count = 1;
739 upper_string_count = 1;
746 case SUBSEQUENT_LOWER:
748 state = SUBSEQUENT_LOWER;
749 lower_string_count++;
750 if (longest_lower_run_len < lower_string_count)
751 longest_lower_run_len = lower_string_count;
753 alpha_repetition_count++;
754 if (longest_alpha_repetition_count < alpha_repetition_count) {
755 longest_alpha_repetition_count = alpha_repetition_count;
760 alpha_repetition_count = 1;
769 alpha_repetition_count = 1;
770 lower_string_count = 1;
778 state = SUBSEQUENT_NUM;
791 if (*lengths == 1 && *str ==
' ')
820 if (crunch_include_numerals) {
821 total_alpha_count += total_digit_count - isolated_digits;
824 if (crunch_leave_ok_strings && len >= 4 &&
825 2 * (total_alpha_count - isolated_alphas) > len &&
826 longest_alpha_repetition_count < crunch_long_repetitions) {
827 if ((crunch_accept_ok &&
828 acceptable_word_string(*word->
uch_set, str, lengths) !=
830 longest_lower_run_len > crunch_leave_lc_strings ||
831 longest_upper_run_len > crunch_leave_uc_strings)
835 strpbrk(str,
" ") ==
nullptr &&
840 acceptable_word_string(*word->
uch_set, str, lengths) !=
844 ok_chars = len - bad_char_count - isolated_digits -
845 isolated_alphas - tess_rejs;
847 if (crunch_debug > 3) {
848 tprintf(
"garbage_word: \"%s\"\n",
850 tprintf(
"LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
852 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
854 if (bad_char_count == 0 &&
856 (len > isolated_digits + isolated_alphas || len <= 2))
859 if (tess_rejs > ok_chars ||
860 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
864 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
866 if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
871 dodgy_chars = 2 * tess_rejs + bad_char_count;
872 if ((len == 4 && dodgy_chars > 2) ||
873 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
926 if ((failure_count (word) * 1.5) > word_len) {
938 if (rating_per_ch > crunch_del_rating) {
972 for (; *str !=
'\0'; str++) {
982 int16_t outline_count = 0;
983 int16_t small_outline_count = 0;
984 int16_t max_dimension;
985 float small_limit =
kBlnXHeight * crunch_small_outlines_size;
987 for (
int b = 0; b < word->
NumBlobs(); ++b) {
991 box = ol->bounding_box();
993 max_dimension = box.
height();
995 max_dimension = box.
width();
996 if (max_dimension < small_limit)
997 small_outline_count++;
1000 return small_outline_count >= outline_count;
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
int16_t word_outline_errs(WERD_RES *word)
uint32_t unsigned_size() const
WERD_RES * restart_page()
void tilde_delete(PAGE_RES_IT &page_res_it)
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
GenericVector< TBLOB * > blobs
POLY_BLOCK * poly_block() const
ROW_RES * prev_row() const
const STRING & unichar_string() const
int32_t whole_word_rej_count
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
BLOCK_RES * block() const
void AcceptIfGoodQuality(int index)
tesseract::BoxWord * bln_boxes
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
DocQualCallbacks(WERD_RES *word0)
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
int16_t word_blob_quality(WERD_RES *word, ROW *row)
bool get_isdigit(UNICHAR_ID unichar_id) const
void tilde_crunch(PAGE_RES_IT &page_res_it)
const UNICHARSET * uch_set
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
const int kBlnBaselineOffset
void convert_bad_unlv_chs(WERD_RES *word_res)
CRUNCH_MODE unlv_crunch_mode
int16_t failure_count(WERD_RES *word)
void unrej_good_chs(WERD_RES *word, ROW *row)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
const char * string() const
DLLSYM void tprintf(const char *format,...)
TBOX bounding_box() const
void set_unichar_id(UNICHAR_ID unichar_id, int index)
int16_t accepted_match_count
void CountMatchingBlobs(int index)
void reject_whole_page(PAGE_RES_IT &page_res_it)
PDBLK pdblk
Page Description Block.
void CountAcceptedBlobs(int index)
UNICHAR_ID unichar_id(int index) const
bool quality_recoverable_rejects()
WERD_CHOICE * best_choice
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
bool flag(WERD_FLAGS mask) const
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
const STRING & unichar_lengths() const
bool get_isupper(UNICHAR_ID unichar_id) const
int16_t count_outline_errs(char c, int16_t outline_count)
bool get_islower(UNICHAR_ID unichar_id) const
void rej_word_block_rej()
bool noise_outlines(TWERD *word)