|
tesseract 3.04.01
|
#include "tessvars.h"#include "scanutils.h"#include <ctype.h>#include <string.h>#include "genericvector.h"#include "reject.h"#include "control.h"#include "docqual.h"#include "globaloc.h"#include "globals.h"#include "helpers.h"#include "tesseractclass.h"Go to the source code of this file.
Namespaces | |
| namespace | tesseract |
Functions | |
| CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract | |
| void | reject_blanks (WERD_RES *word) |
| void | reject_poor_matches (WERD_RES *word) |
| float | compute_reject_threshold (WERD_CHOICE *word) |
| CLISTIZEH | ( | STRING | ) |
Definition at line 48 of file reject.cpp.
{
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
word->done = word->tess_accepted &&
(strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
bool word_is_ambig = word->best_choice->dangerous_ambig_found();
bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
word->best_choice->permuter() == USER_DAWG_PERM;
if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
one_ell_conflict(word, FALSE)) {
if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
word->done = FALSE;
}
if (word->done && ((!word_from_dict &&
word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
word->done = FALSE;
}
if (tessedit_rejection_debug) {
tprintf("set_done(): done=%d\n", word->done);
word->best_choice->print("");
}
}
/*************************************************************************
* make_reject_map()
*
* Sets the done flag to indicate whether the resylt is acceptable.
*
* Sets a reject map for the word.
*************************************************************************/
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
int i;
int offset;
flip_0O(word);
check_debug_pt(word, -1); // For trap only
set_done(word, pass); // Set acceptance
word->reject_map.initialise(word->best_choice->unichar_lengths().length());
reject_blanks(word);
/*
0: Rays original heuristic - the baseline
*/
if (tessedit_reject_mode == 0) {
if (!word->done)
reject_poor_matches(word);
} else if (tessedit_reject_mode == 5) {
/*
5: Reject I/1/l from words where there is no strong contextual confirmation;
the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
and the whole of any words which are very small
*/
if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
word->reject_map.rej_word_small_xht();
} else {
one_ell_conflict(word, TRUE);
/*
Originally the code here just used the done flag. Now I have duplicated
and unpacked the conditions for setting the done flag so that each
mechanism can be turned on or off independently. This works WITHOUT
affecting the done flag setting.
*/
if (rej_use_tess_accepted && !word->tess_accepted)
word->reject_map.rej_word_not_tess_accepted ();
if (rej_use_tess_blanks &&
(strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
word->reject_map.rej_word_contains_blanks ();
WERD_CHOICE* best_choice = word->best_choice;
if (rej_use_good_perm) {
if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
best_choice->permuter() == FREQ_DAWG_PERM ||
best_choice->permuter() == USER_DAWG_PERM) &&
(!rej_use_sensible_wd ||
acceptable_word_string(*word->uch_set,
best_choice->unichar_string().string(),
best_choice->unichar_lengths().string()) !=
AC_UNACCEPTABLE)) {
// PASSED TEST
} else if (best_choice->permuter() == NUMBER_PERM) {
if (rej_alphas_in_number_perm) {
for (i = 0, offset = 0;
best_choice->unichar_string()[offset] != '\0';
offset += best_choice->unichar_lengths()[i++]) {
if (word->reject_map[i].accepted() &&
word->uch_set->get_isalpha(
best_choice->unichar_string().string() + offset,
best_choice->unichar_lengths()[i]))
word->reject_map[i].setrej_bad_permuter();
// rej alpha
}
}
} else {
word->reject_map.rej_word_bad_permuter();
}
}
/* Ambig word rejection was here once !!*/
}
} else {
tprintf("BAD tessedit_reject_mode\n");
err_exit();
}
if (tessedit_image_border > -1)
reject_edge_blobs(word);
check_debug_pt (word, 10);
if (tessedit_rejection_debug) {
tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
tprintf("Certainty: %f Rating: %f\n",
word->best_choice->certainty (), word->best_choice->rating ());
tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
}
flip_hyphens(word);
check_debug_pt(word, 20);
}
} // namespace tesseract
| float compute_reject_threshold | ( | WERD_CHOICE * | word | ) |
Definition at line 226 of file reject.cpp.
{
float threshold; // rejection threshold
float bestgap = 0.0f; // biggest gap
float gapstart; // bottom of gap
// super iterator
BLOB_CHOICE_IT choice_it; // real iterator
int blob_count = word->length();
GenericVector<float> ratings;
ratings.init_to_size(blob_count, 0.0f);
for (int i = 0; i < blob_count; ++i) {
ratings[i] = word->certainty(i);
}
ratings.sort();
gapstart = ratings[0] - 1; // all reject if none better
if (blob_count >= 3) {
for (int index = 0; index < blob_count - 1; index++) {
if (ratings[index + 1] - ratings[index] > bestgap) {
bestgap = ratings[index + 1] - ratings[index];
// find biggest
gapstart = ratings[index];
}
}
}
threshold = gapstart + bestgap / 2;
return threshold;
}
| void reject_blanks | ( | WERD_RES * | word | ) |
Definition at line 178 of file reject.cpp.
{
inT16 i;
inT16 offset;
for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
offset += word->best_choice->unichar_lengths()[i], i += 1) {
if (word->best_choice->unichar_string()[offset] == ' ')
//rej unrecognised blobs
word->reject_map[i].setrej_tess_failure ();
}
}
| void reject_poor_matches | ( | WERD_RES * | word | ) |
Definition at line 207 of file reject.cpp.
{
float threshold = compute_reject_threshold(word->best_choice);
for (int i = 0; i < word->best_choice->length(); ++i) {
if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
word->reject_map[i].setrej_tess_failure();
else if (word->best_choice->certainty(i) < threshold)
word->reject_map[i].setrej_poor_match();
}
}