|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: reject.cpp (Formerly reject.c) 00003 * Description: Rejection functions used in tessedit 00004 * Author: Phil Cheatle 00005 * Created: Wed Sep 23 16:50:21 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #pragma warning(disable:4305) // int/float warnings 00023 #endif 00024 00025 #include "tessvars.h" 00026 #ifdef __UNIX__ 00027 #include <assert.h> 00028 #include <errno.h> 00029 #endif 00030 #include "scanutils.h" 00031 #include <ctype.h> 00032 #include <string.h> 00033 #include "genericvector.h" 00034 #include "reject.h" 00035 #include "control.h" 00036 #include "docqual.h" 00037 #include "globaloc.h" // For err_exit. 00038 #include "globals.h" 00039 #include "helpers.h" 00040 00041 #include "tesseractclass.h" 00042 00043 // Include automatically generated configuration file if running autoconf. 
00044 #ifdef HAVE_CONFIG_H 00045 #include "config_auto.h" 00046 #endif 00047 00048 CLISTIZEH (STRING) CLISTIZE (STRING) 00049 00050 /************************************************************************* 00051 * set_done() 00052 * 00053 * Set the done flag based on the word acceptability criteria 00054 *************************************************************************/ 00055 00056 namespace tesseract { 00057 void Tesseract::set_done(WERD_RES *word, inT16 pass) { 00058 word->done = word->tess_accepted && 00059 (strchr(word->best_choice->unichar_string().string(), ' ') == NULL); 00060 bool word_is_ambig = word->best_choice->dangerous_ambig_found(); 00061 bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM || 00062 word->best_choice->permuter() == FREQ_DAWG_PERM || 00063 word->best_choice->permuter() == USER_DAWG_PERM; 00064 if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) && 00065 one_ell_conflict(word, FALSE)) { 00066 if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n"); 00067 word->done = FALSE; 00068 } 00069 if (word->done && ((!word_from_dict && 00070 word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) { 00071 if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n"); 00072 word->done = FALSE; 00073 } 00074 if (tessedit_rejection_debug) { 00075 tprintf("set_done(): done=%d\n", word->done); 00076 word->best_choice->print(""); 00077 } 00078 } 00079 00080 00081 /************************************************************************* 00082 * make_reject_map() 00083 * 00084 * Sets the done flag to indicate whether the resylt is acceptable. 00085 * 00086 * Sets a reject map for the word. 
00087 *************************************************************************/ 00088 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) { 00089 int i; 00090 int offset; 00091 00092 flip_0O(word); 00093 check_debug_pt(word, -1); // For trap only 00094 set_done(word, pass); // Set acceptance 00095 word->reject_map.initialise(word->best_choice->unichar_lengths().length()); 00096 reject_blanks(word); 00097 /* 00098 0: Rays original heuristic - the baseline 00099 */ 00100 if (tessedit_reject_mode == 0) { 00101 if (!word->done) 00102 reject_poor_matches(word); 00103 } else if (tessedit_reject_mode == 5) { 00104 /* 00105 5: Reject I/1/l from words where there is no strong contextual confirmation; 00106 the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); 00107 and the whole of any words which are very small 00108 */ 00109 if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) { 00110 word->reject_map.rej_word_small_xht(); 00111 } else { 00112 one_ell_conflict(word, TRUE); 00113 /* 00114 Originally the code here just used the done flag. Now I have duplicated 00115 and unpacked the conditions for setting the done flag so that each 00116 mechanism can be turned on or off independently. This works WITHOUT 00117 affecting the done flag setting. 
00118 */ 00119 if (rej_use_tess_accepted && !word->tess_accepted) 00120 word->reject_map.rej_word_not_tess_accepted (); 00121 00122 if (rej_use_tess_blanks && 00123 (strchr (word->best_choice->unichar_string().string (), ' ') != NULL)) 00124 word->reject_map.rej_word_contains_blanks (); 00125 00126 WERD_CHOICE* best_choice = word->best_choice; 00127 if (rej_use_good_perm) { 00128 if ((best_choice->permuter() == SYSTEM_DAWG_PERM || 00129 best_choice->permuter() == FREQ_DAWG_PERM || 00130 best_choice->permuter() == USER_DAWG_PERM) && 00131 (!rej_use_sensible_wd || 00132 acceptable_word_string(*word->uch_set, 00133 best_choice->unichar_string().string(), 00134 best_choice->unichar_lengths().string()) != 00135 AC_UNACCEPTABLE)) { 00136 // PASSED TEST 00137 } else if (best_choice->permuter() == NUMBER_PERM) { 00138 if (rej_alphas_in_number_perm) { 00139 for (i = 0, offset = 0; 00140 best_choice->unichar_string()[offset] != '\0'; 00141 offset += best_choice->unichar_lengths()[i++]) { 00142 if (word->reject_map[i].accepted() && 00143 word->uch_set->get_isalpha( 00144 best_choice->unichar_string().string() + offset, 00145 best_choice->unichar_lengths()[i])) 00146 word->reject_map[i].setrej_bad_permuter(); 00147 // rej alpha 00148 } 00149 } 00150 } else { 00151 word->reject_map.rej_word_bad_permuter(); 00152 } 00153 } 00154 /* Ambig word rejection was here once !!*/ 00155 } 00156 } else { 00157 tprintf("BAD tessedit_reject_mode\n"); 00158 err_exit(); 00159 } 00160 00161 if (tessedit_image_border > -1) 00162 reject_edge_blobs(word); 00163 00164 check_debug_pt (word, 10); 00165 if (tessedit_rejection_debug) { 00166 tprintf("Permuter Type = %d\n", word->best_choice->permuter ()); 00167 tprintf("Certainty: %f Rating: %f\n", 00168 word->best_choice->certainty (), word->best_choice->rating ()); 00169 tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); 00170 } 00171 00172 flip_hyphens(word); 00173 check_debug_pt(word, 20); 00174 } 00175 } // namespace tesseract 00176 
00177 00178 void reject_blanks(WERD_RES *word) { 00179 inT16 i; 00180 inT16 offset; 00181 00182 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; 00183 offset += word->best_choice->unichar_lengths()[i], i += 1) { 00184 if (word->best_choice->unichar_string()[offset] == ' ') 00185 //rej unrecognised blobs 00186 word->reject_map[i].setrej_tess_failure (); 00187 } 00188 } 00189 00190 namespace tesseract { 00191 void Tesseract::reject_I_1_L(WERD_RES *word) { 00192 inT16 i; 00193 inT16 offset; 00194 00195 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; 00196 offset += word->best_choice->unichar_lengths()[i], i += 1) { 00197 if (STRING (conflict_set_I_l_1). 00198 contains (word->best_choice->unichar_string()[offset])) { 00199 //rej 1Il conflict 00200 word->reject_map[i].setrej_1Il_conflict (); 00201 } 00202 } 00203 } 00204 } // namespace tesseract 00205 00206 00207 void reject_poor_matches(WERD_RES *word) { 00208 float threshold = compute_reject_threshold(word->best_choice); 00209 for (int i = 0; i < word->best_choice->length(); ++i) { 00210 if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) 00211 word->reject_map[i].setrej_tess_failure(); 00212 else if (word->best_choice->certainty(i) < threshold) 00213 word->reject_map[i].setrej_poor_match(); 00214 } 00215 } 00216 00217 00218 /********************************************************************** 00219 * compute_reject_threshold 00220 * 00221 * Set a rejection threshold for this word. 00222 * Initially this is a trivial function which looks for the largest 00223 * gap in the certainty value. 
00224 **********************************************************************/ 00225 00226 float compute_reject_threshold(WERD_CHOICE* word) { 00227 float threshold; // rejection threshold 00228 float bestgap = 0.0f; // biggest gap 00229 float gapstart; // bottom of gap 00230 // super iterator 00231 BLOB_CHOICE_IT choice_it; // real iterator 00232 00233 int blob_count = word->length(); 00234 GenericVector<float> ratings; 00235 ratings.init_to_size(blob_count, 0.0f); 00236 for (int i = 0; i < blob_count; ++i) { 00237 ratings[i] = word->certainty(i); 00238 } 00239 ratings.sort(); 00240 gapstart = ratings[0] - 1; // all reject if none better 00241 if (blob_count >= 3) { 00242 for (int index = 0; index < blob_count - 1; index++) { 00243 if (ratings[index + 1] - ratings[index] > bestgap) { 00244 bestgap = ratings[index + 1] - ratings[index]; 00245 // find biggest 00246 gapstart = ratings[index]; 00247 } 00248 } 00249 } 00250 threshold = gapstart + bestgap / 2; 00251 00252 return threshold; 00253 } 00254 00255 00256 /************************************************************************* 00257 * reject_edge_blobs() 00258 * 00259 * If the word is perilously close to the edge of the image, reject those blobs 00260 * in the word which are too close to the edge as they could be clipped. 00261 *************************************************************************/ 00262 namespace tesseract { 00263 void Tesseract::reject_edge_blobs(WERD_RES *word) { 00264 TBOX word_box = word->word->bounding_box(); 00265 // Use the box_word as it is already denormed back to image coordinates. 
00266 int blobcount = word->box_word->length(); 00267 00268 if (word_box.left() < tessedit_image_border || 00269 word_box.bottom() < tessedit_image_border || 00270 word_box.right() + tessedit_image_border > ImageWidth() - 1 || 00271 word_box.top() + tessedit_image_border > ImageHeight() - 1) { 00272 ASSERT_HOST(word->reject_map.length() == blobcount); 00273 for (int blobindex = 0; blobindex < blobcount; blobindex++) { 00274 TBOX blob_box = word->box_word->BlobBox(blobindex); 00275 if (blob_box.left() < tessedit_image_border || 00276 blob_box.bottom() < tessedit_image_border || 00277 blob_box.right() + tessedit_image_border > ImageWidth() - 1 || 00278 blob_box.top() + tessedit_image_border > ImageHeight() - 1) { 00279 word->reject_map[blobindex].setrej_edge_char(); 00280 // Close to edge 00281 } 00282 } 00283 } 00284 } 00285 00286 /********************************************************************** 00287 * one_ell_conflict() 00288 * 00289 * Identify words where there is a potential I/l/1 error. 00290 * - A bundle of contextual heuristics! 00291 **********************************************************************/ 00292 BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { 00293 const char *word; 00294 const char *lengths; 00295 inT16 word_len; //its length 00296 inT16 first_alphanum_index_; 00297 inT16 first_alphanum_offset_; 00298 inT16 i; 00299 inT16 offset; 00300 BOOL8 non_conflict_set_char; //non conf set a/n? 00301 BOOL8 conflict = FALSE; 00302 BOOL8 allow_1s; 00303 ACCEPTABLE_WERD_TYPE word_type; 00304 BOOL8 dict_perm_type; 00305 BOOL8 dict_word_ok; 00306 int dict_word_type; 00307 00308 word = word_res->best_choice->unichar_string().string (); 00309 lengths = word_res->best_choice->unichar_lengths().string(); 00310 word_len = strlen (lengths); 00311 /* 00312 If there are no occurrences of the conflict set characters then the word 00313 is OK. 
00314 */ 00315 if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL) 00316 return FALSE; 00317 00318 /* 00319 There is a conflict if there are NO other (confirmed) alphanumerics apart 00320 from those in the conflict set. 00321 */ 00322 00323 for (i = 0, offset = 0, non_conflict_set_char = FALSE; 00324 (i < word_len) && !non_conflict_set_char; offset += lengths[i++]) 00325 non_conflict_set_char = 00326 (word_res->uch_set->get_isalpha(word + offset, lengths[i]) || 00327 word_res->uch_set->get_isdigit(word + offset, lengths[i])) && 00328 !STRING (conflict_set_I_l_1).contains (word[offset]); 00329 if (!non_conflict_set_char) { 00330 if (update_map) 00331 reject_I_1_L(word_res); 00332 return TRUE; 00333 } 00334 00335 /* 00336 If the word is accepted by a dawg permuter, and the first alpha character 00337 is "I" or "l", check to see if the alternative is also a dawg word. If it 00338 is, then there is a potential error otherwise the word is ok. 00339 */ 00340 00341 dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) || 00342 (word_res->best_choice->permuter () == USER_DAWG_PERM) || 00343 (rej_trust_doc_dawg && 00344 (word_res->best_choice->permuter () == DOC_DAWG_PERM)) || 00345 (word_res->best_choice->permuter () == FREQ_DAWG_PERM); 00346 dict_word_type = dict_word(*(word_res->best_choice)); 00347 dict_word_ok = (dict_word_type > 0) && 00348 (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM)); 00349 00350 if ((rej_1Il_use_dict_word && dict_word_ok) || 00351 (rej_1Il_trust_permuter_type && dict_perm_type) || 00352 (dict_perm_type && dict_word_ok)) { 00353 first_alphanum_index_ = first_alphanum_index (word, lengths); 00354 first_alphanum_offset_ = first_alphanum_offset (word, lengths); 00355 if (lengths[first_alphanum_index_] == 1 && 00356 word[first_alphanum_offset_] == 'I') { 00357 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00358 if (safe_dict_word(word_res) > 0) { 00359 
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00360 if (update_map) 00361 word_res->reject_map[first_alphanum_index_]. 00362 setrej_1Il_conflict(); 00363 return TRUE; 00364 } 00365 else { 00366 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00367 return FALSE; 00368 } 00369 } 00370 00371 if (lengths[first_alphanum_index_] == 1 && 00372 word[first_alphanum_offset_] == 'l') { 00373 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00374 if (safe_dict_word(word_res) > 0) { 00375 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00376 if (update_map) 00377 word_res->reject_map[first_alphanum_index_]. 00378 setrej_1Il_conflict(); 00379 return TRUE; 00380 } 00381 else { 00382 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00383 return FALSE; 00384 } 00385 } 00386 return FALSE; 00387 } 00388 00389 /* 00390 NEW 1Il code. The old code relied on permuter types too much. In fact, 00391 tess will use TOP_CHOICE permute for good things like "palette". 00392 In this code the string is examined independently to see if it looks like 00393 a well formed word. 00394 */ 00395 00396 /* 00397 REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a 00398 dictionary word. 
00399 */ 00400 first_alphanum_index_ = first_alphanum_index (word, lengths); 00401 first_alphanum_offset_ = first_alphanum_offset (word, lengths); 00402 if (lengths[first_alphanum_index_] == 1 && 00403 word[first_alphanum_offset_] == 'l') { 00404 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00405 if (safe_dict_word(word_res) > 0) 00406 return FALSE; 00407 else 00408 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00409 } 00410 else if (lengths[first_alphanum_index_] == 1 && 00411 word[first_alphanum_offset_] == 'I') { 00412 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00413 if (safe_dict_word(word_res) > 0) 00414 return FALSE; 00415 else 00416 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00417 } 00418 /* 00419 For strings containing digits: 00420 If there are no alphas OR the numeric permuter liked the word, 00421 reject any non 1 conflict chs 00422 Else reject all conflict chs 00423 */ 00424 if (word_contains_non_1_digit (word, lengths)) { 00425 allow_1s = (alpha_count (word, lengths) == 0) || 00426 (word_res->best_choice->permuter () == NUMBER_PERM); 00427 00428 inT16 offset; 00429 conflict = FALSE; 00430 for (i = 0, offset = 0; word[offset] != '\0'; 00431 offset += word_res->best_choice->unichar_lengths()[i++]) { 00432 if ((!allow_1s || (word[offset] != '1')) && 00433 STRING (conflict_set_I_l_1).contains (word[offset])) { 00434 if (update_map) 00435 word_res->reject_map[i].setrej_1Il_conflict (); 00436 conflict = TRUE; 00437 } 00438 } 00439 return conflict; 00440 } 00441 /* 00442 For anything else. See if it conforms to an acceptable word type. If so, 00443 treat accordingly. 
00444 */ 00445 word_type = acceptable_word_string(*word_res->uch_set, word, lengths); 00446 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) { 00447 first_alphanum_index_ = first_alphanum_index (word, lengths); 00448 first_alphanum_offset_ = first_alphanum_offset (word, lengths); 00449 if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) { 00450 if (update_map) 00451 word_res->reject_map[first_alphanum_index_]. 00452 setrej_1Il_conflict (); 00453 return TRUE; 00454 } 00455 else 00456 return FALSE; 00457 } 00458 else if (word_type == AC_UPPER_CASE) { 00459 return FALSE; 00460 } 00461 else { 00462 if (update_map) 00463 reject_I_1_L(word_res); 00464 return TRUE; 00465 } 00466 } 00467 00468 00469 inT16 Tesseract::first_alphanum_index(const char *word, 00470 const char *word_lengths) { 00471 inT16 i; 00472 inT16 offset; 00473 00474 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00475 if (unicharset.get_isalpha(word + offset, word_lengths[i]) || 00476 unicharset.get_isdigit(word + offset, word_lengths[i])) 00477 return i; 00478 } 00479 return -1; 00480 } 00481 00482 inT16 Tesseract::first_alphanum_offset(const char *word, 00483 const char *word_lengths) { 00484 inT16 i; 00485 inT16 offset; 00486 00487 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00488 if (unicharset.get_isalpha(word + offset, word_lengths[i]) || 00489 unicharset.get_isdigit(word + offset, word_lengths[i])) 00490 return offset; 00491 } 00492 return -1; 00493 } 00494 00495 inT16 Tesseract::alpha_count(const char *word, 00496 const char *word_lengths) { 00497 inT16 i; 00498 inT16 offset; 00499 inT16 count = 0; 00500 00501 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00502 if (unicharset.get_isalpha (word + offset, word_lengths[i])) 00503 count++; 00504 } 00505 return count; 00506 } 00507 00508 00509 BOOL8 Tesseract::word_contains_non_1_digit(const char *word, 00510 const char 
*word_lengths) { 00511 inT16 i; 00512 inT16 offset; 00513 00514 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00515 if (unicharset.get_isdigit (word + offset, word_lengths[i]) && 00516 (word_lengths[i] != 1 || word[offset] != '1')) 00517 return TRUE; 00518 } 00519 return FALSE; 00520 } 00521 00522 /************************************************************************* 00523 * dont_allow_1Il() 00524 * Don't unreject LONE accepted 1Il conflict set chars 00525 *************************************************************************/ 00526 void Tesseract::dont_allow_1Il(WERD_RES *word) { 00527 int i = 0; 00528 int offset; 00529 int word_len = word->reject_map.length(); 00530 const char *s = word->best_choice->unichar_string().string(); 00531 const char *lengths = word->best_choice->unichar_lengths().string(); 00532 BOOL8 accepted_1Il = FALSE; 00533 00534 for (i = 0, offset = 0; i < word_len; 00535 offset += word->best_choice->unichar_lengths()[i++]) { 00536 if (word->reject_map[i].accepted()) { 00537 if (STRING(conflict_set_I_l_1).contains(s[offset])) { 00538 accepted_1Il = TRUE; 00539 } else { 00540 if (word->uch_set->get_isalpha(s + offset, lengths[i]) || 00541 word->uch_set->get_isdigit(s + offset, lengths[i])) 00542 return; // >=1 non 1Il ch accepted 00543 } 00544 } 00545 } 00546 if (!accepted_1Il) 00547 return; //Nothing to worry about 00548 00549 for (i = 0, offset = 0; i < word_len; 00550 offset += word->best_choice->unichar_lengths()[i++]) { 00551 if (STRING(conflict_set_I_l_1).contains(s[offset]) && 00552 word->reject_map[i].accepted()) 00553 word->reject_map[i].setrej_postNN_1Il(); 00554 } 00555 } 00556 00557 00558 inT16 Tesseract::count_alphanums(WERD_RES *word_res) { 00559 int count = 0; 00560 const WERD_CHOICE *best_choice = word_res->best_choice; 00561 for (int i = 0; i < word_res->reject_map.length(); ++i) { 00562 if ((word_res->reject_map[i].accepted()) && 00563 
(word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) || 00564 word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) { 00565 count++; 00566 } 00567 } 00568 return count; 00569 } 00570 00571 00572 // reject all if most rejected. 00573 void Tesseract::reject_mostly_rejects(WERD_RES *word) { 00574 /* Reject the whole of the word if the fraction of rejects exceeds a limit */ 00575 00576 if ((float) word->reject_map.reject_count() / word->reject_map.length() >= 00577 rej_whole_of_mostly_reject_word_fract) 00578 word->reject_map.rej_word_mostly_rej(); 00579 } 00580 00581 00582 BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) { 00583 inT16 char_quality; 00584 inT16 accepted_char_quality; 00585 00586 if (word->best_choice->unichar_lengths().length() <= 1) 00587 return FALSE; 00588 00589 if (!STRING(ok_repeated_ch_non_alphanum_wds). 00590 contains(word->best_choice->unichar_string()[0])) 00591 return FALSE; 00592 00593 UNICHAR_ID uch_id = word->best_choice->unichar_id(0); 00594 for (int i = 1; i < word->best_choice->length(); ++i) { 00595 if (word->best_choice->unichar_id(i) != uch_id) return FALSE; 00596 } 00597 00598 word_char_quality(word, row, &char_quality, &accepted_char_quality); 00599 00600 if ((word->best_choice->unichar_lengths().length () == char_quality) && 00601 (char_quality == accepted_char_quality)) 00602 return TRUE; 00603 else 00604 return FALSE; 00605 } 00606 00607 inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) { 00608 const WERD_CHOICE &word = *werd_res->best_choice; 00609 int dict_word_type = werd_res->tesseract->dict_word(word); 00610 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type; 00611 } 00612 00613 // Note: After running this function word_res->ratings 00614 // might not contain the right BLOB_CHOICE corresponding to each character 00615 // in word_res->best_choice. 
// Re-examines '.' and '-' characters using the shape of their blobs.
// A blob is only considered when it is wider than 8 * denorm.x_scale() and
// does not overlap its neighbours horizontally. Its aspect ratio
// (width / height) is compared with tessedit_upper_flip_hyphen and
// tessedit_lower_flip_hyphen to either change/accept the character as a
// hyphen or to mark it in the reject map as a suspected hyphen.
// Does nothing when tessedit_lower_flip_hyphen <= 1.
void Tesseract::flip_hyphens(WERD_RES *word_res) {
  WERD_CHOICE *best_choice = word_res->best_choice;
  int i;
  int prev_right = -9999;  // right edge of the previous blob (none yet)
  int next_left;           // left edge of the next blob
  TBOX out_box;
  float aspect_ratio;

  if (tessedit_lower_flip_hyphen <= 1)
    return;

  int num_blobs = word_res->rebuild_word->NumBlobs();
  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
  // Walk characters and blobs in step; stop at whichever runs out first.
  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
    TBLOB* blob = word_res->rebuild_word->blobs[i];
    out_box = blob->bounding_box();
    if (i + 1 == num_blobs)
      next_left = 9999;  // last blob: nothing to the right
    else
      next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
    // Don't touch small or touching blobs - it is too dangerous.
    if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
        (out_box.left() > prev_right) && (out_box.right() < next_left)) {
      aspect_ratio = out_box.width() / (float) out_box.height();
      if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
        // A '.' that is wide enough becomes '-', provided '-' exists in the
        // unicharset and is enabled.
        if (aspect_ratio >= tessedit_upper_flip_hyphen &&
            word_res->uch_set->contains_unichar_id(unichar_dash) &&
            word_res->uch_set->get_enabled(unichar_dash)) {
          /* Certain HYPHEN */
          best_choice->set_unichar_id(unichar_dash, i);
          if (word_res->reject_map[i].rejected())
            word_res->reject_map[i].setrej_hyphen_accept();
        }
        // This test is independent of the one above and is evaluated even
        // if the character has just been changed to '-'.
        if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
            word_res->reject_map[i].accepted())
          // Suspected HYPHEN
          word_res->reject_map[i].setrej_hyphen ();
      }
      else if (best_choice->unichar_id(i) == unichar_dash) {
        if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
            (word_res->reject_map[i].rejected()))
          word_res->reject_map[i].setrej_hyphen_accept();
        // Certain HYPHEN

        // A '-' that is not wide enough is flagged as doubtful.
        if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
            (word_res->reject_map[i].accepted()))
          // Suspected HYPHEN
          word_res->reject_map[i].setrej_hyphen();
      }
    }
    prev_right = out_box.right();
  }
}

// Note: After running this function word_res->ratings
// might not contain the right BLOB_CHOICE corresponding to each character
// in word_res->best_choice.
//
// Rewrites '0' (zero) and 'O' (capital o) in the best choice according to
// their neighbours: between upper case letters they become 'O', next to
// digits they become '0'. Each pattern is named in the comment above its
// test (A = upper case letter other than O, 9 = digit other than 0).
// The tests run in sequence on the same position and some of them advance
// i, so their order matters.
// Does nothing when tessedit_flip_0O is off, when any upper case or digit
// blob fails the box test in the first loop, or when '0' or 'O' is missing
// from, or disabled in, the unicharset.
void Tesseract::flip_0O(WERD_RES *word_res) {
  WERD_CHOICE *best_choice = word_res->best_choice;
  int i;
  TBOX out_box;

  if (!tessedit_flip_0O)
    return;

  int num_blobs = word_res->rebuild_word->NumBlobs();
  // Give up on the whole word if any upper case letter or digit has a box
  // whose top is below baseline offset + x-height, or whose bottom is above
  // baseline offset + a quarter of the x-height.
  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
    TBLOB* blob = word_res->rebuild_word->blobs[i];
    if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
        word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
      out_box = blob->bounding_box();
      if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
          (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
        return;                  // Beware words with sub/superscripts
    }
  }
  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
  if (unichar_0 == INVALID_UNICHAR_ID ||
      !word_res->uch_set->get_enabled(unichar_0) ||
      unichar_O == INVALID_UNICHAR_ID ||
      !word_res->uch_set->get_enabled(unichar_O)) {
    return;  // 0 or O are not present/enabled in unicharset
  }
  // Starts at 1: every pattern looks at the character before position i.
  for (i = 1; i < best_choice->length(); ++i) {
    if (best_choice->unichar_id(i) == unichar_0 ||
        best_choice->unichar_id(i) == unichar_O) {
      /* A0A */
      if ((i+1) < best_choice->length() &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_O, i);
      }
      /* A00A */
      if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          (i+2) < best_choice->length() &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
        best_choice->set_unichar_id(unichar_O, i);
        i++;  // the remaining tests now look at the second 0/O of the pair
      }
      /* AA0<non digit or end of word> */
      if ((i > 1) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (((i+1) < best_choice->length() &&
            !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
           (i == best_choice->length() - 1))) {
        best_choice->set_unichar_id(unichar_O, i);
      }
      /* 9O9 */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_0, i);
      }
      /* 9OOO */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+2) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          (best_choice->unichar_id(i+2) == unichar_0 ||
           best_choice->unichar_id(i+2) == unichar_O)) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i+1);
        best_choice->set_unichar_id(unichar_0, i+2);
        i += 2;  // all three positions have been rewritten
      }
      /* 9OO<non upper> */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+2) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i+1);
        i++;  // both positions have been rewritten
      }
      /* 9O<non upper> */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_0, i);
      }
      /* 9[.,]OOO.. */
      if ((i > 1) &&
          (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
           word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
           best_choice->unichar_id(i-2) == unichar_O)) {
        // An 'O' just before the separator is treated as a zero too.
        if (best_choice->unichar_id(i-2) == unichar_O) {
          best_choice->set_unichar_id(unichar_0, i-2);
        }
        // Turn the whole run of 0/O after the separator into zeros.
        while (i < best_choice->length() &&
               (best_choice->unichar_id(i) == unichar_O ||
                best_choice->unichar_id(i) == unichar_0)) {
          best_choice->set_unichar_id(unichar_0, i);
          i++;
        }
        i--;  // step back onto the last rewritten character; the for loop
              // increments i again
      }
    }
  }
}

// Returns true when unichar_id is an upper case character of ch_set other
// than "O".
BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
}

// Returns true when unichar_id is a digit of ch_set other than "0".
BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
}
}  // namespace tesseract