tesseract 3.04.01

ccmain/reject.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        reject.cpp  (Formerly reject.c)
00003  * Description: Rejection functions used in tessedit
00004  * Author:              Phil Cheatle
00005  * Created:             Wed Sep 23 16:50:21 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #pragma warning(disable:4305)  // int/float warnings
00023 #endif
00024 
00025 #include          "tessvars.h"
00026 #ifdef __UNIX__
00027 #include          <assert.h>
00028 #include          <errno.h>
00029 #endif
00030 #include          "scanutils.h"
00031 #include          <ctype.h>
00032 #include          <string.h>
00033 #include          "genericvector.h"
00034 #include          "reject.h"
00035 #include          "control.h"
00036 #include          "docqual.h"
00037 #include          "globaloc.h"  // For err_exit.
00038 #include          "globals.h"
00039 #include          "helpers.h"
00040 
00041 #include "tesseractclass.h"
00042 
00043 // Include automatically generated configuration file if running autoconf.
00044 #ifdef HAVE_CONFIG_H
00045 #include "config_auto.h"
00046 #endif
00047 
00048 CLISTIZEH (STRING) CLISTIZE (STRING)
00049 
00050 /*************************************************************************
00051  * set_done()
00052  *
00053  * Set the done flag based on the word acceptability criteria
00054  *************************************************************************/
00055 
00056 namespace tesseract {
00057 void Tesseract::set_done(WERD_RES *word, inT16 pass) {
00058   word->done = word->tess_accepted &&
00059       (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
00060   bool word_is_ambig = word->best_choice->dangerous_ambig_found();
00061   bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
00062       word->best_choice->permuter() == FREQ_DAWG_PERM ||
00063       word->best_choice->permuter() == USER_DAWG_PERM;
00064   if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
00065       one_ell_conflict(word, FALSE)) {
00066     if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
00067     word->done = FALSE;
00068   }
00069   if (word->done && ((!word_from_dict &&
00070       word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
00071     if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
00072       word->done = FALSE;
00073   }
00074   if (tessedit_rejection_debug) {
00075     tprintf("set_done(): done=%d\n", word->done);
00076     word->best_choice->print("");
00077   }
00078 }
00079 
00080 
00081 /*************************************************************************
00082  * make_reject_map()
00083  *
00084  * Sets the done flag to indicate whether the result is acceptable.
00085  *
00086  * Sets a reject map for the word.
00087  *************************************************************************/
00088 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
00089   int i;
00090   int offset;
00091 
00092   flip_0O(word);
00093   check_debug_pt(word, -1);     // For trap only
00094   set_done(word, pass);  // Set acceptance
00095   word->reject_map.initialise(word->best_choice->unichar_lengths().length());
00096   reject_blanks(word);
00097   /*
00098   0: Rays original heuristic - the baseline
00099   */
00100   if (tessedit_reject_mode == 0) {
00101     if (!word->done)
00102       reject_poor_matches(word);
00103   } else if (tessedit_reject_mode == 5) {
00104     /*
00105     5: Reject I/1/l from words where there is no strong contextual confirmation;
00106       the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
00107       and the whole of any words which are very small
00108     */
00109     if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
00110       word->reject_map.rej_word_small_xht();
00111     } else {
00112       one_ell_conflict(word, TRUE);
00113       /*
00114         Originally the code here just used the done flag. Now I have duplicated
00115         and unpacked the conditions for setting the done flag so that each
00116         mechanism can be turned on or off independently. This works WITHOUT
00117         affecting the done flag setting.
00118       */
00119       if (rej_use_tess_accepted && !word->tess_accepted)
00120         word->reject_map.rej_word_not_tess_accepted ();
00121 
00122       if (rej_use_tess_blanks &&
00123         (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
00124         word->reject_map.rej_word_contains_blanks ();
00125 
00126       WERD_CHOICE* best_choice = word->best_choice;
00127       if (rej_use_good_perm) {
00128         if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
00129              best_choice->permuter() == FREQ_DAWG_PERM ||
00130              best_choice->permuter() == USER_DAWG_PERM) &&
00131             (!rej_use_sensible_wd ||
00132              acceptable_word_string(*word->uch_set,
00133                                     best_choice->unichar_string().string(),
00134                                     best_choice->unichar_lengths().string()) !=
00135                                         AC_UNACCEPTABLE)) {
00136           // PASSED TEST
00137         } else if (best_choice->permuter() == NUMBER_PERM) {
00138           if (rej_alphas_in_number_perm) {
00139             for (i = 0, offset = 0;
00140                  best_choice->unichar_string()[offset] != '\0';
00141                  offset += best_choice->unichar_lengths()[i++]) {
00142               if (word->reject_map[i].accepted() &&
00143                   word->uch_set->get_isalpha(
00144                       best_choice->unichar_string().string() + offset,
00145                       best_choice->unichar_lengths()[i]))
00146                 word->reject_map[i].setrej_bad_permuter();
00147               // rej alpha
00148             }
00149           }
00150         } else {
00151           word->reject_map.rej_word_bad_permuter();
00152         }
00153       }
00154       /* Ambig word rejection was here once !!*/
00155     }
00156   } else {
00157     tprintf("BAD tessedit_reject_mode\n");
00158     err_exit();
00159   }
00160 
00161   if (tessedit_image_border > -1)
00162     reject_edge_blobs(word);
00163 
00164   check_debug_pt (word, 10);
00165   if (tessedit_rejection_debug) {
00166     tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
00167     tprintf("Certainty: %f     Rating: %f\n",
00168       word->best_choice->certainty (), word->best_choice->rating ());
00169     tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
00170   }
00171 
00172   flip_hyphens(word);
00173   check_debug_pt(word, 20);
00174 }
00175 }  // namespace tesseract
00176 
00177 
00178 void reject_blanks(WERD_RES *word) {
00179   inT16 i;
00180   inT16 offset;
00181 
00182   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
00183        offset += word->best_choice->unichar_lengths()[i], i += 1) {
00184     if (word->best_choice->unichar_string()[offset] == ' ')
00185                                  //rej unrecognised blobs
00186       word->reject_map[i].setrej_tess_failure ();
00187   }
00188 }
00189 
00190 namespace tesseract {
00191 void Tesseract::reject_I_1_L(WERD_RES *word) {
00192   inT16 i;
00193   inT16 offset;
00194 
00195   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
00196        offset += word->best_choice->unichar_lengths()[i], i += 1) {
00197     if (STRING (conflict_set_I_l_1).
00198     contains (word->best_choice->unichar_string()[offset])) {
00199                                  //rej 1Il conflict
00200       word->reject_map[i].setrej_1Il_conflict ();
00201     }
00202   }
00203 }
00204 }  // namespace tesseract
00205 
00206 
00207 void reject_poor_matches(WERD_RES *word) {
00208   float threshold = compute_reject_threshold(word->best_choice);
00209   for (int i = 0; i < word->best_choice->length(); ++i) {
00210     if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
00211       word->reject_map[i].setrej_tess_failure();
00212     else if (word->best_choice->certainty(i) < threshold)
00213       word->reject_map[i].setrej_poor_match();
00214   }
00215 }
00216 
00217 
00218 /**********************************************************************
00219  * compute_reject_threshold
00220  *
00221  * Set a rejection threshold for this word.
00222  * Initially this is a trivial function which looks for the largest
00223  * gap in the certainty value.
00224  **********************************************************************/
00225 
00226 float compute_reject_threshold(WERD_CHOICE* word) {
00227   float threshold;               // rejection threshold
00228   float bestgap = 0.0f;          // biggest gap
00229   float gapstart;                // bottom of gap
00230                                  // super iterator
00231   BLOB_CHOICE_IT choice_it;      // real iterator
00232 
00233   int blob_count = word->length();
00234   GenericVector<float> ratings;
00235   ratings.init_to_size(blob_count, 0.0f);
00236   for (int i = 0; i < blob_count; ++i) {
00237     ratings[i] = word->certainty(i);
00238   }
00239   ratings.sort();
00240   gapstart = ratings[0] - 1;     // all reject if none better
00241   if (blob_count >= 3) {
00242     for (int index = 0; index < blob_count - 1; index++) {
00243       if (ratings[index + 1] - ratings[index] > bestgap) {
00244         bestgap = ratings[index + 1] - ratings[index];
00245         // find biggest
00246         gapstart = ratings[index];
00247       }
00248     }
00249   }
00250   threshold = gapstart + bestgap / 2;
00251 
00252   return threshold;
00253 }
00254 
00255 
00256 /*************************************************************************
00257  * reject_edge_blobs()
00258  *
00259  * If the word is perilously close to the edge of the image, reject those blobs
00260  * in the word which are too close to the edge as they could be clipped.
00261  *************************************************************************/
00262 namespace tesseract {
00263 void Tesseract::reject_edge_blobs(WERD_RES *word) {
00264   TBOX word_box = word->word->bounding_box();
00265   // Use the box_word as it is already denormed back to image coordinates.
00266   int blobcount = word->box_word->length();
00267 
00268   if (word_box.left() < tessedit_image_border ||
00269       word_box.bottom() < tessedit_image_border ||
00270       word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
00271       word_box.top() + tessedit_image_border > ImageHeight() - 1) {
00272     ASSERT_HOST(word->reject_map.length() == blobcount);
00273     for (int blobindex = 0; blobindex < blobcount; blobindex++) {
00274       TBOX blob_box = word->box_word->BlobBox(blobindex);
00275       if (blob_box.left() < tessedit_image_border ||
00276           blob_box.bottom() < tessedit_image_border ||
00277           blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
00278           blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
00279         word->reject_map[blobindex].setrej_edge_char();
00280         // Close to edge
00281       }
00282     }
00283   }
00284 }
00285 
00286 /**********************************************************************
00287  * one_ell_conflict()
00288  *
00289  * Identify words where there is a potential I/l/1 error.
00290  * - A bundle of contextual heuristics!
00291  **********************************************************************/
// Decides whether word_res->best_choice contains a potential 1/I/l confusion.
// Returns TRUE when a conflict is found and FALSE otherwise. When update_map
// is TRUE the offending characters are also flagged in word_res->reject_map,
// either all of them via reject_I_1_L() or individually via
// setrej_1Il_conflict(). The checks run in this order: no conflict-set
// characters at all; no other alphanumerics to confirm the word; dictionary
// evidence; digits in the word; and finally the word's acceptability class.
00292 BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
00293   const char *word;
00294   const char *lengths;
00295   inT16 word_len;                //its length
00296   inT16 first_alphanum_index_;
00297   inT16 first_alphanum_offset_;
00298   inT16 i;
00299   inT16 offset;
00300   BOOL8 non_conflict_set_char;   //non conf set a/n?
00301   BOOL8 conflict = FALSE;
00302   BOOL8 allow_1s;
00303   ACCEPTABLE_WERD_TYPE word_type;
00304   BOOL8 dict_perm_type;
00305   BOOL8 dict_word_ok;
00306   int dict_word_type;
00307 
00308   word = word_res->best_choice->unichar_string().string ();
00309   lengths = word_res->best_choice->unichar_lengths().string();
        // presumably lengths holds one byte-length entry per character, so its
        // strlen is the character count - TODO confirm against WERD_CHOICE.
00310   word_len = strlen (lengths);
00311   /*
00312     If there are no occurrences of the conflict set characters then the word
00313     is OK.
00314   */
00315   if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
00316     return FALSE;
00317 
00318   /*
00319     There is a conflict if there are NO other (confirmed) alphanumerics apart
00320     from those in the conflict set.
00321   */
00322 
        // The scan stops at the first alpha/digit character whose leading byte
        // is not in the conflict set.
00323   for (i = 0, offset = 0, non_conflict_set_char = FALSE;
00324        (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
00325     non_conflict_set_char =
00326         (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
00327             word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
00328         !STRING (conflict_set_I_l_1).contains (word[offset]);
00329   if (!non_conflict_set_char) {
00330     if (update_map)
00331       reject_I_1_L(word_res);
00332     return TRUE;
00333   }
00334 
00335   /*
00336     If the word is accepted by a dawg permuter, and the first alpha character
00337     is "I" or "l", check to see if the alternative is also a dawg word. If it
00338     is, then there is a potential error otherwise the word is ok.
00339   */
00340 
00341   dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00342     (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
00343     (rej_trust_doc_dawg &&
00344     (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
00345     (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
00346   dict_word_type = dict_word(*(word_res->best_choice));
00347   dict_word_ok = (dict_word_type > 0) &&
00348     (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
00349 
00350   if ((rej_1Il_use_dict_word && dict_word_ok) ||
00351     (rej_1Il_trust_permuter_type && dict_perm_type) ||
00352   (dict_perm_type && dict_word_ok)) {
00353     first_alphanum_index_ = first_alphanum_index (word, lengths);
00354     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
          // Leading 'I': write 'l' in its place, test the result with
          // safe_dict_word(), and write 'I' back on both outcomes.
00355     if (lengths[first_alphanum_index_] == 1 &&
00356         word[first_alphanum_offset_] == 'I') {
00357       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00358       if (safe_dict_word(word_res) > 0) {
00359         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00360         if (update_map)
00361           word_res->reject_map[first_alphanum_index_].
00362             setrej_1Il_conflict();
00363         return TRUE;
00364       }
00365       else {
00366         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00367         return FALSE;
00368       }
00369     }
00370 
          // Leading 'l': the mirror image of the case above.
00371     if (lengths[first_alphanum_index_] == 1 &&
00372         word[first_alphanum_offset_] == 'l') {
00373       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00374       if (safe_dict_word(word_res) > 0) {
00375         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00376         if (update_map)
00377           word_res->reject_map[first_alphanum_index_].
00378             setrej_1Il_conflict();
00379         return TRUE;
00380       }
00381       else {
00382         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00383         return FALSE;
00384       }
00385     }
00386     return FALSE;
00387   }
00388 
00389   /*
00390     NEW 1Il code. The old code relied on permuter types too much. In fact,
00391     tess will use TOP_CHOICE permute for good things like "palette".
00392     In this code the string is examined independently to see if it looks like
00393     a well formed word.
00394   */
00395 
00396   /*
00397     REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
00398     dictionary word.
00399   */
        // NOTE(review): when the flipped spelling passes safe_dict_word(), the
        // function returns FALSE without writing the original character back,
        // unlike the dictionary branch above - confirm this is intended.
00400   first_alphanum_index_ = first_alphanum_index (word, lengths);
00401   first_alphanum_offset_ = first_alphanum_offset (word, lengths);
00402   if (lengths[first_alphanum_index_] == 1 &&
00403       word[first_alphanum_offset_] == 'l') {
00404     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00405     if (safe_dict_word(word_res) > 0)
00406       return FALSE;
00407     else
00408       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00409   }
00410   else if (lengths[first_alphanum_index_] == 1 &&
00411            word[first_alphanum_offset_] == 'I') {
00412     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00413     if (safe_dict_word(word_res) > 0)
00414       return FALSE;
00415     else
00416       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00417   }
00418   /*
00419     For strings containing digits:
00420       If there are no alphas OR the numeric permuter liked the word,
00421         reject any non 1 conflict chs
00422       Else reject all conflict chs
00423   */
00424   if (word_contains_non_1_digit (word, lengths)) {
00425     allow_1s = (alpha_count (word, lengths) == 0) ||
00426       (word_res->best_choice->permuter () == NUMBER_PERM);
00427 
          // NOTE(review): this declaration shadows the function-level offset.
00428     inT16 offset;
00429     conflict = FALSE;
00430     for (i = 0, offset = 0; word[offset] != '\0';
00431          offset += word_res->best_choice->unichar_lengths()[i++]) {
00432       if ((!allow_1s || (word[offset] != '1')) &&
00433       STRING (conflict_set_I_l_1).contains (word[offset])) {
00434         if (update_map)
00435           word_res->reject_map[i].setrej_1Il_conflict ();
00436         conflict = TRUE;
00437       }
00438     }
00439     return conflict;
00440   }
00441   /*
00442     For anything else. See if it conforms to an acceptable word type. If so,
00443     treat accordingly.
00444   */
00445   word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
        // Lower-case and initial-cap words: only a conflict-set character in
        // the first alphanumeric position counts as a conflict.
00446   if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
00447     first_alphanum_index_ = first_alphanum_index (word, lengths);
00448     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
00449     if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
00450       if (update_map)
00451         word_res->reject_map[first_alphanum_index_].
00452             setrej_1Il_conflict ();
00453       return TRUE;
00454     }
00455     else
00456       return FALSE;
00457   }
00458   else if (word_type == AC_UPPER_CASE) {
00459     return FALSE;
00460   }
        // Any other word type: every conflict-set character is treated as a
        // conflict.
00461   else {
00462     if (update_map)
00463       reject_I_1_L(word_res);
00464     return TRUE;
00465   }
00466 }
00467 
00468 
00469 inT16 Tesseract::first_alphanum_index(const char *word,
00470                                       const char *word_lengths) {
00471   inT16 i;
00472   inT16 offset;
00473 
00474   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00475     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
00476         unicharset.get_isdigit(word + offset, word_lengths[i]))
00477       return i;
00478   }
00479   return -1;
00480 }
00481 
00482 inT16 Tesseract::first_alphanum_offset(const char *word,
00483                                        const char *word_lengths) {
00484   inT16 i;
00485   inT16 offset;
00486 
00487   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00488     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
00489         unicharset.get_isdigit(word + offset, word_lengths[i]))
00490       return offset;
00491   }
00492   return -1;
00493 }
00494 
00495 inT16 Tesseract::alpha_count(const char *word,
00496                              const char *word_lengths) {
00497   inT16 i;
00498   inT16 offset;
00499   inT16 count = 0;
00500 
00501   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00502     if (unicharset.get_isalpha (word + offset, word_lengths[i]))
00503       count++;
00504   }
00505   return count;
00506 }
00507 
00508 
00509 BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
00510                                            const char *word_lengths) {
00511   inT16 i;
00512   inT16 offset;
00513 
00514   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00515     if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
00516         (word_lengths[i] != 1 || word[offset] != '1'))
00517       return TRUE;
00518   }
00519   return FALSE;
00520 }
00521 
00522 /*************************************************************************
00523  * dont_allow_1Il()
00524  * Don't unreject LONE accepted 1Il conflict set chars
00525  *************************************************************************/
00526 void Tesseract::dont_allow_1Il(WERD_RES *word) {
00527   int i = 0;
00528   int offset;
00529   int word_len = word->reject_map.length();
00530   const char *s = word->best_choice->unichar_string().string();
00531   const char *lengths = word->best_choice->unichar_lengths().string();
00532   BOOL8 accepted_1Il = FALSE;
00533 
00534   for (i = 0, offset = 0; i < word_len;
00535        offset += word->best_choice->unichar_lengths()[i++]) {
00536     if (word->reject_map[i].accepted()) {
00537       if (STRING(conflict_set_I_l_1).contains(s[offset])) {
00538         accepted_1Il = TRUE;
00539       } else {
00540         if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
00541             word->uch_set->get_isdigit(s + offset, lengths[i]))
00542           return;                // >=1 non 1Il ch accepted
00543       }
00544     }
00545   }
00546   if (!accepted_1Il)
00547     return;                      //Nothing to worry about
00548 
00549   for (i = 0, offset = 0; i < word_len;
00550        offset += word->best_choice->unichar_lengths()[i++]) {
00551     if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
00552       word->reject_map[i].accepted())
00553       word->reject_map[i].setrej_postNN_1Il();
00554   }
00555 }
00556 
00557 
00558 inT16 Tesseract::count_alphanums(WERD_RES *word_res) {
00559   int count = 0;
00560   const WERD_CHOICE *best_choice = word_res->best_choice;
00561   for (int i = 0; i < word_res->reject_map.length(); ++i) {
00562     if ((word_res->reject_map[i].accepted()) &&
00563         (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
00564             word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
00565       count++;
00566     }
00567   }
00568   return count;
00569 }
00570 
00571 
00572 // reject all if most rejected.
00573 void Tesseract::reject_mostly_rejects(WERD_RES *word) {
00574   /* Reject the whole of the word if the fraction of rejects exceeds a limit */
00575 
00576   if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
00577     rej_whole_of_mostly_reject_word_fract)
00578     word->reject_map.rej_word_mostly_rej();
00579 }
00580 
00581 
00582 BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
00583   inT16 char_quality;
00584   inT16 accepted_char_quality;
00585 
00586   if (word->best_choice->unichar_lengths().length() <= 1)
00587     return FALSE;
00588 
00589   if (!STRING(ok_repeated_ch_non_alphanum_wds).
00590     contains(word->best_choice->unichar_string()[0]))
00591     return FALSE;
00592 
00593   UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
00594   for (int i = 1; i < word->best_choice->length(); ++i) {
00595     if (word->best_choice->unichar_id(i) != uch_id) return FALSE;
00596   }
00597 
00598   word_char_quality(word, row, &char_quality, &accepted_char_quality);
00599 
00600   if ((word->best_choice->unichar_lengths().length () == char_quality) &&
00601     (char_quality == accepted_char_quality))
00602     return TRUE;
00603   else
00604     return FALSE;
00605 }
00606 
00607 inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) {
00608   const WERD_CHOICE &word = *werd_res->best_choice;
00609   int dict_word_type = werd_res->tesseract->dict_word(word);
00610   return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
00611 }
00612 
00613 // Note: After running this function word_res->ratings
00614 // might not contain the right BLOB_CHOICE corresponding to each character
00615 // in word_res->best_choice.
00616 void Tesseract::flip_hyphens(WERD_RES *word_res) {
00617   WERD_CHOICE *best_choice = word_res->best_choice;
00618   int i;
00619   int prev_right = -9999;
00620   int next_left;
00621   TBOX out_box;
00622   float aspect_ratio;
00623 
00624   if (tessedit_lower_flip_hyphen <= 1)
00625     return;
00626 
00627   int num_blobs = word_res->rebuild_word->NumBlobs();
00628   UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
00629   for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
00630     TBLOB* blob = word_res->rebuild_word->blobs[i];
00631     out_box = blob->bounding_box();
00632     if (i + 1 == num_blobs)
00633       next_left = 9999;
00634     else
00635       next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
00636     // Don't touch small or touching blobs - it is too dangerous.
00637     if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
00638         (out_box.left() > prev_right) && (out_box.right() < next_left)) {
00639       aspect_ratio = out_box.width() / (float) out_box.height();
00640       if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
00641         if (aspect_ratio >= tessedit_upper_flip_hyphen &&
00642             word_res->uch_set->contains_unichar_id(unichar_dash) &&
00643             word_res->uch_set->get_enabled(unichar_dash)) {
00644           /* Certain HYPHEN */
00645           best_choice->set_unichar_id(unichar_dash, i);
00646           if (word_res->reject_map[i].rejected())
00647             word_res->reject_map[i].setrej_hyphen_accept();
00648         }
00649         if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
00650           word_res->reject_map[i].accepted())
00651                                  //Suspected HYPHEN
00652           word_res->reject_map[i].setrej_hyphen ();
00653       }
00654       else if (best_choice->unichar_id(i) == unichar_dash) {
00655         if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
00656           (word_res->reject_map[i].rejected()))
00657           word_res->reject_map[i].setrej_hyphen_accept();
00658         //Certain HYPHEN
00659 
00660         if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
00661           (word_res->reject_map[i].accepted()))
00662                                  //Suspected HYPHEN
00663           word_res->reject_map[i].setrej_hyphen();
00664       }
00665     }
00666     prev_right = out_box.right();
00667   }
00668 }
00669 
00670 // Note: After running this function word_res->ratings
00671 // might not contain the right BLOB_CHOICE corresponding to each character
00672 // in word_res->best_choice.
// Rewrites "0" and "O" characters in word_res->best_choice according to the
// characters around them, using the pattern rules labelled in the comments
// below. Returns without changes when tessedit_flip_0O is off, when any
// upper-case or digit blob fails the vertical-extent test in the first loop,
// or when either "0" or "O" is missing from or disabled in the unicharset.
00673 void Tesseract::flip_0O(WERD_RES *word_res) {
00674   WERD_CHOICE *best_choice = word_res->best_choice;
00675   int i;
00676   TBOX out_box;
00677 
00678   if (!tessedit_flip_0O)
00679     return;
00680 
        // Bail out if any upper-case/digit blob has its top below
        // kBlnBaselineOffset + kBlnXHeight or its bottom above
        // kBlnBaselineOffset + kBlnXHeight / 4.
00681   int num_blobs = word_res->rebuild_word->NumBlobs();
00682   for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
00683     TBLOB* blob = word_res->rebuild_word->blobs[i];
00684     if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
00685         word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
00686       out_box = blob->bounding_box();
00687       if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
00688         (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
00689         return;                  //Beware words with sub/superscripts
00690     }
00691   }
00692   UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
00693   UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
00694   if (unichar_0 == INVALID_UNICHAR_ID ||
00695       !word_res->uch_set->get_enabled(unichar_0) ||
00696       unichar_O == INVALID_UNICHAR_ID ||
00697       !word_res->uch_set->get_enabled(unichar_O)) {
00698     return;  // 0 or O are not present/enabled in unicharset
00699   }
        // The scan starts at index 1, so i-1 is always a valid index. The rules
        // are applied one after another to the same position; several of them
        // advance i, so later rules in the same iteration see the new index
        // and any characters already rewritten.
00700   for (i = 1; i < best_choice->length(); ++i) {
00701     if (best_choice->unichar_id(i) == unichar_0 ||
00702         best_choice->unichar_id(i) == unichar_O) {
00703       /* A0A */
00704       if ((i+1) < best_choice->length() &&
00705           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00706           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
00707         best_choice->set_unichar_id(unichar_O, i);
00708       }
00709       /* A00A */
00710       if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00711           (i+1) < best_choice->length() &&
00712           (best_choice->unichar_id(i+1) == unichar_0 ||
00713            best_choice->unichar_id(i+1) == unichar_O) &&
00714           (i+2) < best_choice->length() &&
00715           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
00716         best_choice->set_unichar_id(unichar_O, i);
00717         i++;
00718       }
00719       /* AA0<non digit or end of word> */
00720       if ((i > 1) &&
00721           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
00722           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00723           (((i+1) < best_choice->length() &&
00724             !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
00725             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
00726             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
00727            (i == best_choice->length() - 1))) {
00728         best_choice->set_unichar_id(unichar_O, i);
00729       }
00730       /* 9O9 */
00731       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00732           (i+1) < best_choice->length() &&
00733           non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
00734         best_choice->set_unichar_id(unichar_0, i);
00735       }
00736       /* 9OOO */
00737       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00738           (i+2) < best_choice->length() &&
00739           (best_choice->unichar_id(i+1) == unichar_0 ||
00740            best_choice->unichar_id(i+1) == unichar_O) &&
00741           (best_choice->unichar_id(i+2) == unichar_0 ||
00742            best_choice->unichar_id(i+2) == unichar_O)) {
00743         best_choice->set_unichar_id(unichar_0, i);
00744         best_choice->set_unichar_id(unichar_0, i+1);
00745         best_choice->set_unichar_id(unichar_0, i+2);
00746         i += 2;
00747       }
00748       /* 9OO<non upper> */
00749       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00750           (i+2) < best_choice->length() &&
00751           (best_choice->unichar_id(i+1) == unichar_0 ||
00752           best_choice->unichar_id(i+1) == unichar_O) &&
00753           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
00754         best_choice->set_unichar_id(unichar_0, i);
00755         best_choice->set_unichar_id(unichar_0, i+1);
00756         i++;
00757       }
00758       /* 9O<non upper> */
00759       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00760           (i+1) < best_choice->length() &&
00761           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
00762         best_choice->set_unichar_id(unichar_0, i);
00763       }
00764       /* 9[.,]OOO.. */
00765       if ((i > 1) &&
00766           (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
00767               word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
00768           (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
00769            best_choice->unichar_id(i-2) == unichar_O)) {
00770         if (best_choice->unichar_id(i-2) == unichar_O) {
00771           best_choice->set_unichar_id(unichar_0, i-2);
00772         }
              // Turn the whole run of 0/O after the separator into "0", then
              // step back one so the loop's ++i lands just past the run.
00773         while (i < best_choice->length() &&
00774                (best_choice->unichar_id(i) == unichar_O ||
00775                 best_choice->unichar_id(i) == unichar_0)) {
00776           best_choice->set_unichar_id(unichar_0, i);
00777           i++;
00778         }
00779         i--;
00780       }
00781     }
00782   }
00783 }
00784 
00785 BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
00786   return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
00787 }
00788 
00789 BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
00790   return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
00791 }
00792 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines