tesseract 3.04.01

dict/stopper.cpp

Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:    stopper.c
00003  **     Purpose:     Stopping criteria for word classifier.
00004  **     Author:      Dan Johnson
00005  **     History:     Mon Apr 29 14:56:49 1991, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 
00019 #include <stdio.h>
00020 #include <string.h>
00021 #include <ctype.h>
00022 #include <math.h>
00023 
00024 #include "stopper.h"
00025 #include "ambigs.h"
00026 #include "ccutil.h"
00027 #include "const.h"
00028 #include "danerror.h"
00029 #include "dict.h"
00030 #include "efio.h"
00031 #include "helpers.h"
00032 #include "matchdefs.h"
00033 #include "pageres.h"
00034 #include "params.h"
00035 #include "ratngs.h"
00036 #include "scanutils.h"
00037 #include "unichar.h"
00038 
00039 #ifdef _MSC_VER
00040 #pragma warning(disable:4244)  // Conversion warnings
00041 #pragma warning(disable:4800)  // int/bool warnings
00042 #endif
00043 
00044 using tesseract::ScriptPos;
00045 /*----------------------------------------------------------------------------
00046               Private Code
00047 ----------------------------------------------------------------------------*/
00048 
00049 namespace tesseract {
00050 
00051 bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
00052                             XHeightConsistencyEnum xheight_consistency) {
00053   float CertaintyThreshold = stopper_nondict_certainty_base;
00054   int WordSize;
00055 
00056   if (stopper_no_acceptable_choices) return false;
00057 
00058   if (best_choice.length() == 0) return false;
00059 
00060   bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
00061   bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
00062   bool is_case_ok = case_ok(best_choice, getUnicharset());
00063 
00064   if (stopper_debug_level >= 1) {
00065     const char *xht = "UNKNOWN";
00066     switch (xheight_consistency) {
00067       case XH_GOOD:  xht = "NORMAL"; break;
00068       case XH_SUBNORMAL:  xht = "SUBNORMAL"; break;
00069       case XH_INCONSISTENT:  xht = "INCONSISTENT"; break;
00070       default: xht = "UNKNOWN";
00071     }
00072     tprintf("\nStopper:  %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
00073             best_choice.unichar_string().string(),
00074             (is_valid_word ? 'y' : 'n'),
00075             (is_case_ok ? 'y' : 'n'),
00076             xht,
00077             best_choice.min_x_height(),
00078             best_choice.max_x_height());
00079   }
00080   // Do not accept invalid words in PASS1.
00081   if (reject_offset_ <= 0.0f && !is_valid_word) return false;
00082   if (is_valid_word && is_case_ok) {
00083     WordSize = LengthOfShortestAlphaRun(best_choice);
00084     WordSize -= stopper_smallword_size;
00085     if (WordSize < 0)
00086       WordSize = 0;
00087     CertaintyThreshold += WordSize * stopper_certainty_per_char;
00088   }
00089 
00090   if (stopper_debug_level >= 1)
00091     tprintf("Stopper:  Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
00092             best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
00093 
00094   if (no_dang_ambigs &&
00095       best_choice.certainty() > CertaintyThreshold &&
00096       xheight_consistency < XH_INCONSISTENT &&
00097       UniformCertainties(best_choice)) {
00098     return true;
00099   } else {
00100     if (stopper_debug_level >= 1) {
00101       tprintf("AcceptableChoice() returned false"
00102               " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
00103               no_dang_ambigs, best_choice.certainty(),
00104               CertaintyThreshold,
00105               UniformCertainties(best_choice));
00106     }
00107     return false;
00108   }
00109 }
00110 
00111 bool Dict::AcceptableResult(WERD_RES* word) {
00112   if (word->best_choice == NULL) return false;
00113   float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
00114   int WordSize;
00115 
00116   if (stopper_debug_level >= 1) {
00117     tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
00118             word->best_choice->debug_string().string(),
00119             (valid_word(*word->best_choice) ? 'y' : 'n'),
00120             (case_ok(*word->best_choice, getUnicharset()) ? 'y' : 'n'),
00121             word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
00122             word->best_choices.singleton() ? 'n' : 'y');
00123   }
00124 
00125   if (word->best_choice->length() == 0 || !word->best_choices.singleton())
00126     return false;
00127   if (valid_word(*word->best_choice) &&
00128       case_ok(*word->best_choice, getUnicharset())) {
00129     WordSize = LengthOfShortestAlphaRun(*word->best_choice);
00130     WordSize -= stopper_smallword_size;
00131     if (WordSize < 0)
00132       WordSize = 0;
00133     CertaintyThreshold += WordSize * stopper_certainty_per_char;
00134   }
00135 
00136   if (stopper_debug_level >= 1)
00137     tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ",
00138             word->best_choice->certainty(), CertaintyThreshold);
00139 
00140   if (word->best_choice->certainty() > CertaintyThreshold &&
00141       !stopper_no_acceptable_choices) {
00142     if (stopper_debug_level >= 1)
00143       tprintf("ACCEPTED\n");
00144     return true;
00145   } else {
00146     if (stopper_debug_level >= 1)
00147       tprintf("REJECTED\n");
00148     return false;
00149   }
00150 }
00151 
00152 bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
00153                             DANGERR *fixpt,
00154                             bool fix_replaceable,
00155                             MATRIX *ratings) {
00156   if (stopper_debug_level > 2) {
00157     tprintf("\nRunning NoDangerousAmbig() for %s\n",
00158             best_choice->debug_string().string());
00159   }
00160 
00161   // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
00162   // for each unichar id in BestChoice.
00163   BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
00164   int i;
00165   bool ambigs_found = false;
00166   // For each position in best_choice:
00167   // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
00168   // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
00169   // -- look for ambiguities corresponding to wrong_ngram in the list while
00170   //    adding the following unichar_ids from best_choice to wrong_ngram
00171   //
00172   // Repeat the above procedure twice: first time look through
00173   // ambigs to be replaced and replace all the ambiguities found;
00174   // second time look through dangerous ambiguities and construct
00175   // ambig_blob_choices with fake a blob choice for each ambiguity
00176   // and pass them to dawg_permute_and_select() to search for
00177   // ambiguous words in the dictionaries.
00178   //
00179   // Note that during the execution of the for loop (on the first pass)
00180   // if replacements are made the length of best_choice might change.
00181   for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
00182     bool replace = (fix_replaceable && pass == 0);
00183     const UnicharAmbigsVector &table = replace ?
00184       getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
00185     if (!replace) {
00186       // Initialize ambig_blob_choices with lists containing a single
00187       // unichar id for the correspoding position in best_choice.
00188       // best_choice consisting from only the original letters will
00189       // have a rating of 0.0.
00190       for (i = 0; i < best_choice->length(); ++i) {
00191         BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
00192         BLOB_CHOICE_IT lst_it(lst);
00193         // TODO(rays/antonova) Put real xheights and y shifts here.
00194         lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
00195                                           0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
00196         ambig_blob_choices.push_back(lst);
00197       }
00198     }
00199     UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
00200     int wrong_ngram_index;
00201     int next_index;
00202     int blob_index = 0;
00203     for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
00204          ++i) {
00205       UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
00206       if (stopper_debug_level > 2) {
00207         tprintf("Looking for %s ngrams starting with %s:\n",
00208                 replace ? "replaceable" : "ambiguous",
00209                 getUnicharset().debug_str(curr_unichar_id).string());
00210       }
00211       int num_wrong_blobs = best_choice->state(i);
00212       wrong_ngram_index = 0;
00213       wrong_ngram[wrong_ngram_index] = curr_unichar_id;
00214       if (curr_unichar_id == INVALID_UNICHAR_ID ||
00215           curr_unichar_id >= table.size() ||
00216           table[curr_unichar_id] == NULL) {
00217         continue;  // there is no ambig spec for this unichar id
00218       }
00219       AmbigSpec_IT spec_it(table[curr_unichar_id]);
00220       for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
00221         const AmbigSpec *ambig_spec = spec_it.data();
00222         wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
00223         int compare = UnicharIdArrayUtils::compare(wrong_ngram,
00224                                                    ambig_spec->wrong_ngram);
00225         if (stopper_debug_level > 2) {
00226           tprintf("candidate ngram: ");
00227           UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
00228           tprintf("current ngram from spec: ");
00229           UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
00230           tprintf("comparison result: %d\n", compare);
00231         }
00232         if (compare == 0) {
00233           // Record the place where we found an ambiguity.
00234           if (fixpt != NULL) {
00235             UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
00236             fixpt->push_back(DANGERR_INFO(
00237                 blob_index, blob_index + num_wrong_blobs, replace,
00238                 getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
00239                 leftmost_id));
00240             if (stopper_debug_level > 1) {
00241               tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
00242                       blob_index + num_wrong_blobs, false,
00243                       getUnicharset().get_isngram(
00244                           ambig_spec->correct_ngram_id),
00245                       getUnicharset().id_to_unichar(leftmost_id));
00246             }
00247           }
00248 
00249           if (replace) {
00250             if (stopper_debug_level > 2) {
00251               tprintf("replace ambiguity with %s : ",
00252                       getUnicharset().id_to_unichar(
00253                           ambig_spec->correct_ngram_id));
00254               UnicharIdArrayUtils::print(
00255                   ambig_spec->correct_fragments, getUnicharset());
00256             }
00257             ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
00258                          ambig_spec->correct_ngram_id,
00259                          best_choice, ratings);
00260           } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
00261             // We found dang ambig - update ambig_blob_choices.
00262             if (stopper_debug_level > 2) {
00263               tprintf("found ambiguity: ");
00264               UnicharIdArrayUtils::print(
00265                   ambig_spec->correct_fragments, getUnicharset());
00266             }
00267             ambigs_found = true;
00268             for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
00269                  ++tmp_index) {
00270               // Add a blob choice for the corresponding fragment of the
00271               // ambiguity. These fake blob choices are initialized with
00272               // negative ratings (which are not possible for real blob
00273               // choices), so that dawg_permute_and_select() considers any
00274               // word not consisting of only the original letters a better
00275               // choice and stops searching for alternatives once such a
00276               // choice is found.
00277               BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
00278               bc_it.add_to_end(new BLOB_CHOICE(
00279                   ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
00280                   -1, 0, 1, 0, BCC_AMBIG));
00281             }
00282           }
00283           spec_it.forward();
00284         } else if (compare == -1) {
00285           if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
00286               ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
00287             // Add the next unichar id to wrong_ngram and keep looking for
00288             // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
00289             wrong_ngram[++wrong_ngram_index] =
00290               best_choice->unichar_id(next_index);
00291             num_wrong_blobs += best_choice->state(next_index);
00292           } else {
00293             break;  // no more matching ambigs in this AMBIG_SPEC_LIST
00294           }
00295         } else {
00296           spec_it.forward();
00297         }
00298       }  // end searching AmbigSpec_LIST
00299     }  // end searching best_choice
00300   }  // end searching replace and dangerous ambigs
00301 
00302   // If any ambiguities were found permute the constructed ambig_blob_choices
00303   // to see if an alternative dictionary word can be found.
00304   if (ambigs_found) {
00305     if (stopper_debug_level > 2) {
00306       tprintf("\nResulting ambig_blob_choices:\n");
00307       for (i = 0; i < ambig_blob_choices.length(); ++i) {
00308         print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
00309         tprintf("\n");
00310       }
00311     }
00312     WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
00313     ambigs_found = (alt_word->rating() < 0.0);
00314     if (ambigs_found) {
00315       if (stopper_debug_level >= 1) {
00316         tprintf ("Stopper: Possible ambiguous word = %s\n",
00317                  alt_word->debug_string().string());
00318       }
00319       if (fixpt != NULL) {
00320         // Note: Currently character choices combined from fragments can only
00321         // be generated by NoDangrousAmbigs(). This code should be updated if
00322         // the capability to produce classifications combined from character
00323         // fragments is added to other functions.
00324         int orig_i = 0;
00325         for (i = 0; i < alt_word->length(); ++i) {
00326           const UNICHARSET &uchset = getUnicharset();
00327           bool replacement_is_ngram =
00328               uchset.get_isngram(alt_word->unichar_id(i));
00329           UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
00330           if (replacement_is_ngram) {
00331             // we have to extract the leftmost unichar from the ngram.
00332             const char *str = uchset.id_to_unichar(leftmost_id);
00333             int step = uchset.step(str);
00334             if (step) leftmost_id = uchset.unichar_to_id(str, step);
00335           }
00336           int end_i = orig_i + alt_word->state(i);
00337           if (alt_word->state(i) > 1 ||
00338               (orig_i + 1 == end_i && replacement_is_ngram)) {
00339             // Compute proper blob indices.
00340             int blob_start = 0;
00341             for (int j = 0; j < orig_i; ++j)
00342               blob_start += best_choice->state(j);
00343             int blob_end = blob_start;
00344             for (int j = orig_i; j < end_i; ++j)
00345               blob_end += best_choice->state(j);
00346             fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
00347                                           replacement_is_ngram, leftmost_id));
00348             if (stopper_debug_level > 1) {
00349               tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
00350                       true, replacement_is_ngram,
00351                       uchset.id_to_unichar(leftmost_id));
00352             }
00353           }
00354           orig_i += alt_word->state(i);
00355         }
00356       }
00357     }
00358     delete alt_word;
00359   }
00360   if (output_ambig_words_file_ != NULL) {
00361     fprintf(output_ambig_words_file_, "\n");
00362   }
00363 
00364   ambig_blob_choices.delete_data_pointers();
00365   return !ambigs_found;
00366 }
00367 
00368 void Dict::EndDangerousAmbigs() {}
00369 
00370 void Dict::SettupStopperPass1() {
00371   reject_offset_ = 0.0;
00372 }
00373 
00374 void Dict::SettupStopperPass2() {
00375   reject_offset_ = stopper_phase2_certainty_rejection_offset;
00376 }
00377 
00378 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
00379                         UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
00380                         MATRIX *ratings) {
00381   int num_blobs_to_replace = 0;
00382   int begin_blob_index = 0;
00383   int i;
00384   // Rating and certainty for the new BLOB_CHOICE are derived from the
00385   // replaced choices.
00386   float new_rating = 0.0f;
00387   float new_certainty = 0.0f;
00388   BLOB_CHOICE* old_choice = NULL;
00389   for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
00390     if (i >= wrong_ngram_begin_index) {
00391       int num_blobs = werd_choice->state(i);
00392       int col = begin_blob_index + num_blobs_to_replace;
00393       int row = col + num_blobs - 1;
00394       BLOB_CHOICE_LIST* choices = ratings->get(col, row);
00395       ASSERT_HOST(choices != NULL);
00396       old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
00397       ASSERT_HOST(old_choice != NULL);
00398       new_rating += old_choice->rating();
00399       new_certainty += old_choice->certainty();
00400       num_blobs_to_replace += num_blobs;
00401     } else {
00402       begin_blob_index += werd_choice->state(i);
00403     }
00404   }
00405   new_certainty /= wrong_ngram_size;
00406   // If there is no entry in the ratings matrix, add it.
00407   MATRIX_COORD coord(begin_blob_index,
00408                      begin_blob_index + num_blobs_to_replace - 1);
00409   if (!coord.Valid(*ratings)) {
00410     ratings->IncreaseBandSize(coord.row - coord.col + 1);
00411   }
00412   if (ratings->get(coord.col, coord.row) == NULL)
00413     ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
00414   BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
00415   BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
00416   if (choice != NULL) {
00417     // Already there. Upgrade if new rating better.
00418     if (new_rating < choice->rating())
00419       choice->set_rating(new_rating);
00420     if (new_certainty < choice->certainty())
00421       choice->set_certainty(new_certainty);
00422     // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
00423   } else {
00424     // Need a new choice with the correct_ngram_id.
00425     choice = new BLOB_CHOICE(*old_choice);
00426     choice->set_unichar_id(correct_ngram_id);
00427     choice->set_rating(new_rating);
00428     choice->set_certainty(new_certainty);
00429     choice->set_classifier(BCC_AMBIG);
00430     choice->set_matrix_cell(coord.col, coord.row);
00431     BLOB_CHOICE_IT it (new_choices);
00432     it.add_to_end(choice);
00433   }
00434   // Remove current unichar from werd_choice. On the last iteration
00435   // set the correct replacement unichar instead of removing a unichar.
00436   for (int replaced_count = 0; replaced_count < wrong_ngram_size;
00437        ++replaced_count) {
00438     if (replaced_count + 1 == wrong_ngram_size) {
00439       werd_choice->set_blob_choice(wrong_ngram_begin_index,
00440                                    num_blobs_to_replace, choice);
00441     } else {
00442       werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
00443     }
00444   }
00445   if (stopper_debug_level >= 1) {
00446       werd_choice->print("ReplaceAmbig() ");
00447       tprintf("Modified blob_choices: ");
00448       print_ratings_list("\n", new_choices, getUnicharset());
00449   }
00450 }
00451 
00452 int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
00453   int shortest = MAX_INT32;
00454   int curr_len = 0;
00455   for (int w = 0; w < WordChoice.length(); ++w) {
00456     if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
00457       curr_len++;
00458     } else if (curr_len > 0) {
00459       if (curr_len < shortest) shortest = curr_len;
00460       curr_len = 0;
00461     }
00462   }
00463   if (curr_len > 0 && curr_len < shortest) {
00464     shortest = curr_len;
00465   } else if (shortest == MAX_INT32) {
00466     shortest = 0;
00467   }
00468   return shortest;
00469 }
00470 
00471 int Dict::UniformCertainties(const WERD_CHOICE& word) {
00472   float Certainty;
00473   float WorstCertainty = MAX_FLOAT32;
00474   float CertaintyThreshold;
00475   FLOAT64 TotalCertainty;
00476   FLOAT64 TotalCertaintySquared;
00477   FLOAT64 Variance;
00478   FLOAT32 Mean, StdDev;
00479   int word_length = word.length();
00480 
00481   if (word_length < 3)
00482     return true;
00483 
00484   TotalCertainty = TotalCertaintySquared = 0.0;
00485   for (int i = 0; i < word_length; ++i) {
00486     Certainty = word.certainty(i);
00487     TotalCertainty += Certainty;
00488     TotalCertaintySquared += Certainty * Certainty;
00489     if (Certainty < WorstCertainty)
00490       WorstCertainty = Certainty;
00491   }
00492 
00493   // Subtract off worst certainty from statistics.
00494   word_length--;
00495   TotalCertainty -= WorstCertainty;
00496   TotalCertaintySquared -= WorstCertainty * WorstCertainty;
00497 
00498   Mean = TotalCertainty / word_length;
00499   Variance = ((word_length * TotalCertaintySquared -
00500     TotalCertainty * TotalCertainty) /
00501     (word_length * (word_length - 1)));
00502   if (Variance < 0.0)
00503     Variance = 0.0;
00504   StdDev = sqrt(Variance);
00505 
00506   CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
00507   if (CertaintyThreshold > stopper_nondict_certainty_base)
00508     CertaintyThreshold = stopper_nondict_certainty_base;
00509 
00510   if (word.certainty() < CertaintyThreshold) {
00511     if (stopper_debug_level >= 1)
00512       tprintf("Stopper: Non-uniform certainty = %4.1f"
00513               " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
00514               word.certainty(), Mean, StdDev, CertaintyThreshold);
00515     return false;
00516   } else {
00517     return true;
00518   }
00519 }
00520 
00521 } // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines