tesseract 3.04.01

wordrec/segsearch.cpp

Go to the documentation of this file.
00001 
00002 // File:        segsearch.h
00003 // Description: Segmentation search functions.
00004 // Author:      Daria Antonova
00005 // Created:     Mon Jun 23 11:26:43 PDT 2008
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "wordrec.h"
00021 
00022 #include "associate.h"
00023 #include "language_model.h"
00024 #include "matrix.h"
00025 #include "params.h"
00026 #include "lm_pain_points.h"
00027 #include "ratngs.h"
00028 
00029 namespace tesseract {
00030 
00031 void Wordrec::DoSegSearch(WERD_RES* word_res) {
00032   BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
00033   // Run Segmentation Search.
00034   SegSearch(word_res, &best_choice_bundle, NULL);
00035 }
00036 
00037 void Wordrec::SegSearch(WERD_RES* word_res,
00038                         BestChoiceBundle* best_choice_bundle,
00039                         BlamerBundle* blamer_bundle) {
00040   LMPainPoints pain_points(segsearch_max_pain_points,
00041                            segsearch_max_char_wh_ratio,
00042                            assume_fixed_pitch_char_segment,
00043                            &getDict(), segsearch_debug_level);
00044   // Compute scaling factor that will help us recover blob outline length
00045   // from classifier rating and certainty for the blob.
00046   float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
00047   GenericVector<SegSearchPending> pending;
00048   InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
00049                    blamer_bundle);
00050 
00051   if (!SegSearchDone(0)) {  // find a better choice
00052     if (chop_enable && word_res->chopped_word != NULL) {
00053       improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
00054                           blamer_bundle, &pain_points, &pending);
00055     }
00056     if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);
00057 
00058     if (blamer_bundle != NULL &&
00059         !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
00060       blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
00061     }
00062   }
00063   // Keep trying to find a better path by fixing the "pain points".
00064 
00065   MATRIX_COORD pain_point;
00066   float pain_point_priority;
00067   int num_futile_classifications = 0;
00068   STRING blamer_debug;
00069   while (wordrec_enable_assoc &&
00070       (!SegSearchDone(num_futile_classifications) ||
00071           (blamer_bundle != NULL &&
00072               blamer_bundle->GuidedSegsearchStillGoing()))) {
00073     // Get the next valid "pain point".
00074     bool found_nothing = true;
00075     LMPainPointsType pp_type;
00076     while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
00077         LM_PPTYPE_NUM) {
00078       if (!pain_point.Valid(*word_res->ratings)) {
00079         word_res->ratings->IncreaseBandSize(
00080             pain_point.row - pain_point.col + 1);
00081       }
00082       if (pain_point.Valid(*word_res->ratings) &&
00083           !word_res->ratings->Classified(pain_point.col, pain_point.row,
00084                                          getDict().WildcardID())) {
00085         found_nothing = false;
00086         break;
00087       }
00088     }
00089     if (found_nothing) {
00090       if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
00091       break;
00092     }
00093     ProcessSegSearchPainPoint(pain_point_priority, pain_point,
00094                               LMPainPoints::PainPointDescription(pp_type),
00095                               &pending, word_res, &pain_points, blamer_bundle);
00096 
00097     UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
00098                          word_res, &pain_points, best_choice_bundle,
00099                          blamer_bundle);
00100     if (!best_choice_bundle->updated) ++num_futile_classifications;
00101 
00102     if (segsearch_debug_level > 0) {
00103       tprintf("num_futile_classifications %d\n", num_futile_classifications);
00104     }
00105 
00106     best_choice_bundle->updated = false;  // reset updated
00107 
00108     // See if it's time to terminate SegSearch or time for starting a guided
00109     // search for the true path to find the blame for the incorrect best_choice.
00110     if (SegSearchDone(num_futile_classifications) &&
00111         blamer_bundle != NULL &&
00112         blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
00113       InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
00114                              &blamer_debug);
00115     }
00116   }  // end while loop exploring alternative paths
00117   if (blamer_bundle != NULL) {
00118     blamer_bundle->FinishSegSearch(word_res->best_choice,
00119                                    wordrec_debug_blamer, &blamer_debug);
00120   }
00121 
00122   if (segsearch_debug_level > 0) {
00123     tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
00124             language_model_->AcceptableChoiceFound());
00125   }
00126 }
00127 
00128 // Setup and run just the initial segsearch on an established matrix,
00129 // without doing any additional chopping or joining.
00130 void Wordrec::WordSearch(WERD_RES* word_res) {
00131   LMPainPoints pain_points(segsearch_max_pain_points,
00132                            segsearch_max_char_wh_ratio,
00133                            assume_fixed_pitch_char_segment,
00134                            &getDict(), segsearch_debug_level);
00135   GenericVector<SegSearchPending> pending;
00136   BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
00137   // Run Segmentation Search.
00138   InitialSegSearch(word_res, &pain_points, &pending, &best_choice_bundle, NULL);
00139   if (segsearch_debug_level > 0) {
00140     tprintf("Ending ratings matrix%s:\n",
00141             wordrec_enable_assoc ? " (with assoc)" : "");
00142     word_res->ratings->print(getDict().getUnicharset());
00143   }
00144 }
00145 
00146 
00147 // Setup and run just the initial segsearch on an established matrix,
00148 // without doing any additional chopping or joining.
00149 // (Internal factored version that can be used as part of the main SegSearch.)
00150 void Wordrec::InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
00151                                GenericVector<SegSearchPending>* pending,
00152                                BestChoiceBundle* best_choice_bundle,
00153                                BlamerBundle* blamer_bundle) {
00154   if (segsearch_debug_level > 0) {
00155     tprintf("Starting SegSearch on ratings matrix%s:\n",
00156             wordrec_enable_assoc ? " (with assoc)" : "");
00157     word_res->ratings->print(getDict().getUnicharset());
00158   }
00159 
00160   pain_points->GenerateInitial(word_res);
00161 
00162   // Compute scaling factor that will help us recover blob outline length
00163   // from classifier rating and certainty for the blob.
00164   float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
00165 
00166   language_model_->InitForWord(prev_word_best_choice_,
00167                                assume_fixed_pitch_char_segment,
00168                                segsearch_max_char_wh_ratio, rating_cert_scale);
00169 
00170   // Initialize blamer-related information: map character boxes recorded in
00171   // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
00172   // ratings matrix. We expect this step to succeed, since when running the
00173   // chopper we checked that the correct chops are present.
00174   if (blamer_bundle != NULL) {
00175     blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
00176                                             wordrec_debug_blamer);
00177   }
00178 
00179   // pending[col] tells whether there is update work to do to combine
00180   // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
00181   // As the language model state is updated, pending entries are modified to
00182   // minimize duplication of work. It is important that during the update the
00183   // children are considered in the non-decreasing order of their column, since
00184   // this guarantees that all the parents would be up to date before an update
00185   // of a child is done.
00186   pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
00187 
00188   // Search the ratings matrix for the initial best path.
00189   (*pending)[0].SetColumnClassified();
00190   UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
00191                        pain_points, best_choice_bundle, blamer_bundle);
00192 }
00193 
00194 void Wordrec::UpdateSegSearchNodes(
00195     float rating_cert_scale,
00196     int starting_col,
00197     GenericVector<SegSearchPending>* pending,
00198     WERD_RES *word_res,
00199     LMPainPoints *pain_points,
00200     BestChoiceBundle *best_choice_bundle,
00201     BlamerBundle *blamer_bundle) {
00202   MATRIX *ratings = word_res->ratings;
00203   ASSERT_HOST(ratings->dimension() == pending->size());
00204   ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
00205   for (int col = starting_col; col < ratings->dimension(); ++col) {
00206     if (!(*pending)[col].WorkToDo()) continue;
00207     int first_row = col;
00208     int last_row = MIN(ratings->dimension() - 1,
00209                        col + ratings->bandwidth() - 1);
00210     if ((*pending)[col].SingleRow() >= 0) {
00211       first_row = last_row = (*pending)[col].SingleRow();
00212     }
00213     if (segsearch_debug_level > 0) {
00214       tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
00215               col, first_row, last_row,
00216               (*pending)[col].IsRowJustClassified(MAX_INT32));
00217     }
00218     // Iterate over the pending list for this column.
00219     for (int row = first_row; row <= last_row; ++row) {
00220       // Update language model state of this child+parent pair.
00221       BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
00222       LanguageModelState *parent_node =
00223           col == 0 ? NULL : best_choice_bundle->beam[col - 1];
00224       if (current_node != NULL &&
00225           language_model_->UpdateState((*pending)[col].IsRowJustClassified(row),
00226                                        col, row, current_node, parent_node,
00227                                        pain_points, word_res,
00228                                        best_choice_bundle, blamer_bundle) &&
00229           row + 1 < ratings->dimension()) {
00230         // Since the language model state of this entry changed, process all
00231         // the child column.
00232         (*pending)[row + 1].RevisitWholeColumn();
00233         if (segsearch_debug_level > 0) {
00234           tprintf("Added child col=%d to pending\n", row + 1);
00235         }
00236       }  // end if UpdateState.
00237     }  // end for row.
00238   }  // end for col.
00239   if (best_choice_bundle->best_vse != NULL) {
00240     ASSERT_HOST(word_res->StatesAllValid());
00241     if (best_choice_bundle->best_vse->updated) {
00242       pain_points->GenerateFromPath(rating_cert_scale,
00243                                     best_choice_bundle->best_vse, word_res);
00244       if (!best_choice_bundle->fixpt.empty()) {
00245         pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
00246                                         best_choice_bundle->best_vse, word_res);
00247       }
00248     }
00249   }
00250   // The segsearch is completed. Reset all updated flags on all VSEs and reset
00251   // all pendings.
00252   for (int col = 0; col < pending->size(); ++col) {
00253     (*pending)[col].Clear();
00254     ViterbiStateEntry_IT
00255         vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
00256     for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
00257       vse_it.data()->updated = false;
00258     }
00259   }
00260 }
00261 
00262 void Wordrec::ProcessSegSearchPainPoint(
00263     float pain_point_priority,
00264     const MATRIX_COORD &pain_point, const char* pain_point_type,
00265     GenericVector<SegSearchPending>* pending, WERD_RES *word_res,
00266     LMPainPoints *pain_points, BlamerBundle *blamer_bundle) {
00267   if (segsearch_debug_level > 0) {
00268     tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
00269             pain_point_type, pain_point_priority,
00270             pain_point.col, pain_point.row);
00271   }
00272   ASSERT_HOST(pain_points != NULL);
00273   MATRIX *ratings = word_res->ratings;
00274   // Classify blob [pain_point.col pain_point.row]
00275   if (!pain_point.Valid(*ratings)) {
00276     ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
00277   }
00278   ASSERT_HOST(pain_point.Valid(*ratings));
00279   BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
00280                                                 pain_point.col, pain_point.row,
00281                                                 pain_point_type,
00282                                                 word_res->chopped_word,
00283                                                 blamer_bundle);
00284   BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
00285   if (lst == NULL) {
00286     ratings->put(pain_point.col, pain_point.row, classified);
00287   } else {
00288     // We can not delete old BLOB_CHOICEs, since they might contain
00289     // ViterbiStateEntries that are parents of other "active" entries.
00290     // Thus if the matrix cell already contains classifications we add
00291     // the new ones to the beginning of the list.
00292     BLOB_CHOICE_IT it(lst);
00293     it.add_list_before(classified);
00294     delete classified;  // safe to delete, since empty after add_list_before()
00295     classified = NULL;
00296   }
00297 
00298   if (segsearch_debug_level > 0) {
00299     print_ratings_list("Updated ratings matrix with a new entry:",
00300                        ratings->get(pain_point.col, pain_point.row),
00301                        getDict().getUnicharset());
00302     ratings->print(getDict().getUnicharset());
00303   }
00304 
00305   // Insert initial "pain points" to join the newly classified blob
00306   // with its left and right neighbors.
00307   if (classified != NULL && !classified->empty()) {
00308     if (pain_point.col > 0) {
00309       pain_points->GeneratePainPoint(
00310           pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
00311           true, segsearch_max_char_wh_ratio, word_res);
00312     }
00313     if (pain_point.row + 1 < ratings->dimension()) {
00314       pain_points->GeneratePainPoint(
00315           pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
00316           true, segsearch_max_char_wh_ratio, word_res);
00317     }
00318   }
00319   (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
00320 }
00321 
00322 // Resets enough of the results so that the Viterbi search is re-run.
00323 // Needed when the n-gram model is enabled, as the multi-length comparison
00324 // implementation will re-value existing paths to worse values.
00325 void Wordrec::ResetNGramSearch(WERD_RES* word_res,
00326                                BestChoiceBundle* best_choice_bundle,
00327                                GenericVector<SegSearchPending>* pending) {
00328   // TODO(rays) More refactoring required here.
00329   // Delete existing viterbi states.
00330   for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {
00331     best_choice_bundle->beam[col]->Clear();
00332   }
00333   // Reset best_choice_bundle.
00334   word_res->ClearWordChoices();
00335   best_choice_bundle->best_vse = NULL;
00336   // Clear out all existing pendings and add a new one for the first column.
00337   (*pending)[0].SetColumnClassified();
00338   for (int i = 1; i < pending->size(); ++i)
00339     (*pending)[i].Clear();
00340 }
00341 
00342 void Wordrec::InitBlamerForSegSearch(WERD_RES *word_res,
00343                                      LMPainPoints *pain_points,
00344                                      BlamerBundle *blamer_bundle,
00345                                      STRING *blamer_debug) {
00346   pain_points->Clear();  // Clear pain points heap.
00347   TessResultCallback2<bool, int, int>* pp_cb = NewPermanentTessCallback(
00348       pain_points, &LMPainPoints::GenerateForBlamer,
00349       static_cast<double>(segsearch_max_char_wh_ratio), word_res);
00350   blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings,
00351                                   getDict().WildcardID(), wordrec_debug_blamer,
00352                                   blamer_debug, pp_cb);
00353   delete pp_cb;
00354 }
00355 
00356 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines