tesseract 3.04.01

ccstruct/pageres.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        pageres.cpp  (Formerly page_res.c)
00003  * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
00004  *              and an iterator class to iterate over the words.
00005  * Main purposes:
00006  *              Easy way to iterate over the words without a 3-nested loop.
00007  *              Holds data used during word recognition.
00008  *              Holds information about alternative spacing paths.
00009  * Author:      Phil Cheatle
00010  * Created:     Tue Sep 22 08:42:49 BST 1992
00011  *
00012  * (C) Copyright 1992, Hewlett-Packard Ltd.
00013  ** Licensed under the Apache License, Version 2.0 (the "License");
00014  ** you may not use this file except in compliance with the License.
00015  ** You may obtain a copy of the License at
00016  ** http://www.apache.org/licenses/LICENSE-2.0
00017  ** Unless required by applicable law or agreed to in writing, software
00018  ** distributed under the License is distributed on an "AS IS" BASIS,
00019  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00020  ** See the License for the specific language governing permissions and
00021  ** limitations under the License.
00022  *
00023  **********************************************************************/
00024 #include          <stdlib.h>
00025 #ifdef __UNIX__
00026 #include          <assert.h>
00027 #endif
00028 #include          "blamer.h"
00029 #include          "pageres.h"
00030 #include          "blobs.h"
00031 
00032 ELISTIZE (BLOCK_RES)
00033 CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES)
00034 
// Multiplier applied to the difference of two adjust factors when deriving
// the certainty threshold that decides whether a word is ambiguous.
static const double kStopperAmbiguityThresholdGain = 8.0;
// Fixed amount subtracted from the scaled adjust-factor difference when
// deriving the ambiguity threshold.
static const double kStopperAmbiguityThresholdOffset = 1.5;
// Upper limit on the number of broken pieces that may be associated.
const int kWordrecMaxNumJoinChunks = 4;
// A word whose box height exceeds this multiple of the line size is not
// processed as part of a line with other words.
const double kMaxWordSizeRatio = 1.25;
// A new word is not added if the combined box height would exceed this
// multiple of the line size.
const double kMaxLineSizeRatio = 1.25;
// A new word is not added if the gap to it exceeds this multiple of the
// line size.
const double kMaxWordGapRatio = 2.0;

// Returns the certainty-difference threshold used to decide which words to
// keep, given the adjust factors f1 and f2 of the two words:
//   (f2 - f1) * gain - offset.
// TODO(rays) This is horrible. Replace with an enhance params training model.
static double StopperAmbigThreshold(double f1, double f2) {
  const double factor_delta = f2 - f1;
  return factor_delta * kStopperAmbiguityThresholdGain -
      kStopperAmbiguityThresholdOffset;
}
00057 
00058 /*************************************************************************
00059  * PAGE_RES::PAGE_RES
00060  *
00061  * Constructor for page results
00062  *************************************************************************/
00063 PAGE_RES::PAGE_RES(
00064     bool merge_similar_words,
00065     BLOCK_LIST *the_block_list,
00066     WERD_CHOICE **prev_word_best_choice_ptr) {
00067   Init();
00068   BLOCK_IT block_it(the_block_list);
00069   BLOCK_RES_IT block_res_it(&block_res_list);
00070   for (block_it.mark_cycle_pt();
00071        !block_it.cycled_list(); block_it.forward()) {
00072     block_res_it.add_to_end(new BLOCK_RES(merge_similar_words,
00073                                           block_it.data()));
00074   }
00075   prev_word_best_choice = prev_word_best_choice_ptr;
00076 }
00077 
00078 /*************************************************************************
00079  * BLOCK_RES::BLOCK_RES
00080  *
00081  * Constructor for BLOCK results
00082  *************************************************************************/
00083 
00084 BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
00085   ROW_IT row_it (the_block->row_list ());
00086   ROW_RES_IT row_res_it(&row_res_list);
00087 
00088   char_count = 0;
00089   rej_count = 0;
00090   font_class = -1;               //not assigned
00091   x_height = -1.0;
00092   font_assigned = FALSE;
00093   bold = FALSE;
00094   italic = FALSE;
00095   row_count = 0;
00096 
00097   block = the_block;
00098 
00099   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00100     row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
00101   }
00102 }
00103 
00104 /*************************************************************************
00105  * ROW_RES::ROW_RES
00106  *
00107  * Constructor for ROW results
00108  *************************************************************************/
00109 
00110 ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
00111   WERD_IT word_it(the_row->word_list());
00112   WERD_RES_IT word_res_it(&word_res_list);
00113   WERD_RES *combo = NULL;        // current combination of fuzzies
00114   WERD *copy_word;
00115 
00116   char_count = 0;
00117   rej_count = 0;
00118   whole_word_rej_count = 0;
00119 
00120   row = the_row;
00121   bool add_next_word = false;
00122   TBOX union_box;
00123   float line_height = the_row->x_height() + the_row->ascenders() -
00124       the_row->descenders();
00125   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00126     WERD_RES* word_res = new WERD_RES(word_it.data());
00127     word_res->x_height = the_row->x_height();
00128     if (add_next_word) {
00129       ASSERT_HOST(combo != NULL);
00130       // We are adding this word to the combination.
00131       word_res->part_of_combo = TRUE;
00132       combo->copy_on(word_res);
00133     } else if (merge_similar_words) {
00134       union_box = word_res->word->bounding_box();
00135       add_next_word = !word_res->word->flag(W_REP_CHAR) &&
00136           union_box.height() <= line_height * kMaxWordSizeRatio;
00137       word_res->odd_size = !add_next_word;
00138     }
00139     WERD* next_word = word_it.data_relative(1);
00140     if (merge_similar_words) {
00141       if (add_next_word && !next_word->flag(W_REP_CHAR)) {
00142         // Next word will be added on if all of the following are true:
00143         // Not a rep char.
00144         // Box height small enough.
00145         // Union box height small enough.
00146         // Horizontal gap small enough.
00147         TBOX next_box = next_word->bounding_box();
00148         int prev_right = union_box.right();
00149         union_box += next_box;
00150         if (next_box.height() > line_height * kMaxWordSizeRatio ||
00151             union_box.height() > line_height * kMaxLineSizeRatio ||
00152             next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
00153           add_next_word = false;
00154         }
00155       }
00156       next_word->set_flag(W_FUZZY_NON, add_next_word);
00157     } else {
00158       add_next_word = next_word->flag(W_FUZZY_NON);
00159     }
00160     if (add_next_word) {
00161       if (combo == NULL) {
00162         copy_word = new WERD;
00163         *copy_word = *(word_it.data());  // deep copy
00164         combo = new WERD_RES(copy_word);
00165         combo->x_height = the_row->x_height();
00166         combo->combination = TRUE;
00167         word_res_it.add_to_end(combo);
00168       }
00169       word_res->part_of_combo = TRUE;
00170     } else {
00171       combo = NULL;
00172     }
00173     word_res_it.add_to_end(word_res);
00174   }
00175 }
00176 
00177 
00178 WERD_RES& WERD_RES::operator=(const WERD_RES & source) {
00179   this->ELIST_LINK::operator=(source);
00180   Clear();
00181   if (source.combination) {
00182     word = new WERD;
00183     *word = *(source.word);      // deep copy
00184   } else {
00185     word = source.word;          // pt to same word
00186   }
00187   if (source.bln_boxes != NULL)
00188     bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
00189   if (source.chopped_word != NULL)
00190     chopped_word = new TWERD(*source.chopped_word);
00191   if (source.rebuild_word != NULL)
00192     rebuild_word = new TWERD(*source.rebuild_word);
00193   // TODO(rays) Do we ever need to copy the seam_array?
00194   blob_row = source.blob_row;
00195   denorm = source.denorm;
00196   if (source.box_word != NULL)
00197     box_word = new tesseract::BoxWord(*source.box_word);
00198   best_state = source.best_state;
00199   correct_text = source.correct_text;
00200   blob_widths = source.blob_widths;
00201   blob_gaps = source.blob_gaps;
00202   // None of the uses of operator= require the ratings matrix to be copied,
00203   // so don't as it would be really slow.
00204 
00205   // Copy the cooked choices.
00206   WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices));
00207   WERD_CHOICE_IT wc_dest_it(&best_choices);
00208   for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
00209     const WERD_CHOICE *choice = wc_it.data();
00210     wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
00211   }
00212   if (!wc_dest_it.empty()) {
00213     wc_dest_it.move_to_first();
00214     best_choice = wc_dest_it.data();
00215   } else {
00216     best_choice = NULL;
00217   }
00218 
00219   if (source.raw_choice != NULL) {
00220     raw_choice = new WERD_CHOICE(*source.raw_choice);
00221   } else {
00222     raw_choice = NULL;
00223   }
00224   if (source.ep_choice != NULL) {
00225     ep_choice = new WERD_CHOICE(*source.ep_choice);
00226   } else {
00227     ep_choice = NULL;
00228   }
00229   reject_map = source.reject_map;
00230   combination = source.combination;
00231   part_of_combo = source.part_of_combo;
00232   CopySimpleFields(source);
00233   if (source.blamer_bundle != NULL) {
00234     blamer_bundle =  new BlamerBundle(*(source.blamer_bundle));
00235   }
00236   return *this;
00237 }
00238 
00239 // Copies basic fields that don't involve pointers that might be useful
00240 // to copy when making one WERD_RES from another.
00241 void WERD_RES::CopySimpleFields(const WERD_RES& source) {
00242   tess_failed = source.tess_failed;
00243   tess_accepted = source.tess_accepted;
00244   tess_would_adapt = source.tess_would_adapt;
00245   done = source.done;
00246   unlv_crunch_mode = source.unlv_crunch_mode;
00247   small_caps = source.small_caps;
00248   odd_size = source.odd_size;
00249   italic = source.italic;
00250   bold = source.bold;
00251   fontinfo = source.fontinfo;
00252   fontinfo2 = source.fontinfo2;
00253   fontinfo_id_count = source.fontinfo_id_count;
00254   fontinfo_id2_count = source.fontinfo_id2_count;
00255   x_height = source.x_height;
00256   caps_height = source.caps_height;
00257   baseline_shift = source.baseline_shift;
00258   guessed_x_ht = source.guessed_x_ht;
00259   guessed_caps_ht = source.guessed_caps_ht;
00260   reject_spaces = source.reject_spaces;
00261   uch_set = source.uch_set;
00262   tesseract = source.tesseract;
00263 }
00264 
00265 // Initializes a blank (default constructed) WERD_RES from one that has
00266 // already been recognized.
00267 // Use SetupFor*Recognition afterwards to complete the setup and make
00268 // it ready for a retry recognition.
00269 void WERD_RES::InitForRetryRecognition(const WERD_RES& source) {
00270   word = source.word;
00271   CopySimpleFields(source);
00272   if (source.blamer_bundle != NULL) {
00273     blamer_bundle = new BlamerBundle();
00274     blamer_bundle->CopyTruth(*source.blamer_bundle);
00275   }
00276 }
00277 
00278 // Sets up the members used in recognition: bln_boxes, chopped_word,
00279 // seam_array, denorm.  Returns false if
00280 // the word is empty and sets up fake results.  If use_body_size is
00281 // true and row->body_size is set, then body_size will be used for
00282 // blob normalization instead of xheight + ascrise. This flag is for
00283 // those languages that are using CJK pitch model and thus it has to
00284 // be true if and only if tesseract->textord_use_cjk_fp_model is
00285 // true.
00286 // If allow_detailed_fx is true, the feature extractor will receive fine
00287 // precision outline information, allowing smoother features and better
00288 // features on low resolution images.
00289 // The norm_mode_hint sets the default mode for normalization in absence
00290 // of any of the above flags.
00291 // norm_box is used to override the word bounding box to determine the
00292 // normalization scale and offset.
00293 // Returns false if the word is empty and sets up fake results.
00294 bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in,
00295                                    tesseract::Tesseract* tess, Pix* pix,
00296                                    int norm_mode,
00297                                    const TBOX* norm_box,
00298                                    bool numeric_mode,
00299                                    bool use_body_size,
00300                                    bool allow_detailed_fx,
00301                                    ROW *row, const BLOCK* block) {
00302   tesseract::OcrEngineMode norm_mode_hint =
00303       static_cast<tesseract::OcrEngineMode>(norm_mode);
00304   tesseract = tess;
00305   POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
00306   if ((norm_mode_hint != tesseract::OEM_CUBE_ONLY &&
00307        word->cblob_list()->empty()) || (pb != NULL && !pb->IsText())) {
00308     // Empty words occur when all the blobs have been moved to the rej_blobs
00309     // list, which seems to occur frequently in junk.
00310     SetupFake(unicharset_in);
00311     word->set_flag(W_REP_CHAR, false);
00312     return false;
00313   }
00314   ClearResults();
00315   SetupWordScript(unicharset_in);
00316   chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
00317   float word_xheight = use_body_size && row != NULL && row->body_size() > 0.0f
00318                      ? row->body_size() : x_height;
00319   chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
00320                             word_xheight, baseline_shift, numeric_mode,
00321                             norm_mode_hint, norm_box, &denorm);
00322   blob_row = row;
00323   SetupBasicsFromChoppedWord(unicharset_in);
00324   SetupBlamerBundle();
00325   int num_blobs = chopped_word->NumBlobs();
00326   ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
00327   tess_failed = false;
00328   return true;
00329 }
00330 
00331 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
00332 // accumulators from a made chopped word.  We presume the fields are already
00333 // empty.
00334 void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {
00335   bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);
00336   start_seam_list(chopped_word, &seam_array);
00337   SetupBlobWidthsAndGaps();
00338   ClearWordChoices();
00339 }
00340 
00341 // Sets up the members used in recognition for an empty recognition result:
00342 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
00343 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
00344   ClearResults();
00345   SetupWordScript(unicharset_in);
00346   chopped_word = new TWERD;
00347   rebuild_word = new TWERD;
00348   bln_boxes = new tesseract::BoxWord;
00349   box_word = new tesseract::BoxWord;
00350   int blob_count = word->cblob_list()->length();
00351   if (blob_count > 0) {
00352     BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count];
00353     // For non-text blocks, just pass any blobs through to the box_word
00354     // and call the word failed with a fake classification.
00355     C_BLOB_IT b_it(word->cblob_list());
00356     int blob_id = 0;
00357     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00358       TBOX box = b_it.data()->bounding_box();
00359       box_word->InsertBox(box_word->length(), box);
00360       fake_choices[blob_id++] = new BLOB_CHOICE;
00361     }
00362     FakeClassifyWord(blob_count, fake_choices);
00363     delete [] fake_choices;
00364   } else {
00365     WERD_CHOICE* word = new WERD_CHOICE(&unicharset_in);
00366     word->make_bad();
00367     LogNewRawChoice(word);
00368     // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
00369     LogNewCookedChoice(1, false, word);
00370   }
00371   tess_failed = true;
00372   done = true;
00373 }
00374 
00375 void WERD_RES::SetupWordScript(const UNICHARSET& uch) {
00376   uch_set = &uch;
00377   int script = uch.default_sid();
00378   word->set_script_id(script);
00379   word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
00380   word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
00381 }
00382 
00383 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
00384 void WERD_RES::SetupBlamerBundle() {
00385   if (blamer_bundle != NULL) {
00386     blamer_bundle->SetupNormTruthWord(denorm);
00387   }
00388 }
00389 
00390 // Computes the blob_widths and blob_gaps from the chopped_word.
00391 void WERD_RES::SetupBlobWidthsAndGaps() {
00392   blob_widths.truncate(0);
00393   blob_gaps.truncate(0);
00394   int num_blobs = chopped_word->NumBlobs();
00395   for (int b = 0; b < num_blobs; ++b) {
00396     TBLOB *blob = chopped_word->blobs[b];
00397     TBOX box = blob->bounding_box();
00398     blob_widths.push_back(box.width());
00399     if (b + 1 < num_blobs) {
00400       blob_gaps.push_back(
00401           chopped_word->blobs[b + 1]->bounding_box().left() - box.right());
00402     }
00403   }
00404 }
00405 
00406 // Updates internal data to account for a new SEAM (chop) at the given
00407 // blob_number. Fixes the ratings matrix and states in the choices, as well
00408 // as the blob widths and gaps.
00409 void WERD_RES::InsertSeam(int blob_number, SEAM* seam) {
00410   // Insert the seam into the SEAMS array.
00411   seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
00412   seam_array.insert(seam, blob_number);
00413   if (ratings != NULL) {
00414     // Expand the ratings matrix.
00415     ratings = ratings->ConsumeAndMakeBigger(blob_number);
00416     // Fix all the segmentation states.
00417     if (raw_choice != NULL)
00418       raw_choice->UpdateStateForSplit(blob_number);
00419     WERD_CHOICE_IT wc_it(&best_choices);
00420     for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
00421       WERD_CHOICE* choice = wc_it.data();
00422       choice->UpdateStateForSplit(blob_number);
00423     }
00424     SetupBlobWidthsAndGaps();
00425   }
00426 }
00427 
00428 // Returns true if all the word choices except the first have adjust_factors
00429 // worse than the given threshold.
00430 bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {
00431   // The choices are not changed by this iteration.
00432   WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
00433   for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
00434     WERD_CHOICE* choice = wc_it.data();
00435     if (choice->adjust_factor() <= threshold)
00436       return false;
00437   }
00438   return true;
00439 }
00440 
00441 // Returns true if the current word is ambiguous (by number of answers or
00442 // by dangerous ambigs.)
00443 bool WERD_RES::IsAmbiguous() {
00444   return !best_choices.singleton() || best_choice->dangerous_ambig_found();
00445 }
00446 
00447 // Returns true if the ratings matrix size matches the sum of each of the
00448 // segmentation states.
00449 bool WERD_RES::StatesAllValid() {
00450   int ratings_dim = ratings->dimension();
00451   if (raw_choice->TotalOfStates() != ratings_dim) {
00452     tprintf("raw_choice has total of states = %d vs ratings dim of %d\n",
00453             raw_choice->TotalOfStates(), ratings_dim);
00454     return false;
00455   }
00456   WERD_CHOICE_IT it(&best_choices);
00457   int index = 0;
00458   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
00459     WERD_CHOICE* choice = it.data();
00460     if (choice->TotalOfStates() != ratings_dim) {
00461       tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n",
00462               choice->TotalOfStates(), ratings_dim);
00463       return false;
00464     }
00465   }
00466   return true;
00467 }
00468 
00469 // Prints a list of words found if debug is true or the word result matches
00470 // the word_to_debug.
00471 void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
00472   if (debug ||
00473       (word_to_debug != NULL && *word_to_debug != '\0' && best_choice != NULL &&
00474        best_choice->unichar_string() == STRING(word_to_debug))) {
00475     if (raw_choice != NULL)
00476       raw_choice->print("\nBest Raw Choice");
00477 
00478     WERD_CHOICE_IT it(&best_choices);
00479     int index = 0;
00480     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
00481       WERD_CHOICE* choice = it.data();
00482       STRING label;
00483       label.add_str_int("\nCooked Choice #", index);
00484       choice->print(label.string());
00485     }
00486   }
00487 }
00488 
00489 // Prints the top choice along with the accepted/done flags.
00490 void WERD_RES::DebugTopChoice(const char* msg) const {
00491   tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
00492           tess_accepted, tess_would_adapt, done);
00493   if (best_choice == NULL)
00494     tprintf("<Null choice>\n");
00495   else
00496     best_choice->print(msg);
00497 }
00498 
00499 // Removes from best_choices all choices which are not within a reasonable
00500 // range of the best choice.
00501 // TODO(rays) incorporate the information used here into the params training
00502 // re-ranker, in place of this heuristic that is based on the previous
00503 // adjustment factor.
00504 void WERD_RES::FilterWordChoices(int debug_level) {
00505   if (best_choice == NULL || best_choices.singleton())
00506     return;
00507 
00508   if (debug_level >= 2)
00509     best_choice->print("\nFiltering against best choice");
00510   WERD_CHOICE_IT it(&best_choices);
00511   int index = 0;
00512   for (it.forward(); !it.at_first(); it.forward(), ++index) {
00513     WERD_CHOICE* choice = it.data();
00514     float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
00515                                             choice->adjust_factor());
00516     // i, j index the blob choice in choice, best_choice.
00517     // chunk is an index into the chopped_word blobs (AKA chunks).
00518     // Since the two words may use different segmentations of the chunks, we
00519     // iterate over the chunks to find out whether a comparable blob
00520     // classification is much worse than the best result.
00521     int i = 0, j = 0, chunk = 0;
00522     // Each iteration of the while deals with 1 chunk. On entry choice_chunk
00523     // and best_chunk are the indices of the first chunk in the NEXT blob,
00524     // i.e. we don't have to increment i, j while chunk < choice_chunk and
00525     // best_chunk respectively.
00526     int choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
00527     while (i < choice->length() && j < best_choice->length()) {
00528       if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
00529           choice->certainty(i) - best_choice->certainty(j) < threshold) {
00530         if (debug_level >= 2) {
00531           STRING label;
00532           label.add_str_int("\nDiscarding bad choice #", index);
00533           choice->print(label.string());
00534           tprintf("i %d j %d Chunk %d Choice->Blob[i].Certainty %.4g"
00535               " BestChoice->ChunkCertainty[Chunk] %g Threshold %g\n",
00536               i, j, chunk, choice->certainty(i),
00537               best_choice->certainty(j), threshold);
00538         }
00539         delete it.extract();
00540         break;
00541       }
00542       ++chunk;
00543       // If needed, advance choice_chunk to keep up with chunk.
00544       while (choice_chunk < chunk && ++i < choice->length())
00545         choice_chunk += choice->state(i);
00546       // If needed, advance best_chunk to keep up with chunk.
00547       while (best_chunk < chunk && ++j < best_choice->length())
00548         best_chunk += best_choice->state(j);
00549     }
00550   }
00551 }
00552 
00553 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
00554                                          float min_rating,
00555                                          float max_rating,
00556                                          float rating_margin,
00557                                          float* thresholds) {
00558   int chunk = 0;
00559   int end_chunk = best_choice->state(0);
00560   int end_raw_chunk = raw_choice->state(0);
00561   int raw_blob = 0;
00562   for (int i = 0; i < best_choice->length(); i++, thresholds++) {
00563     float avg_rating = 0.0f;
00564     int num_error_chunks = 0;
00565 
00566     // For each chunk in best choice blob i, count non-matching raw results.
00567     while (chunk < end_chunk) {
00568       if (chunk >= end_raw_chunk) {
00569         ++raw_blob;
00570         end_raw_chunk += raw_choice->state(raw_blob);
00571       }
00572       if (best_choice->unichar_id(i) !=
00573           raw_choice->unichar_id(raw_blob)) {
00574         avg_rating += raw_choice->certainty(raw_blob);
00575         ++num_error_chunks;
00576       }
00577       ++chunk;
00578     }
00579 
00580     if (num_error_chunks > 0) {
00581       avg_rating /= num_error_chunks;
00582       *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
00583     } else {
00584       *thresholds = max_rating;
00585     }
00586 
00587     if (*thresholds > max_rating)
00588       *thresholds = max_rating;
00589     if (*thresholds < min_rating)
00590       *thresholds = min_rating;
00591   }
00592 }
00593 
00594 // Saves a copy of the word_choice if it has the best unadjusted rating.
00595 // Returns true if the word_choice was the new best.
00596 bool WERD_RES::LogNewRawChoice(WERD_CHOICE* word_choice) {
00597   if (raw_choice == NULL || word_choice->rating() < raw_choice->rating()) {
00598     delete raw_choice;
00599     raw_choice = new WERD_CHOICE(*word_choice);
00600     raw_choice->set_permuter(TOP_CHOICE_PERM);
00601     return true;
00602   }
00603   return false;
00604 }
00605 
00606 // Consumes word_choice by adding it to best_choices, (taking ownership) if
00607 // the certainty for word_choice is some distance of the best choice in
00608 // best_choices, or by deleting the word_choice and returning false.
00609 // The best_choices list is kept in sorted order by rating. Duplicates are
00610 // removed, and the list is kept no longer than max_num_choices in length.
00611 // Returns true if the word_choice is still a valid pointer.
00612 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
00613                                   WERD_CHOICE* word_choice) {
00614   if (best_choice != NULL) {
00615     // Throw out obviously bad choices to save some work.
00616     // TODO(rays) Get rid of this! This piece of code produces different
00617     // results according to the order in which words are found, which is an
00618     // undesirable behavior. It would be better to keep all the choices and
00619     // prune them later when more information is available.
00620     float max_certainty_delta =
00621         StopperAmbigThreshold(best_choice->adjust_factor(),
00622                               word_choice->adjust_factor());
00623     if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
00624       max_certainty_delta = -kStopperAmbiguityThresholdOffset;
00625     if (word_choice->certainty() - best_choice->certainty() <
00626         max_certainty_delta) {
00627       if (debug) {
00628         STRING bad_string;
00629         word_choice->string_and_lengths(&bad_string, NULL);
00630         tprintf("Discarding choice \"%s\" with an overly low certainty"
00631                 " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
00632                 bad_string.string(), word_choice->certainty(),
00633                 best_choice->certainty(),
00634                 max_certainty_delta + best_choice->certainty());
00635       }
00636       delete word_choice;
00637       return false;
00638     }
00639   }
00640 
00641   // Insert in the list in order of increasing rating, but knock out worse
00642   // string duplicates.
00643   WERD_CHOICE_IT it(&best_choices);
00644   const STRING& new_str = word_choice->unichar_string();
00645   bool inserted = false;
00646   int num_choices = 0;
00647   if (!it.empty()) {
00648     do {
00649       WERD_CHOICE* choice = it.data();
00650       if (choice->rating() > word_choice->rating() && !inserted) {
00651         // Time to insert.
00652         it.add_before_stay_put(word_choice);
00653         inserted = true;
00654         if (num_choices == 0)
00655           best_choice = word_choice;  // This is the new best.
00656         ++num_choices;
00657       }
00658       if (choice->unichar_string() == new_str) {
00659         if (inserted) {
00660           // New is better.
00661           delete it.extract();
00662         } else {
00663           // Old is better.
00664           if (debug) {
00665             tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
00666                     new_str.string(), word_choice->rating(), choice->rating());
00667           }
00668           delete word_choice;
00669           return false;
00670         }
00671       } else {
00672         ++num_choices;
00673         if (num_choices > max_num_choices)
00674           delete it.extract();
00675       }
00676       it.forward();
00677     } while (!it.at_first());
00678   }
00679   if (!inserted && num_choices < max_num_choices) {
00680     it.add_to_end(word_choice);
00681     inserted = true;
00682     if (num_choices == 0)
00683       best_choice = word_choice;  // This is the new best.
00684   }
00685   if (debug) {
00686     if (inserted)
00687       tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
00688     else
00689       tprintf("Poor");
00690     word_choice->print(" Word Choice");
00691   }
00692   if (!inserted) {
00693     delete word_choice;
00694     return false;
00695   }
00696   return true;
00697 }
00698 
00699 
// Transfers ownership of the object pointed to by *src to *dest: whatever
// *dest owned before is deleted, *dest takes over the pointer, and *src is
// set to NULL.
template<class T> static void MovePointerData(T** dest, T**src) {
  delete *dest;
  T* const taken = *src;
  *dest = taken;
  *src = NULL;
}
00707 
00708 // Prints a brief list of all the best choices.
00709 void WERD_RES::PrintBestChoices() const {
00710   STRING alternates_str;
00711   WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
00712   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00713     if (!it.at_first()) alternates_str += "\", \"";
00714     alternates_str += it.data()->unichar_string();
00715   }
00716   tprintf("Alternates for \"%s\": {\"%s\"}\n",
00717           best_choice->unichar_string().string(), alternates_str.string());
00718 }
00719 
00720 // Returns the sum of the widths of the blob between start_blob and last_blob
00721 // inclusive.
00722 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) {
00723   int result = 0;
00724   for (int b = start_blob; b <= last_blob; ++b) {
00725     result += blob_widths[b];
00726     if (b < last_blob)
00727       result += blob_gaps[b];
00728   }
00729   return result;
00730 }
00731 // Returns the width of a gap between the specified blob and the next one.
00732 int WERD_RES::GetBlobsGap(int blob_index) {
00733   if (blob_index < 0 || blob_index >= blob_gaps.size())
00734     return 0;
00735   return blob_gaps[blob_index];
00736 }
00737 
00738 // Returns the BLOB_CHOICE corresponding to the given index in the
00739 // best choice word taken from the appropriate cell in the ratings MATRIX.
00740 // Borrowed pointer, so do not delete. May return NULL if there is no
00741 // BLOB_CHOICE matching the unichar_id at the given index.
00742 BLOB_CHOICE* WERD_RES::GetBlobChoice(int index) const {
00743   if (index < 0 || index >= best_choice->length()) return NULL;
00744   BLOB_CHOICE_LIST* choices = GetBlobChoices(index);
00745   return FindMatchingChoice(best_choice->unichar_id(index), choices);
00746 }
00747 
00748 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
00749 // best choice word taken from the appropriate cell in the ratings MATRIX.
00750 // Borrowed pointer, so do not delete.
00751 BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const {
00752   return best_choice->blob_choices(index, ratings);
00753 }
00754 
00755 // Moves the results fields from word to this. This takes ownership of all
00756 // the data, so src can be destructed.
00757 void WERD_RES::ConsumeWordResults(WERD_RES* word) {
00758   denorm = word->denorm;
00759   blob_row = word->blob_row;
00760   MovePointerData(&chopped_word, &word->chopped_word);
00761   MovePointerData(&rebuild_word, &word->rebuild_word);
00762   MovePointerData(&box_word, &word->box_word);
00763   seam_array.delete_data_pointers();
00764   seam_array = word->seam_array;
00765   word->seam_array.clear();
00766   best_state.move(&word->best_state);
00767   correct_text.move(&word->correct_text);
00768   blob_widths.move(&word->blob_widths);
00769   blob_gaps.move(&word->blob_gaps);
00770   if (ratings != NULL) ratings->delete_matrix_pointers();
00771   MovePointerData(&ratings, &word->ratings);
00772   best_choice = word->best_choice;
00773   MovePointerData(&raw_choice, &word->raw_choice);
00774   best_choices.clear();
00775   WERD_CHOICE_IT wc_it(&best_choices);
00776   wc_it.add_list_after(&word->best_choices);
00777   reject_map = word->reject_map;
00778   if (word->blamer_bundle != NULL) {
00779     assert(blamer_bundle != NULL);
00780     blamer_bundle->CopyResults(*(word->blamer_bundle));
00781   }
00782   CopySimpleFields(*word);
00783 }
00784 
00785 // Replace the best choice and rebuild box word.
00786 // choice must be from the current best_choices list.
00787 void WERD_RES::ReplaceBestChoice(WERD_CHOICE* choice) {
00788   best_choice = choice;
00789   RebuildBestState();
00790   SetupBoxWord();
00791   // Make up a fake reject map of the right length to keep the
00792   // rejection pass happy.
00793   reject_map.initialise(best_state.length());
00794   done = tess_accepted = tess_would_adapt = true;
00795   SetScriptPositions();
00796 }
00797 
00798 // Builds the rebuild_word and sets the best_state from the chopped_word and
00799 // the best_choice->state.
00800 void WERD_RES::RebuildBestState() {
00801   ASSERT_HOST(best_choice != NULL);
00802   if (rebuild_word != NULL)
00803     delete rebuild_word;
00804   rebuild_word = new TWERD;
00805   if (seam_array.empty())
00806     start_seam_list(chopped_word, &seam_array);
00807   best_state.truncate(0);
00808   int start = 0;
00809   for (int i = 0; i < best_choice->length(); ++i) {
00810     int length = best_choice->state(i);
00811     best_state.push_back(length);
00812     if (length > 1) {
00813       SEAM::JoinPieces(seam_array, chopped_word->blobs, start,
00814                        start + length - 1);
00815     }
00816     TBLOB* blob = chopped_word->blobs[start];
00817     rebuild_word->blobs.push_back(new TBLOB(*blob));
00818     if (length > 1) {
00819       SEAM::BreakPieces(seam_array, chopped_word->blobs, start,
00820                         start + length - 1);
00821     }
00822     start += length;
00823   }
00824 }
00825 
00826 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
00827 // Also sets up the output box_word.
00828 void WERD_RES::CloneChoppedToRebuild() {
00829   if (rebuild_word != NULL)
00830     delete rebuild_word;
00831   rebuild_word = new TWERD(*chopped_word);
00832   SetupBoxWord();
00833   int word_len = box_word->length();
00834   best_state.reserve(word_len);
00835   correct_text.reserve(word_len);
00836   for (int i = 0; i < word_len; ++i) {
00837     best_state.push_back(1);
00838     correct_text.push_back(STRING(""));
00839   }
00840 }
00841 
00842 // Sets/replaces the box_word with one made from the rebuild_word.
00843 void WERD_RES::SetupBoxWord() {
00844   if (box_word != NULL)
00845     delete box_word;
00846   rebuild_word->ComputeBoundingBoxes();
00847   box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word);
00848   box_word->ClipToOriginalWord(denorm.block(), word);
00849 }
00850 
00851 // Sets up the script positions in the output best_choice using the best_choice
00852 // to get the unichars, and the unicharset to get the target positions.
00853 void WERD_RES::SetScriptPositions() {
00854   best_choice->SetScriptPositions(small_caps, chopped_word);
00855 }
00856 // Sets all the blobs in all the words (raw choice and best choices) to be
00857 // the given position. (When a sub/superscript is recognized as a separate
00858 // word, it falls victim to the rule that a whole word cannot be sub or
00859 // superscript, so this function overrides that problem.)
00860 void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) {
00861   raw_choice->SetAllScriptPositions(position);
00862   WERD_CHOICE_IT wc_it(&best_choices);
00863   for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
00864     wc_it.data()->SetAllScriptPositions(position);
00865 }
00866 
00867 // Classifies the word with some already-calculated BLOB_CHOICEs.
00868 // The choices are an array of blob_count pointers to BLOB_CHOICE,
00869 // providing a single classifier result for each blob.
00870 // The BLOB_CHOICEs are consumed and the word takes ownership.
00871 // The number of blobs in the box_word must match blob_count.
00872 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
00873   // Setup the WERD_RES.
00874   ASSERT_HOST(box_word != NULL);
00875   ASSERT_HOST(blob_count == box_word->length());
00876   ClearWordChoices();
00877   ClearRatings();
00878   ratings = new MATRIX(blob_count, 1);
00879   for (int c = 0; c < blob_count; ++c) {
00880     BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST;
00881     BLOB_CHOICE_IT choice_it(choice_list);
00882     choice_it.add_after_then_move(choices[c]);
00883     ratings->put(c, c, choice_list);
00884   }
00885   FakeWordFromRatings();
00886   reject_map.initialise(blob_count);
00887   done = true;
00888 }
00889 
00890 // Creates a WERD_CHOICE for the word using the top choices from the leading
00891 // diagonal of the ratings matrix.
00892 void WERD_RES::FakeWordFromRatings() {
00893   int num_blobs = ratings->dimension();
00894   WERD_CHOICE* word_choice = new WERD_CHOICE(uch_set, num_blobs);
00895   word_choice->set_permuter(TOP_CHOICE_PERM);
00896   for (int b = 0; b < num_blobs; ++b) {
00897     UNICHAR_ID unichar_id = UNICHAR_SPACE;
00898     float rating = MAX_INT32;
00899     float certainty = -MAX_INT32;
00900     BLOB_CHOICE_LIST* choices = ratings->get(b, b);
00901     if (choices != NULL && !choices->empty()) {
00902       BLOB_CHOICE_IT bc_it(choices);
00903       BLOB_CHOICE* choice = bc_it.data();
00904       unichar_id = choice->unichar_id();
00905       rating = choice->rating();
00906       certainty = choice->certainty();
00907     }
00908     word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
00909                                                    certainty);
00910   }
00911   LogNewRawChoice(word_choice);
00912   // Ownership of word_choice taken by word here.
00913   LogNewCookedChoice(1, false, word_choice);
00914 }
00915 
00916 // Copies the best_choice strings to the correct_text for adaption/training.
00917 void WERD_RES::BestChoiceToCorrectText() {
00918   correct_text.clear();
00919   ASSERT_HOST(best_choice != NULL);
00920   for (int i = 0; i < best_choice->length(); ++i) {
00921     UNICHAR_ID choice_id = best_choice->unichar_id(i);
00922     const char* blob_choice = uch_set->id_to_unichar(choice_id);
00923     correct_text.push_back(STRING(blob_choice));
00924   }
00925 }
00926 
00927 // Merges 2 adjacent blobs in the result if the permanent callback
00928 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
00929 // callback box_cb is NULL or returns true, setting the merged blob
00930 // result to the class returned from class_cb.
00931 // Returns true if anything was merged.
00932 bool WERD_RES::ConditionalBlobMerge(
00933     TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
00934     TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb) {
00935   ASSERT_HOST(best_choice->length() == 0 || ratings != NULL);
00936   bool modified = false;
00937   for (int i = 0; i + 1 < best_choice->length(); ++i) {
00938     UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i),
00939                                       best_choice->unichar_id(i+1));
00940     if (new_id != INVALID_UNICHAR_ID &&
00941         (box_cb == NULL || box_cb->Run(box_word->BlobBox(i),
00942                                        box_word->BlobBox(i + 1)))) {
00943       // Raw choice should not be fixed.
00944       best_choice->set_unichar_id(new_id, i);
00945       modified = true;
00946       MergeAdjacentBlobs(i);
00947       const MATRIX_COORD& coord = best_choice->MatrixCoord(i);
00948       if (!coord.Valid(*ratings)) {
00949         ratings->IncreaseBandSize(coord.row + 1 - coord.col);
00950       }
00951       BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i);
00952       if (FindMatchingChoice(new_id, blob_choices) == NULL) {
00953         // Insert a fake result.
00954         BLOB_CHOICE* blob_choice = new BLOB_CHOICE;
00955         blob_choice->set_unichar_id(new_id);
00956         BLOB_CHOICE_IT bc_it(blob_choices);
00957         bc_it.add_before_then_move(blob_choice);
00958       }
00959     }
00960   }
00961   delete class_cb;
00962   delete box_cb;
00963   return modified;
00964 }
00965 
00966 // Merges 2 adjacent blobs in the result (index and index+1) and corrects
00967 // all the data to account for the change.
00968 void WERD_RES::MergeAdjacentBlobs(int index) {
00969   if (reject_map.length() == best_choice->length())
00970     reject_map.remove_pos(index);
00971   best_choice->remove_unichar_id(index + 1);
00972   rebuild_word->MergeBlobs(index, index + 2);
00973   box_word->MergeBoxes(index, index + 2);
00974   if (index + 1 < best_state.length()) {
00975     best_state[index] += best_state[index + 1];
00976     best_state.remove(index + 1);
00977   }
00978 }
00979 
00980 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
00981 // training data.
00982 
// Utility function for fix_quotes.
// Returns non-zero if the character at signed_str, whose UTF-8 encoding is
// length bytes long, is a simple quote: the 1-byte ' or `, or one of the
// 3-byte sequences e2 80 98 / e2 80 99. Returns 0 otherwise.
static int is_simple_quote(const char* signed_str, int length) {
  // Compare as unsigned bytes so values above 0x7f match on any platform.
  const unsigned char* bytes =
      reinterpret_cast<const unsigned char*>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 bytes curved quotes share their first two bytes.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
        (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return 0;
}
00999 
01000 // Callback helper for fix_quotes returns a double quote if both
01001 // arguments are quote, otherwise INVALID_UNICHAR_ID.
01002 UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
01003   const char *ch = uch_set->id_to_unichar(id1);
01004   const char *next_ch = uch_set->id_to_unichar(id2);
01005   if (is_simple_quote(ch, strlen(ch)) &&
01006       is_simple_quote(next_ch, strlen(next_ch)))
01007     return uch_set->unichar_to_id("\"");
01008   return INVALID_UNICHAR_ID;
01009 }
01010 
01011 // Change pairs of quotes to double quotes.
01012 void WERD_RES::fix_quotes() {
01013   if (!uch_set->contains_unichar("\"") ||
01014       !uch_set->get_enabled(uch_set->unichar_to_id("\"")))
01015     return;  // Don't create it if it is disallowed.
01016 
01017   ConditionalBlobMerge(
01018       NewPermanentTessCallback(this, &WERD_RES::BothQuotes),
01019       NULL);
01020 }
01021 
01022 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
01023 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
01024 UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
01025   const char *ch = uch_set->id_to_unichar(id1);
01026   const char *next_ch = uch_set->id_to_unichar(id2);
01027   if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
01028       (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
01029     return uch_set->unichar_to_id("-");
01030   return INVALID_UNICHAR_ID;
01031 }
01032 
01033 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
01034 // (assuming both on the same textline, are in order and a chopped em dash.)
01035 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
01036   return box1.right() >= box2.left();
01037 }
01038 
01039 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
01040 // Typically a long dash which has been segmented.
01041 void WERD_RES::fix_hyphens() {
01042   if (!uch_set->contains_unichar("-") ||
01043       !uch_set->get_enabled(uch_set->unichar_to_id("-")))
01044     return;  // Don't create it if it is disallowed.
01045 
01046   ConditionalBlobMerge(
01047       NewPermanentTessCallback(this, &WERD_RES::BothHyphens),
01048       NewPermanentTessCallback(this, &WERD_RES::HyphenBoxesOverlap));
01049 }
01050 
01051 // Callback helper for merge_tess_fails returns a space if both
01052 // arguments are space, otherwise INVALID_UNICHAR_ID.
01053 UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
01054   if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
01055     return id1;
01056   else
01057     return INVALID_UNICHAR_ID;
01058 }
01059 
01060 // Change pairs of tess failures to a single one
01061 void WERD_RES::merge_tess_fails() {
01062   if (ConditionalBlobMerge(
01063       NewPermanentTessCallback(this, &WERD_RES::BothSpaces), NULL)) {
01064     int len = best_choice->length();
01065     ASSERT_HOST(reject_map.length() == len);
01066     ASSERT_HOST(box_word->length() == len);
01067   }
01068 }
01069 
01070 // Returns true if the collection of count pieces, starting at start, are all
01071 // natural connected components, ie there are no real chops involved.
01072 bool WERD_RES::PiecesAllNatural(int start, int count) const {
01073   // all seams must have no splits.
01074   for (int index = start; index < start + count - 1; ++index) {
01075     if (index >= 0 && index < seam_array.size()) {
01076       SEAM* seam = seam_array[index];
01077       if (seam != NULL && seam->HasAnySplits()) return false;
01078     }
01079   }
01080   return true;
01081 }
01082 
01083 
01084 WERD_RES::~WERD_RES () {
01085   Clear();
01086 }
01087 
01088 void WERD_RES::InitNonPointers() {
01089   tess_failed = FALSE;
01090   tess_accepted = FALSE;
01091   tess_would_adapt = FALSE;
01092   done = FALSE;
01093   unlv_crunch_mode = CR_NONE;
01094   small_caps = false;
01095   odd_size = false;
01096   italic = FALSE;
01097   bold = FALSE;
01098   // The fontinfos and tesseract count as non-pointers as they point to
01099   // data owned elsewhere.
01100   fontinfo = NULL;
01101   fontinfo2 = NULL;
01102   tesseract = NULL;
01103   fontinfo_id_count = 0;
01104   fontinfo_id2_count = 0;
01105   x_height = 0.0;
01106   caps_height = 0.0;
01107   baseline_shift = 0.0f;
01108   guessed_x_ht = TRUE;
01109   guessed_caps_ht = TRUE;
01110   combination = FALSE;
01111   part_of_combo = FALSE;
01112   reject_spaces = FALSE;
01113 }
01114 
01115 void WERD_RES::InitPointers() {
01116   word = NULL;
01117   bln_boxes = NULL;
01118   blob_row = NULL;
01119   uch_set = NULL;
01120   chopped_word = NULL;
01121   rebuild_word = NULL;
01122   box_word = NULL;
01123   ratings = NULL;
01124   best_choice = NULL;
01125   raw_choice = NULL;
01126   ep_choice = NULL;
01127   blamer_bundle = NULL;
01128 }
01129 
01130 void WERD_RES::Clear() {
01131   if (word != NULL && combination) {
01132     delete word;
01133   }
01134   word = NULL;
01135   delete blamer_bundle;
01136   blamer_bundle = NULL;
01137   ClearResults();
01138 }
01139 
01140 void WERD_RES::ClearResults() {
01141   done = false;
01142   fontinfo = NULL;
01143   fontinfo2 = NULL;
01144   fontinfo_id_count = 0;
01145   fontinfo_id2_count = 0;
01146   if (bln_boxes != NULL) {
01147     delete bln_boxes;
01148     bln_boxes = NULL;
01149   }
01150   blob_row = NULL;
01151   if (chopped_word != NULL) {
01152     delete chopped_word;
01153     chopped_word = NULL;
01154   }
01155   if (rebuild_word != NULL) {
01156     delete rebuild_word;
01157     rebuild_word = NULL;
01158   }
01159   if (box_word != NULL) {
01160     delete box_word;
01161     box_word = NULL;
01162   }
01163   best_state.clear();
01164   correct_text.clear();
01165   seam_array.delete_data_pointers();
01166   seam_array.clear();
01167   blob_widths.clear();
01168   blob_gaps.clear();
01169   ClearRatings();
01170   ClearWordChoices();
01171   if (blamer_bundle != NULL) blamer_bundle->ClearResults();
01172 }
01173 void WERD_RES::ClearWordChoices() {
01174   best_choice = NULL;
01175   if (raw_choice != NULL) {
01176     delete raw_choice;
01177     raw_choice = NULL;
01178   }
01179   best_choices.clear();
01180   if (ep_choice != NULL) {
01181     delete ep_choice;
01182     ep_choice = NULL;
01183   }
01184 }
01185 void WERD_RES::ClearRatings() {
01186   if (ratings != NULL) {
01187     ratings->delete_matrix_pointers();
01188     delete ratings;
01189     ratings = NULL;
01190   }
01191 }
01192 
01193 
01194 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const {
01195   return word_res == other.word_res &&
01196       row_res == other.row_res &&
01197       block_res == other.block_res;
01198 }
01199 
01200 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
01201   ASSERT_HOST(page_res == other.page_res);
01202   if (other.block_res == NULL) {
01203     // other points to the end of the page.
01204     if (block_res == NULL)
01205       return 0;
01206     return -1;
01207   }
01208   if (block_res == NULL) {
01209     return 1; // we point to the end of the page.
01210   }
01211   if (block_res == other.block_res) {
01212     if (other.row_res == NULL || row_res == NULL) {
01213       // this should only happen if we hit an image block.
01214       return 0;
01215     }
01216     if (row_res == other.row_res) {
01217       // we point to the same block and row.
01218       ASSERT_HOST(other.word_res != NULL && word_res != NULL);
01219       if (word_res == other.word_res) {
01220         // we point to the same word!
01221         return 0;
01222       }
01223 
01224       WERD_RES_IT word_res_it(&row_res->word_res_list);
01225       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
01226            word_res_it.forward()) {
01227         if (word_res_it.data() == word_res) {
01228           return -1;
01229         } else if (word_res_it.data() == other.word_res) {
01230           return 1;
01231         }
01232       }
01233       ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
01234     }
01235 
01236     // we both point to the same block, but different rows.
01237     ROW_RES_IT row_res_it(&block_res->row_res_list);
01238     for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
01239          row_res_it.forward()) {
01240       if (row_res_it.data() == row_res) {
01241         return -1;
01242       } else if (row_res_it.data() == other.row_res) {
01243         return 1;
01244       }
01245     }
01246     ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
01247   }
01248 
01249   // We point to different blocks.
01250   BLOCK_RES_IT block_res_it(&page_res->block_res_list);
01251   for (block_res_it.mark_cycle_pt();
01252        !block_res_it.cycled_list(); block_res_it.forward()) {
01253     if (block_res_it.data() == block_res) {
01254       return -1;
01255     } else if (block_res_it.data() == other.block_res) {
01256       return 1;
01257     }
01258   }
01259   // Shouldn't happen...
01260   ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
01261   return 0;
01262 }
01263 
01264 // Inserts the new_word as a combination owned by a corresponding WERD_RES
01265 // before the current position. The simple fields of the WERD_RES are copied
01266 // from clone_res and the resulting WERD_RES is returned for further setup
01267 // with best_choice etc.
01268 WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
01269                                              WERD* new_word) {
01270   // Make a WERD_RES for the new_word.
01271   WERD_RES* new_res = new WERD_RES(new_word);
01272   new_res->CopySimpleFields(clone_res);
01273   new_res->combination = true;
01274   // Insert into the appropriate place in the ROW_RES.
01275   WERD_RES_IT wr_it(&row()->word_res_list);
01276   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
01277     WERD_RES* word = wr_it.data();
01278     if (word == word_res)
01279       break;
01280   }
01281   ASSERT_HOST(!wr_it.cycled_list());
01282   wr_it.add_before_then_move(new_res);
01283   if (wr_it.at_first()) {
01284     // This is the new first word, so reset the member iterator so it
01285     // detects the cycled_list state correctly.
01286     ResetWordIterator();
01287   }
01288   return new_res;
01289 }
01290 
01291 // Helper computes the boundaries between blobs in the word. The blob bounds
01292 // are likely very poor, if they come from LSTM, where it only outputs the
01293 // character at one pixel within it, so we find the midpoints between them.
01294 static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
01295                             GenericVector<int>* blob_ends) {
01296   C_BLOB_IT blob_it(word.word->cblob_list());
01297   for (int i = 0; i < word.best_state.size(); ++i) {
01298     int length = word.best_state[i];
01299     // Get the bounding box of the fake blobs
01300     TBOX blob_box = blob_it.data()->bounding_box();
01301     blob_it.forward();
01302     for (int b = 1; b < length; ++b) {
01303       blob_box += blob_it.data()->bounding_box();
01304       blob_it.forward();
01305     }
01306     // This blob_box is crap, so for now we are only looking for the
01307     // boundaries between them.
01308     int blob_end = MAX_INT32;
01309     if (!blob_it.at_first() || next_word_blobs != NULL) {
01310       if (blob_it.at_first())
01311         blob_it.set_to_list(next_word_blobs);
01312       blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
01313     }
01314     blob_ends->push_back(blob_end);
01315   }
01316 }
01317 
01318 // Replaces the current WERD/WERD_RES with the given words. The given words
01319 // contain fake blobs that indicate the position of the characters. These are
01320 // replaced with real blobs from the current word as much as possible.
01321 void PAGE_RES_IT::ReplaceCurrentWord(
01322     tesseract::PointerVector<WERD_RES>* words) {
01323   if (words->empty()) {
01324     DeleteCurrentWord();
01325     return;
01326   }
01327   WERD_RES* input_word = word();
01328   // Set the BOL/EOL flags on the words from the input word.
01329   if (input_word->word->flag(W_BOL)) {
01330     (*words)[0]->word->set_flag(W_BOL, true);
01331   } else {
01332     (*words)[0]->word->set_blanks(1);
01333   }
01334   words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
01335 
01336   // Move the blobs from the input word to the new set of words.
01337   // If the input word_res is a combination, then the replacements will also be
01338   // combinations, and will own their own words. If the input word_res is not a
01339   // combination, then the final replacements will not be either, (although it
01340   // is allowed for the input words to be combinations) and their words
01341   // will get put on the row list. This maintains the ownership rules.
01342   WERD_IT w_it(row()->row->word_list());
01343   if (!input_word->combination) {
01344     for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
01345       WERD* word = w_it.data();
01346       if (word == input_word->word)
01347         break;
01348     }
01349     // w_it is now set to the input_word's word.
01350     ASSERT_HOST(!w_it.cycled_list());
01351   }
01352   // Insert into the appropriate place in the ROW_RES.
01353   WERD_RES_IT wr_it(&row()->word_res_list);
01354   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
01355     WERD_RES* word = wr_it.data();
01356     if (word == input_word)
01357       break;
01358   }
01359   ASSERT_HOST(!wr_it.cycled_list());
01360   // Since we only have an estimate of the bounds between blobs, use the blob
01361   // x-middle as the determiner of where to put the blobs
01362   C_BLOB_IT src_b_it(input_word->word->cblob_list());
01363   src_b_it.sort(&C_BLOB::SortByXMiddle);
01364   C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
01365   rej_b_it.sort(&C_BLOB::SortByXMiddle);
01366   for (int w = 0; w < words->size(); ++w) {
01367     WERD_RES* word_w = (*words)[w];
01368     // Compute blob boundaries.
01369     GenericVector<int> blob_ends;
01370     C_BLOB_LIST* next_word_blobs =
01371         w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : NULL;
01372     ComputeBlobEnds(*word_w, next_word_blobs, &blob_ends);
01373     // Delete the fake blobs on the current word.
01374     word_w->word->cblob_list()->clear();
01375     C_BLOB_IT dest_it(word_w->word->cblob_list());
01376     // Build the box word as we move the blobs.
01377     tesseract::BoxWord* box_word = new tesseract::BoxWord;
01378     for (int i = 0; i < blob_ends.size(); ++i) {
01379       int end_x = blob_ends[i];
01380       TBOX blob_box;
01381       // Add the blobs up to end_x.
01382       while (!src_b_it.empty() &&
01383              src_b_it.data()->bounding_box().x_middle() < end_x) {
01384         blob_box += src_b_it.data()->bounding_box();
01385         dest_it.add_after_then_move(src_b_it.extract());
01386         src_b_it.forward();
01387       }
01388       while (!rej_b_it.empty() &&
01389              rej_b_it.data()->bounding_box().x_middle() < end_x) {
01390         blob_box += rej_b_it.data()->bounding_box();
01391         dest_it.add_after_then_move(rej_b_it.extract());
01392         rej_b_it.forward();
01393       }
01394       // Clip to the previously computed bounds. Although imperfectly accurate,
01395       // it is good enough, and much more complicated to determine where else
01396       // to clip.
01397       if (i > 0 && blob_box.left() < blob_ends[i - 1])
01398         blob_box.set_left(blob_ends[i - 1]);
01399       if (blob_box.right() > end_x)
01400         blob_box.set_right(end_x);
01401       box_word->InsertBox(i, blob_box);
01402     }
01403     // Fix empty boxes. If a very joined blob sits over multiple characters,
01404     // then we will have some empty boxes from using the middle, so look for
01405     // overlaps.
01406     for (int i = 0; i < box_word->length(); ++i) {
01407       TBOX box = box_word->BlobBox(i);
01408       if (box.null_box()) {
01409         // Nothing has its middle in the bounds of this blob, so use anything
01410         // that overlaps.
01411         for (dest_it.mark_cycle_pt(); !dest_it.cycled_list();
01412              dest_it.forward()) {
01413           TBOX blob_box = dest_it.data()->bounding_box();
01414           if (blob_box.left() < blob_ends[i] &&
01415               (i == 0 || blob_box.right() >= blob_ends[i - 1])) {
01416             if (i > 0 && blob_box.left() < blob_ends[i - 1])
01417               blob_box.set_left(blob_ends[i - 1]);
01418             if (blob_box.right() > blob_ends[i])
01419               blob_box.set_right(blob_ends[i]);
01420             box_word->ChangeBox(i, blob_box);
01421             break;
01422           }
01423         }
01424       }
01425     }
01426     delete word_w->box_word;
01427     word_w->box_word = box_word;
01428     if (!input_word->combination) {
01429       // Insert word_w->word into the ROW. It doesn't own its word, so the
01430       // ROW needs to own it.
01431       w_it.add_before_stay_put(word_w->word);
01432       word_w->combination = false;
01433     }
01434     (*words)[w] = NULL;  // We are taking ownership.
01435     wr_it.add_before_stay_put(word_w);
01436   }
01437   // We have taken ownership of the words.
01438   words->clear();
01439   // Delete the current word, which has been replaced. We could just call
01440   // DeleteCurrentWord, but that would iterate both lists again, and we know
01441   // we are already in the right place.
01442   if (!input_word->combination)
01443     delete w_it.extract();
01444   delete wr_it.extract();
01445   ResetWordIterator();
01446 }
01447 
01448 // Deletes the current WERD_RES and its underlying WERD.
01449 void PAGE_RES_IT::DeleteCurrentWord() {
01450   // Check that this word is as we expect. part_of_combos are NEVER iterated
01451   // by the normal iterator, so we should never be trying to delete them.
01452   ASSERT_HOST(!word_res->part_of_combo);
01453   if (!word_res->combination) {
01454     // Combinations own their own word, so we won't find the word on the
01455     // row's word_list, but it is legitimate to try to delete them.
01456     // Delete word from the ROW when not a combination.
01457     WERD_IT w_it(row()->row->word_list());
01458     for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
01459       if (w_it.data() == word_res->word) {
01460         break;
01461       }
01462     }
01463     ASSERT_HOST(!w_it.cycled_list());
01464     delete w_it.extract();
01465   }
01466   // Remove the WERD_RES for the new_word.
01467   // Remove the WORD_RES from the ROW_RES.
01468   WERD_RES_IT wr_it(&row()->word_res_list);
01469   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
01470     if (wr_it.data() == word_res) {
01471       word_res = NULL;
01472       break;
01473     }
01474   }
01475   ASSERT_HOST(!wr_it.cycled_list());
01476   delete wr_it.extract();
01477   ResetWordIterator();
01478 }
01479 
01480 // Makes the current word a fuzzy space if not already fuzzy. Updates
01481 // corresponding part of combo if required.
01482 void PAGE_RES_IT::MakeCurrentWordFuzzy() {
01483   WERD* real_word = word_res->word;
01484   if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
01485     real_word->set_flag(W_FUZZY_SP, true);
01486     if (word_res->combination) {
01487       // The next word should be the corresponding part of combo, but we have
01488       // already stepped past it, so find it by search.
01489       WERD_RES_IT wr_it(&row()->word_res_list);
01490       for (wr_it.mark_cycle_pt();
01491            !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
01492       }
01493       wr_it.forward();
01494       ASSERT_HOST(wr_it.data()->part_of_combo);
01495       real_word = wr_it.data()->word;
01496       ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
01497                   !real_word->flag(W_FUZZY_NON));
01498       real_word->set_flag(W_FUZZY_SP, true);
01499     }
01500   }
01501 }
01502 
01503 /*************************************************************************
01504  * PAGE_RES_IT::restart_page
01505  *
01506  * Set things up at the start of the page
01507  *************************************************************************/
01508 
01509 WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
01510   block_res_it.set_to_list(&page_res->block_res_list);
01511   block_res_it.mark_cycle_pt();
01512   prev_block_res = NULL;
01513   prev_row_res = NULL;
01514   prev_word_res = NULL;
01515   block_res = NULL;
01516   row_res = NULL;
01517   word_res = NULL;
01518   next_block_res = NULL;
01519   next_row_res = NULL;
01520   next_word_res = NULL;
01521   internal_forward(true, empty_ok);
01522   return internal_forward(false, empty_ok);
01523 }
01524 
01525 // Recovers from operations on the current word, such as in InsertCloneWord
01526 // and DeleteCurrentWord.
01527 // Resets the word_res_it so that it is one past the next_word_res, as
01528 // it should be after internal_forward. If next_row_res != row_res,
01529 // then the next_word_res is in the next row, so there is no need to do
01530 // anything to word_res_it, but it is still a good idea to reset the pointers
01531 // word_res and prev_word_res, which are still in the current row.
01532 void PAGE_RES_IT::ResetWordIterator() {
01533   if (row_res == next_row_res) {
01534     // Reset the member iterator so it can move forward and detect the
01535     // cycled_list state correctly.
01536     word_res_it.move_to_first();
01537     for (word_res_it.mark_cycle_pt();
01538          !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
01539          word_res_it.forward()) {
01540       if (!word_res_it.data()->part_of_combo) {
01541         if (prev_row_res == row_res) prev_word_res = word_res;
01542         word_res = word_res_it.data();
01543       }
01544     }
01545     ASSERT_HOST(!word_res_it.cycled_list());
01546     word_res_it.forward();
01547   } else {
01548     // word_res_it is OK, but reset word_res and prev_word_res if needed.
01549     WERD_RES_IT wr_it(&row_res->word_res_list);
01550     for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
01551       if (!wr_it.data()->part_of_combo) {
01552         if (prev_row_res == row_res) prev_word_res = word_res;
01553         word_res = wr_it.data();
01554       }
01555     }
01556   }
01557 }
01558 
01559 /*************************************************************************
01560  * PAGE_RES_IT::internal_forward
01561  *
01562  * Find the next word on the page. If empty_ok is true, then non-text blocks
01563  * and text blocks with no text are visited as if they contain a single
01564  * imaginary word in a single imaginary row. (word() and row() both return NULL
01565  * in such a block and the return value is NULL.)
01566  * If empty_ok is false, the old behaviour is maintained. Each real word
01567  * is visited and empty and non-text blocks and rows are skipped.
01568  * new_block is used to initialize the iterators for a new block.
01569  * The iterator maintains pointers to block, row and word for the previous,
01570  * current and next words.  These are correct, regardless of block/row
01571  * boundaries. NULL values denote start and end of the page.
01572  *************************************************************************/
01573 
01574 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
01575   bool new_row = false;
01576 
01577   prev_block_res = block_res;
01578   prev_row_res = row_res;
01579   prev_word_res = word_res;
01580   block_res = next_block_res;
01581   row_res = next_row_res;
01582   word_res = next_word_res;
01583   next_block_res = NULL;
01584   next_row_res = NULL;
01585   next_word_res = NULL;
01586 
01587   while (!block_res_it.cycled_list()) {
01588     if (new_block) {
01589       new_block = false;
01590       row_res_it.set_to_list(&block_res_it.data()->row_res_list);
01591       row_res_it.mark_cycle_pt();
01592       if (row_res_it.empty() && empty_ok) {
01593         next_block_res = block_res_it.data();
01594         break;
01595       }
01596       new_row = true;
01597     }
01598     while (!row_res_it.cycled_list()) {
01599       if (new_row) {
01600         new_row = false;
01601         word_res_it.set_to_list(&row_res_it.data()->word_res_list);
01602         word_res_it.mark_cycle_pt();
01603       }
01604       // Skip any part_of_combo words.
01605       while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
01606         word_res_it.forward();
01607       if (!word_res_it.cycled_list()) {
01608         next_block_res = block_res_it.data();
01609         next_row_res = row_res_it.data();
01610         next_word_res = word_res_it.data();
01611         word_res_it.forward();
01612         goto foundword;
01613       }
01614       // end of row reached
01615       row_res_it.forward();
01616       new_row = true;
01617     }
01618     // end of block reached
01619     block_res_it.forward();
01620     new_block = true;
01621   }
01622   foundword:
01623   // Update prev_word_best_choice pointer.
01624   if (page_res != NULL && page_res->prev_word_best_choice != NULL) {
01625     *page_res->prev_word_best_choice =
01626       (new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice;
01627   }
01628   return word_res;
01629 }
01630 
01631 /*************************************************************************
01632  * PAGE_RES_IT::restart_row()
01633  *
01634  * Move to the beginning (leftmost word) of the current row.
01635  *************************************************************************/
01636 WERD_RES *PAGE_RES_IT::restart_row() {
01637   ROW_RES *row = this->row();
01638   if (!row) return NULL;
01639   for (restart_page(); this->row() != row; forward()) {
01640     // pass
01641   }
01642   return word();
01643 }
01644 
01645 /*************************************************************************
01646  * PAGE_RES_IT::forward_paragraph
01647  *
01648  * Move to the beginning of the next paragraph, allowing empty blocks.
01649  *************************************************************************/
01650 
01651 WERD_RES *PAGE_RES_IT::forward_paragraph() {
01652   while (block_res == next_block_res &&
01653          (next_row_res != NULL && next_row_res->row != NULL &&
01654           row_res->row->para() == next_row_res->row->para())) {
01655     internal_forward(false, true);
01656   }
01657   return internal_forward(false, true);
01658 }
01659 
01660 /*************************************************************************
01661  * PAGE_RES_IT::forward_block
01662  *
01663  * Move to the beginning of the next block, allowing empty blocks.
01664  *************************************************************************/
01665 
01666 WERD_RES *PAGE_RES_IT::forward_block() {
01667   while (block_res == next_block_res) {
01668     internal_forward(false, true);
01669   }
01670   return internal_forward(false, true);
01671 }
01672 
01673 void PAGE_RES_IT::rej_stat_word() {
01674   inT16 chars_in_word;
01675   inT16 rejects_in_word = 0;
01676 
01677   chars_in_word = word_res->reject_map.length ();
01678   page_res->char_count += chars_in_word;
01679   block_res->char_count += chars_in_word;
01680   row_res->char_count += chars_in_word;
01681 
01682   rejects_in_word = word_res->reject_map.reject_count ();
01683 
01684   page_res->rej_count += rejects_in_word;
01685   block_res->rej_count += rejects_in_word;
01686   row_res->rej_count += rejects_in_word;
01687   if (chars_in_word == rejects_in_word)
01688     row_res->whole_word_rej_count += rejects_in_word;
01689 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines