|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: pageres.cpp (Formerly page_res.c) 00003 * Description: Hierarchy of results classes from PAGE_RES to WERD_RES 00004 * and an iterator class to iterate over the words. 00005 * Main purposes: 00006 * Easy way to iterate over the words without a 3-nested loop. 00007 * Holds data used during word recognition. 00008 * Holds information about alternative spacing paths. 00009 * Author: Phil Cheatle 00010 * Created: Tue Sep 22 08:42:49 BST 1992 00011 * 00012 * (C) Copyright 1992, Hewlett-Packard Ltd. 00013 ** Licensed under the Apache License, Version 2.0 (the "License"); 00014 ** you may not use this file except in compliance with the License. 00015 ** You may obtain a copy of the License at 00016 ** http://www.apache.org/licenses/LICENSE-2.0 00017 ** Unless required by applicable law or agreed to in writing, software 00018 ** distributed under the License is distributed on an "AS IS" BASIS, 00019 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00020 ** See the License for the specific language governing permissions and 00021 ** limitations under the License. 00022 * 00023 **********************************************************************/ 00024 #include <stdlib.h> 00025 #ifdef __UNIX__ 00026 #include <assert.h> 00027 #endif 00028 #include "blamer.h" 00029 #include "pageres.h" 00030 #include "blobs.h" 00031 00032 ELISTIZE (BLOCK_RES) 00033 CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES) 00034 00035 // Gain factor for computing thresholds that determine the ambiguity of a word. 00036 static const double kStopperAmbiguityThresholdGain = 8.0; 00037 // Constant offset for computing thresholds that determine the ambiguity of a 00038 // word. 00039 static const double kStopperAmbiguityThresholdOffset = 1.5; 00040 // Max number of broken pieces to associate. 00041 const int kWordrecMaxNumJoinChunks = 4; 00042 // Max ratio of word box height to line size to allow it to be processed as 00043 // a line with other words. 00044 const double kMaxWordSizeRatio = 1.25; 00045 // Max ratio of line box height to line size to allow a new word to be added. 00046 const double kMaxLineSizeRatio = 1.25; 00047 // Max ratio of word gap to line size to allow a new word to be added. 00048 const double kMaxWordGapRatio = 2.0; 00049 00050 // Computes and returns a threshold of certainty difference used to determine 00051 // which words to keep, based on the adjustment factors of the two words. 00052 // TODO(rays) This is horrible. Replace with an enhance params training model. 00053 static double StopperAmbigThreshold(double f1, double f2) { 00054 return (f2 - f1) * kStopperAmbiguityThresholdGain - 00055 kStopperAmbiguityThresholdOffset; 00056 } 00057 00058 /************************************************************************* 00059 * PAGE_RES::PAGE_RES 00060 * 00061 * Constructor for page results 00062 *************************************************************************/ 00063 PAGE_RES::PAGE_RES( 00064 bool merge_similar_words, 00065 BLOCK_LIST *the_block_list, 00066 WERD_CHOICE **prev_word_best_choice_ptr) { 00067 Init(); 00068 BLOCK_IT block_it(the_block_list); 00069 BLOCK_RES_IT block_res_it(&block_res_list); 00070 for (block_it.mark_cycle_pt(); 00071 !block_it.cycled_list(); block_it.forward()) { 00072 block_res_it.add_to_end(new BLOCK_RES(merge_similar_words, 00073 block_it.data())); 00074 } 00075 prev_word_best_choice = prev_word_best_choice_ptr; 00076 } 00077 00078 /************************************************************************* 00079 * BLOCK_RES::BLOCK_RES 00080 * 00081 * Constructor for BLOCK results 00082 *************************************************************************/ 00083 00084 BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) { 00085 ROW_IT row_it (the_block->row_list ()); 00086 ROW_RES_IT row_res_it(&row_res_list); 00087 00088 char_count = 0; 00089 rej_count = 0; 00090 font_class = -1; //not assigned 00091 x_height = -1.0; 00092 font_assigned = FALSE; 00093 bold = FALSE; 00094 italic = FALSE; 00095 row_count = 0; 00096 00097 block = the_block; 00098 00099 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00100 row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data())); 00101 } 00102 } 00103 00104 /************************************************************************* 00105 * ROW_RES::ROW_RES 00106 * 00107 * Constructor for ROW results 00108 *************************************************************************/ 00109 00110 ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) { 00111 WERD_IT word_it(the_row->word_list()); 00112 WERD_RES_IT word_res_it(&word_res_list); 00113 WERD_RES *combo = NULL; // current combination of fuzzies 00114 WERD *copy_word; 00115 00116 char_count = 0; 00117 rej_count = 0; 00118 whole_word_rej_count = 0; 00119 00120 row = the_row; 00121 bool add_next_word = false; 00122 TBOX union_box; 00123 float line_height = the_row->x_height() + the_row->ascenders() - 00124 the_row->descenders(); 00125 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00126 WERD_RES* word_res = new WERD_RES(word_it.data()); 00127 word_res->x_height = the_row->x_height(); 00128 if (add_next_word) { 00129 ASSERT_HOST(combo != NULL); 00130 // We are adding this word to the combination. 00131 word_res->part_of_combo = TRUE; 00132 combo->copy_on(word_res); 00133 } else if (merge_similar_words) { 00134 union_box = word_res->word->bounding_box(); 00135 add_next_word = !word_res->word->flag(W_REP_CHAR) && 00136 union_box.height() <= line_height * kMaxWordSizeRatio; 00137 word_res->odd_size = !add_next_word; 00138 } 00139 WERD* next_word = word_it.data_relative(1); 00140 if (merge_similar_words) { 00141 if (add_next_word && !next_word->flag(W_REP_CHAR)) { 00142 // Next word will be added on if all of the following are true: 00143 // Not a rep char. 00144 // Box height small enough. 00145 // Union box height small enough. 00146 // Horizontal gap small enough. 00147 TBOX next_box = next_word->bounding_box(); 00148 int prev_right = union_box.right(); 00149 union_box += next_box; 00150 if (next_box.height() > line_height * kMaxWordSizeRatio || 00151 union_box.height() > line_height * kMaxLineSizeRatio || 00152 next_box.left() > prev_right + line_height * kMaxWordGapRatio) { 00153 add_next_word = false; 00154 } 00155 } 00156 next_word->set_flag(W_FUZZY_NON, add_next_word); 00157 } else { 00158 add_next_word = next_word->flag(W_FUZZY_NON); 00159 } 00160 if (add_next_word) { 00161 if (combo == NULL) { 00162 copy_word = new WERD; 00163 *copy_word = *(word_it.data()); // deep copy 00164 combo = new WERD_RES(copy_word); 00165 combo->x_height = the_row->x_height(); 00166 combo->combination = TRUE; 00167 word_res_it.add_to_end(combo); 00168 } 00169 word_res->part_of_combo = TRUE; 00170 } else { 00171 combo = NULL; 00172 } 00173 word_res_it.add_to_end(word_res); 00174 } 00175 } 00176 00177 00178 WERD_RES& WERD_RES::operator=(const WERD_RES & source) { 00179 this->ELIST_LINK::operator=(source); 00180 Clear(); 00181 if (source.combination) { 00182 word = new WERD; 00183 *word = *(source.word); // deep copy 00184 } else { 00185 word = source.word; // pt to same word 00186 } 00187 if (source.bln_boxes != NULL) 00188 bln_boxes = new tesseract::BoxWord(*source.bln_boxes); 00189 if (source.chopped_word != NULL) 00190 chopped_word = new TWERD(*source.chopped_word); 00191 if (source.rebuild_word != NULL) 00192 rebuild_word = new TWERD(*source.rebuild_word); 00193 // TODO(rays) Do we ever need to copy the seam_array? 00194 blob_row = source.blob_row; 00195 denorm = source.denorm; 00196 if (source.box_word != NULL) 00197 box_word = new tesseract::BoxWord(*source.box_word); 00198 best_state = source.best_state; 00199 correct_text = source.correct_text; 00200 blob_widths = source.blob_widths; 00201 blob_gaps = source.blob_gaps; 00202 // None of the uses of operator= require the ratings matrix to be copied, 00203 // so don't as it would be really slow. 00204 00205 // Copy the cooked choices. 00206 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices)); 00207 WERD_CHOICE_IT wc_dest_it(&best_choices); 00208 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) { 00209 const WERD_CHOICE *choice = wc_it.data(); 00210 wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice)); 00211 } 00212 if (!wc_dest_it.empty()) { 00213 wc_dest_it.move_to_first(); 00214 best_choice = wc_dest_it.data(); 00215 } else { 00216 best_choice = NULL; 00217 } 00218 00219 if (source.raw_choice != NULL) { 00220 raw_choice = new WERD_CHOICE(*source.raw_choice); 00221 } else { 00222 raw_choice = NULL; 00223 } 00224 if (source.ep_choice != NULL) { 00225 ep_choice = new WERD_CHOICE(*source.ep_choice); 00226 } else { 00227 ep_choice = NULL; 00228 } 00229 reject_map = source.reject_map; 00230 combination = source.combination; 00231 part_of_combo = source.part_of_combo; 00232 CopySimpleFields(source); 00233 if (source.blamer_bundle != NULL) { 00234 blamer_bundle = new BlamerBundle(*(source.blamer_bundle)); 00235 } 00236 return *this; 00237 } 00238 00239 // Copies basic fields that don't involve pointers that might be useful 00240 // to copy when making one WERD_RES from another. 00241 void WERD_RES::CopySimpleFields(const WERD_RES& source) { 00242 tess_failed = source.tess_failed; 00243 tess_accepted = source.tess_accepted; 00244 tess_would_adapt = source.tess_would_adapt; 00245 done = source.done; 00246 unlv_crunch_mode = source.unlv_crunch_mode; 00247 small_caps = source.small_caps; 00248 odd_size = source.odd_size; 00249 italic = source.italic; 00250 bold = source.bold; 00251 fontinfo = source.fontinfo; 00252 fontinfo2 = source.fontinfo2; 00253 fontinfo_id_count = source.fontinfo_id_count; 00254 fontinfo_id2_count = source.fontinfo_id2_count; 00255 x_height = source.x_height; 00256 caps_height = source.caps_height; 00257 baseline_shift = source.baseline_shift; 00258 guessed_x_ht = source.guessed_x_ht; 00259 guessed_caps_ht = source.guessed_caps_ht; 00260 reject_spaces = source.reject_spaces; 00261 uch_set = source.uch_set; 00262 tesseract = source.tesseract; 00263 } 00264 00265 // Initializes a blank (default constructed) WERD_RES from one that has 00266 // already been recognized. 00267 // Use SetupFor*Recognition afterwards to complete the setup and make 00268 // it ready for a retry recognition. 00269 void WERD_RES::InitForRetryRecognition(const WERD_RES& source) { 00270 word = source.word; 00271 CopySimpleFields(source); 00272 if (source.blamer_bundle != NULL) { 00273 blamer_bundle = new BlamerBundle(); 00274 blamer_bundle->CopyTruth(*source.blamer_bundle); 00275 } 00276 } 00277 00278 // Sets up the members used in recognition: bln_boxes, chopped_word, 00279 // seam_array, denorm. Returns false if 00280 // the word is empty and sets up fake results. If use_body_size is 00281 // true and row->body_size is set, then body_size will be used for 00282 // blob normalization instead of xheight + ascrise. This flag is for 00283 // those languages that are using CJK pitch model and thus it has to 00284 // be true if and only if tesseract->textord_use_cjk_fp_model is 00285 // true. 00286 // If allow_detailed_fx is true, the feature extractor will receive fine 00287 // precision outline information, allowing smoother features and better 00288 // features on low resolution images. 00289 // The norm_mode_hint sets the default mode for normalization in absence 00290 // of any of the above flags. 00291 // norm_box is used to override the word bounding box to determine the 00292 // normalization scale and offset. 00293 // Returns false if the word is empty and sets up fake results. 00294 bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in, 00295 tesseract::Tesseract* tess, Pix* pix, 00296 int norm_mode, 00297 const TBOX* norm_box, 00298 bool numeric_mode, 00299 bool use_body_size, 00300 bool allow_detailed_fx, 00301 ROW *row, const BLOCK* block) { 00302 tesseract::OcrEngineMode norm_mode_hint = 00303 static_cast<tesseract::OcrEngineMode>(norm_mode); 00304 tesseract = tess; 00305 POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL; 00306 if ((norm_mode_hint != tesseract::OEM_CUBE_ONLY && 00307 word->cblob_list()->empty()) || (pb != NULL && !pb->IsText())) { 00308 // Empty words occur when all the blobs have been moved to the rej_blobs 00309 // list, which seems to occur frequently in junk. 00310 SetupFake(unicharset_in); 00311 word->set_flag(W_REP_CHAR, false); 00312 return false; 00313 } 00314 ClearResults(); 00315 SetupWordScript(unicharset_in); 00316 chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word); 00317 float word_xheight = use_body_size && row != NULL && row->body_size() > 0.0f 00318 ? row->body_size() : x_height; 00319 chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE), 00320 word_xheight, baseline_shift, numeric_mode, 00321 norm_mode_hint, norm_box, &denorm); 00322 blob_row = row; 00323 SetupBasicsFromChoppedWord(unicharset_in); 00324 SetupBlamerBundle(); 00325 int num_blobs = chopped_word->NumBlobs(); 00326 ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks); 00327 tess_failed = false; 00328 return true; 00329 } 00330 00331 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty 00332 // accumulators from a made chopped word. We presume the fields are already 00333 // empty. 00334 void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) { 00335 bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word); 00336 start_seam_list(chopped_word, &seam_array); 00337 SetupBlobWidthsAndGaps(); 00338 ClearWordChoices(); 00339 } 00340 00341 // Sets up the members used in recognition for an empty recognition result: 00342 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. 00343 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) { 00344 ClearResults(); 00345 SetupWordScript(unicharset_in); 00346 chopped_word = new TWERD; 00347 rebuild_word = new TWERD; 00348 bln_boxes = new tesseract::BoxWord; 00349 box_word = new tesseract::BoxWord; 00350 int blob_count = word->cblob_list()->length(); 00351 if (blob_count > 0) { 00352 BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count]; 00353 // For non-text blocks, just pass any blobs through to the box_word 00354 // and call the word failed with a fake classification. 00355 C_BLOB_IT b_it(word->cblob_list()); 00356 int blob_id = 0; 00357 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00358 TBOX box = b_it.data()->bounding_box(); 00359 box_word->InsertBox(box_word->length(), box); 00360 fake_choices[blob_id++] = new BLOB_CHOICE; 00361 } 00362 FakeClassifyWord(blob_count, fake_choices); 00363 delete [] fake_choices; 00364 } else { 00365 WERD_CHOICE* word = new WERD_CHOICE(&unicharset_in); 00366 word->make_bad(); 00367 LogNewRawChoice(word); 00368 // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice. 00369 LogNewCookedChoice(1, false, word); 00370 } 00371 tess_failed = true; 00372 done = true; 00373 } 00374 00375 void WERD_RES::SetupWordScript(const UNICHARSET& uch) { 00376 uch_set = &uch; 00377 int script = uch.default_sid(); 00378 word->set_script_id(script); 00379 word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight()); 00380 word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid()); 00381 } 00382 00383 // Sets up the blamer_bundle if it is not null, using the initialized denorm. 00384 void WERD_RES::SetupBlamerBundle() { 00385 if (blamer_bundle != NULL) { 00386 blamer_bundle->SetupNormTruthWord(denorm); 00387 } 00388 } 00389 00390 // Computes the blob_widths and blob_gaps from the chopped_word. 00391 void WERD_RES::SetupBlobWidthsAndGaps() { 00392 blob_widths.truncate(0); 00393 blob_gaps.truncate(0); 00394 int num_blobs = chopped_word->NumBlobs(); 00395 for (int b = 0; b < num_blobs; ++b) { 00396 TBLOB *blob = chopped_word->blobs[b]; 00397 TBOX box = blob->bounding_box(); 00398 blob_widths.push_back(box.width()); 00399 if (b + 1 < num_blobs) { 00400 blob_gaps.push_back( 00401 chopped_word->blobs[b + 1]->bounding_box().left() - box.right()); 00402 } 00403 } 00404 } 00405 00406 // Updates internal data to account for a new SEAM (chop) at the given 00407 // blob_number. Fixes the ratings matrix and states in the choices, as well 00408 // as the blob widths and gaps. 00409 void WERD_RES::InsertSeam(int blob_number, SEAM* seam) { 00410 // Insert the seam into the SEAMS array. 00411 seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true); 00412 seam_array.insert(seam, blob_number); 00413 if (ratings != NULL) { 00414 // Expand the ratings matrix. 00415 ratings = ratings->ConsumeAndMakeBigger(blob_number); 00416 // Fix all the segmentation states. 00417 if (raw_choice != NULL) 00418 raw_choice->UpdateStateForSplit(blob_number); 00419 WERD_CHOICE_IT wc_it(&best_choices); 00420 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) { 00421 WERD_CHOICE* choice = wc_it.data(); 00422 choice->UpdateStateForSplit(blob_number); 00423 } 00424 SetupBlobWidthsAndGaps(); 00425 } 00426 } 00427 00428 // Returns true if all the word choices except the first have adjust_factors 00429 // worse than the given threshold. 00430 bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const { 00431 // The choices are not changed by this iteration. 00432 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices)); 00433 for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) { 00434 WERD_CHOICE* choice = wc_it.data(); 00435 if (choice->adjust_factor() <= threshold) 00436 return false; 00437 } 00438 return true; 00439 } 00440 00441 // Returns true if the current word is ambiguous (by number of answers or 00442 // by dangerous ambigs.) 00443 bool WERD_RES::IsAmbiguous() { 00444 return !best_choices.singleton() || best_choice->dangerous_ambig_found(); 00445 } 00446 00447 // Returns true if the ratings matrix size matches the sum of each of the 00448 // segmentation states. 00449 bool WERD_RES::StatesAllValid() { 00450 int ratings_dim = ratings->dimension(); 00451 if (raw_choice->TotalOfStates() != ratings_dim) { 00452 tprintf("raw_choice has total of states = %d vs ratings dim of %d\n", 00453 raw_choice->TotalOfStates(), ratings_dim); 00454 return false; 00455 } 00456 WERD_CHOICE_IT it(&best_choices); 00457 int index = 0; 00458 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) { 00459 WERD_CHOICE* choice = it.data(); 00460 if (choice->TotalOfStates() != ratings_dim) { 00461 tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n", 00462 choice->TotalOfStates(), ratings_dim); 00463 return false; 00464 } 00465 } 00466 return true; 00467 } 00468 00469 // Prints a list of words found if debug is true or the word result matches 00470 // the word_to_debug. 00471 void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) { 00472 if (debug || 00473 (word_to_debug != NULL && *word_to_debug != '\0' && best_choice != NULL && 00474 best_choice->unichar_string() == STRING(word_to_debug))) { 00475 if (raw_choice != NULL) 00476 raw_choice->print("\nBest Raw Choice"); 00477 00478 WERD_CHOICE_IT it(&best_choices); 00479 int index = 0; 00480 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) { 00481 WERD_CHOICE* choice = it.data(); 00482 STRING label; 00483 label.add_str_int("\nCooked Choice #", index); 00484 choice->print(label.string()); 00485 } 00486 } 00487 } 00488 00489 // Prints the top choice along with the accepted/done flags. 00490 void WERD_RES::DebugTopChoice(const char* msg) const { 00491 tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ", 00492 tess_accepted, tess_would_adapt, done); 00493 if (best_choice == NULL) 00494 tprintf("<Null choice>\n"); 00495 else 00496 best_choice->print(msg); 00497 } 00498 00499 // Removes from best_choices all choices which are not within a reasonable 00500 // range of the best choice. 00501 // TODO(rays) incorporate the information used here into the params training 00502 // re-ranker, in place of this heuristic that is based on the previous 00503 // adjustment factor. 00504 void WERD_RES::FilterWordChoices(int debug_level) { 00505 if (best_choice == NULL || best_choices.singleton()) 00506 return; 00507 00508 if (debug_level >= 2) 00509 best_choice->print("\nFiltering against best choice"); 00510 WERD_CHOICE_IT it(&best_choices); 00511 int index = 0; 00512 for (it.forward(); !it.at_first(); it.forward(), ++index) { 00513 WERD_CHOICE* choice = it.data(); 00514 float threshold = StopperAmbigThreshold(best_choice->adjust_factor(), 00515 choice->adjust_factor()); 00516 // i, j index the blob choice in choice, best_choice. 00517 // chunk is an index into the chopped_word blobs (AKA chunks). 00518 // Since the two words may use different segmentations of the chunks, we 00519 // iterate over the chunks to find out whether a comparable blob 00520 // classification is much worse than the best result. 00521 int i = 0, j = 0, chunk = 0; 00522 // Each iteration of the while deals with 1 chunk. On entry choice_chunk 00523 // and best_chunk are the indices of the first chunk in the NEXT blob, 00524 // i.e. we don't have to increment i, j while chunk < choice_chunk and 00525 // best_chunk respectively. 00526 int choice_chunk = choice->state(0), best_chunk = best_choice->state(0); 00527 while (i < choice->length() && j < best_choice->length()) { 00528 if (choice->unichar_id(i) != best_choice->unichar_id(j) && 00529 choice->certainty(i) - best_choice->certainty(j) < threshold) { 00530 if (debug_level >= 2) { 00531 STRING label; 00532 label.add_str_int("\nDiscarding bad choice #", index); 00533 choice->print(label.string()); 00534 tprintf("i %d j %d Chunk %d Choice->Blob[i].Certainty %.4g" 00535 " BestChoice->ChunkCertainty[Chunk] %g Threshold %g\n", 00536 i, j, chunk, choice->certainty(i), 00537 best_choice->certainty(j), threshold); 00538 } 00539 delete it.extract(); 00540 break; 00541 } 00542 ++chunk; 00543 // If needed, advance choice_chunk to keep up with chunk. 00544 while (choice_chunk < chunk && ++i < choice->length()) 00545 choice_chunk += choice->state(i); 00546 // If needed, advance best_chunk to keep up with chunk. 00547 while (best_chunk < chunk && ++j < best_choice->length()) 00548 best_chunk += best_choice->state(j); 00549 } 00550 } 00551 } 00552 00553 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale, 00554 float min_rating, 00555 float max_rating, 00556 float rating_margin, 00557 float* thresholds) { 00558 int chunk = 0; 00559 int end_chunk = best_choice->state(0); 00560 int end_raw_chunk = raw_choice->state(0); 00561 int raw_blob = 0; 00562 for (int i = 0; i < best_choice->length(); i++, thresholds++) { 00563 float avg_rating = 0.0f; 00564 int num_error_chunks = 0; 00565 00566 // For each chunk in best choice blob i, count non-matching raw results. 00567 while (chunk < end_chunk) { 00568 if (chunk >= end_raw_chunk) { 00569 ++raw_blob; 00570 end_raw_chunk += raw_choice->state(raw_blob); 00571 } 00572 if (best_choice->unichar_id(i) != 00573 raw_choice->unichar_id(raw_blob)) { 00574 avg_rating += raw_choice->certainty(raw_blob); 00575 ++num_error_chunks; 00576 } 00577 ++chunk; 00578 } 00579 00580 if (num_error_chunks > 0) { 00581 avg_rating /= num_error_chunks; 00582 *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin); 00583 } else { 00584 *thresholds = max_rating; 00585 } 00586 00587 if (*thresholds > max_rating) 00588 *thresholds = max_rating; 00589 if (*thresholds < min_rating) 00590 *thresholds = min_rating; 00591 } 00592 } 00593 00594 // Saves a copy of the word_choice if it has the best unadjusted rating. 00595 // Returns true if the word_choice was the new best. 00596 bool WERD_RES::LogNewRawChoice(WERD_CHOICE* word_choice) { 00597 if (raw_choice == NULL || word_choice->rating() < raw_choice->rating()) { 00598 delete raw_choice; 00599 raw_choice = new WERD_CHOICE(*word_choice); 00600 raw_choice->set_permuter(TOP_CHOICE_PERM); 00601 return true; 00602 } 00603 return false; 00604 } 00605 00606 // Consumes word_choice by adding it to best_choices, (taking ownership) if 00607 // the certainty for word_choice is some distance of the best choice in 00608 // best_choices, or by deleting the word_choice and returning false. 00609 // The best_choices list is kept in sorted order by rating. Duplicates are 00610 // removed, and the list is kept no longer than max_num_choices in length. 00611 // Returns true if the word_choice is still a valid pointer. 00612 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug, 00613 WERD_CHOICE* word_choice) { 00614 if (best_choice != NULL) { 00615 // Throw out obviously bad choices to save some work. 00616 // TODO(rays) Get rid of this! This piece of code produces different 00617 // results according to the order in which words are found, which is an 00618 // undesirable behavior. It would be better to keep all the choices and 00619 // prune them later when more information is available. 00620 float max_certainty_delta = 00621 StopperAmbigThreshold(best_choice->adjust_factor(), 00622 word_choice->adjust_factor()); 00623 if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) 00624 max_certainty_delta = -kStopperAmbiguityThresholdOffset; 00625 if (word_choice->certainty() - best_choice->certainty() < 00626 max_certainty_delta) { 00627 if (debug) { 00628 STRING bad_string; 00629 word_choice->string_and_lengths(&bad_string, NULL); 00630 tprintf("Discarding choice \"%s\" with an overly low certainty" 00631 " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n", 00632 bad_string.string(), word_choice->certainty(), 00633 best_choice->certainty(), 00634 max_certainty_delta + best_choice->certainty()); 00635 } 00636 delete word_choice; 00637 return false; 00638 } 00639 } 00640 00641 // Insert in the list in order of increasing rating, but knock out worse 00642 // string duplicates. 00643 WERD_CHOICE_IT it(&best_choices); 00644 const STRING& new_str = word_choice->unichar_string(); 00645 bool inserted = false; 00646 int num_choices = 0; 00647 if (!it.empty()) { 00648 do { 00649 WERD_CHOICE* choice = it.data(); 00650 if (choice->rating() > word_choice->rating() && !inserted) { 00651 // Time to insert. 00652 it.add_before_stay_put(word_choice); 00653 inserted = true; 00654 if (num_choices == 0) 00655 best_choice = word_choice; // This is the new best. 00656 ++num_choices; 00657 } 00658 if (choice->unichar_string() == new_str) { 00659 if (inserted) { 00660 // New is better. 00661 delete it.extract(); 00662 } else { 00663 // Old is better. 00664 if (debug) { 00665 tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n", 00666 new_str.string(), word_choice->rating(), choice->rating()); 00667 } 00668 delete word_choice; 00669 return false; 00670 } 00671 } else { 00672 ++num_choices; 00673 if (num_choices > max_num_choices) 00674 delete it.extract(); 00675 } 00676 it.forward(); 00677 } while (!it.at_first()); 00678 } 00679 if (!inserted && num_choices < max_num_choices) { 00680 it.add_to_end(word_choice); 00681 inserted = true; 00682 if (num_choices == 0) 00683 best_choice = word_choice; // This is the new best. 00684 } 00685 if (debug) { 00686 if (inserted) 00687 tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary"); 00688 else 00689 tprintf("Poor"); 00690 word_choice->print(" Word Choice"); 00691 } 00692 if (!inserted) { 00693 delete word_choice; 00694 return false; 00695 } 00696 return true; 00697 } 00698 00699 00700 // Simple helper moves the ownership of the pointer data from src to dest, 00701 // first deleting anything in dest, and nulling out src afterwards. 00702 template<class T> static void MovePointerData(T** dest, T**src) { 00703 delete *dest; 00704 *dest = *src; 00705 *src = NULL; 00706 } 00707 00708 // Prints a brief list of all the best choices. 00709 void WERD_RES::PrintBestChoices() const { 00710 STRING alternates_str; 00711 WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices)); 00712 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00713 if (!it.at_first()) alternates_str += "\", \""; 00714 alternates_str += it.data()->unichar_string(); 00715 } 00716 tprintf("Alternates for \"%s\": {\"%s\"}\n", 00717 best_choice->unichar_string().string(), alternates_str.string()); 00718 } 00719 00720 // Returns the sum of the widths of the blob between start_blob and last_blob 00721 // inclusive. 00722 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) { 00723 int result = 0; 00724 for (int b = start_blob; b <= last_blob; ++b) { 00725 result += blob_widths[b]; 00726 if (b < last_blob) 00727 result += blob_gaps[b]; 00728 } 00729 return result; 00730 } 00731 // Returns the width of a gap between the specified blob and the next one. 00732 int WERD_RES::GetBlobsGap(int blob_index) { 00733 if (blob_index < 0 || blob_index >= blob_gaps.size()) 00734 return 0; 00735 return blob_gaps[blob_index]; 00736 } 00737 00738 // Returns the BLOB_CHOICE corresponding to the given index in the 00739 // best choice word taken from the appropriate cell in the ratings MATRIX. 00740 // Borrowed pointer, so do not delete. May return NULL if there is no 00741 // BLOB_CHOICE matching the unichar_id at the given index. 00742 BLOB_CHOICE* WERD_RES::GetBlobChoice(int index) const { 00743 if (index < 0 || index >= best_choice->length()) return NULL; 00744 BLOB_CHOICE_LIST* choices = GetBlobChoices(index); 00745 return FindMatchingChoice(best_choice->unichar_id(index), choices); 00746 } 00747 00748 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the 00749 // best choice word taken from the appropriate cell in the ratings MATRIX. 00750 // Borrowed pointer, so do not delete. 00751 BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const { 00752 return best_choice->blob_choices(index, ratings); 00753 } 00754 00755 // Moves the results fields from word to this. This takes ownership of all 00756 // the data, so src can be destructed. 00757 void WERD_RES::ConsumeWordResults(WERD_RES* word) { 00758 denorm = word->denorm; 00759 blob_row = word->blob_row; 00760 MovePointerData(&chopped_word, &word->chopped_word); 00761 MovePointerData(&rebuild_word, &word->rebuild_word); 00762 MovePointerData(&box_word, &word->box_word); 00763 seam_array.delete_data_pointers(); 00764 seam_array = word->seam_array; 00765 word->seam_array.clear(); 00766 best_state.move(&word->best_state); 00767 correct_text.move(&word->correct_text); 00768 blob_widths.move(&word->blob_widths); 00769 blob_gaps.move(&word->blob_gaps); 00770 if (ratings != NULL) ratings->delete_matrix_pointers(); 00771 MovePointerData(&ratings, &word->ratings); 00772 best_choice = word->best_choice; 00773 MovePointerData(&raw_choice, &word->raw_choice); 00774 best_choices.clear(); 00775 WERD_CHOICE_IT wc_it(&best_choices); 00776 wc_it.add_list_after(&word->best_choices); 00777 reject_map = word->reject_map; 00778 if (word->blamer_bundle != NULL) { 00779 assert(blamer_bundle != NULL); 00780 blamer_bundle->CopyResults(*(word->blamer_bundle)); 00781 } 00782 CopySimpleFields(*word); 00783 } 00784 00785 // Replace the best choice and rebuild box word. 00786 // choice must be from the current best_choices list. 00787 void WERD_RES::ReplaceBestChoice(WERD_CHOICE* choice) { 00788 best_choice = choice; 00789 RebuildBestState(); 00790 SetupBoxWord(); 00791 // Make up a fake reject map of the right length to keep the 00792 // rejection pass happy. 00793 reject_map.initialise(best_state.length()); 00794 done = tess_accepted = tess_would_adapt = true; 00795 SetScriptPositions(); 00796 } 00797 00798 // Builds the rebuild_word and sets the best_state from the chopped_word and 00799 // the best_choice->state. 00800 void WERD_RES::RebuildBestState() { 00801 ASSERT_HOST(best_choice != NULL); 00802 if (rebuild_word != NULL) 00803 delete rebuild_word; 00804 rebuild_word = new TWERD; 00805 if (seam_array.empty()) 00806 start_seam_list(chopped_word, &seam_array); 00807 best_state.truncate(0); 00808 int start = 0; 00809 for (int i = 0; i < best_choice->length(); ++i) { 00810 int length = best_choice->state(i); 00811 best_state.push_back(length); 00812 if (length > 1) { 00813 SEAM::JoinPieces(seam_array, chopped_word->blobs, start, 00814 start + length - 1); 00815 } 00816 TBLOB* blob = chopped_word->blobs[start]; 00817 rebuild_word->blobs.push_back(new TBLOB(*blob)); 00818 if (length > 1) { 00819 SEAM::BreakPieces(seam_array, chopped_word->blobs, start, 00820 start + length - 1); 00821 } 00822 start += length; 00823 } 00824 } 00825 00826 // Copies the chopped_word to the rebuild_word, faking a best_state as well. 00827 // Also sets up the output box_word. 00828 void WERD_RES::CloneChoppedToRebuild() { 00829 if (rebuild_word != NULL) 00830 delete rebuild_word; 00831 rebuild_word = new TWERD(*chopped_word); 00832 SetupBoxWord(); 00833 int word_len = box_word->length(); 00834 best_state.reserve(word_len); 00835 correct_text.reserve(word_len); 00836 for (int i = 0; i < word_len; ++i) { 00837 best_state.push_back(1); 00838 correct_text.push_back(STRING("")); 00839 } 00840 } 00841 00842 // Sets/replaces the box_word with one made from the rebuild_word. 00843 void WERD_RES::SetupBoxWord() { 00844 if (box_word != NULL) 00845 delete box_word; 00846 rebuild_word->ComputeBoundingBoxes(); 00847 box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word); 00848 box_word->ClipToOriginalWord(denorm.block(), word); 00849 } 00850 00851 // Sets up the script positions in the output best_choice using the best_choice 00852 // to get the unichars, and the unicharset to get the target positions. 00853 void WERD_RES::SetScriptPositions() { 00854 best_choice->SetScriptPositions(small_caps, chopped_word); 00855 } 00856 // Sets all the blobs in all the words (raw choice and best choices) to be 00857 // the given position. (When a sub/superscript is recognized as a separate 00858 // word, it falls victim to the rule that a whole word cannot be sub or 00859 // superscript, so this function overrides that problem.) 00860 void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) { 00861 raw_choice->SetAllScriptPositions(position); 00862 WERD_CHOICE_IT wc_it(&best_choices); 00863 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) 00864 wc_it.data()->SetAllScriptPositions(position); 00865 } 00866 00867 // Classifies the word with some already-calculated BLOB_CHOICEs. 00868 // The choices are an array of blob_count pointers to BLOB_CHOICE, 00869 // providing a single classifier result for each blob. 00870 // The BLOB_CHOICEs are consumed and the word takes ownership. 00871 // The number of blobs in the box_word must match blob_count. 00872 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) { 00873 // Setup the WERD_RES. 00874 ASSERT_HOST(box_word != NULL); 00875 ASSERT_HOST(blob_count == box_word->length()); 00876 ClearWordChoices(); 00877 ClearRatings(); 00878 ratings = new MATRIX(blob_count, 1); 00879 for (int c = 0; c < blob_count; ++c) { 00880 BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST; 00881 BLOB_CHOICE_IT choice_it(choice_list); 00882 choice_it.add_after_then_move(choices[c]); 00883 ratings->put(c, c, choice_list); 00884 } 00885 FakeWordFromRatings(); 00886 reject_map.initialise(blob_count); 00887 done = true; 00888 } 00889 00890 // Creates a WERD_CHOICE for the word using the top choices from the leading 00891 // diagonal of the ratings matrix. 00892 void WERD_RES::FakeWordFromRatings() { 00893 int num_blobs = ratings->dimension(); 00894 WERD_CHOICE* word_choice = new WERD_CHOICE(uch_set, num_blobs); 00895 word_choice->set_permuter(TOP_CHOICE_PERM); 00896 for (int b = 0; b < num_blobs; ++b) { 00897 UNICHAR_ID unichar_id = UNICHAR_SPACE; 00898 float rating = MAX_INT32; 00899 float certainty = -MAX_INT32; 00900 BLOB_CHOICE_LIST* choices = ratings->get(b, b); 00901 if (choices != NULL && !choices->empty()) { 00902 BLOB_CHOICE_IT bc_it(choices); 00903 BLOB_CHOICE* choice = bc_it.data(); 00904 unichar_id = choice->unichar_id(); 00905 rating = choice->rating(); 00906 certainty = choice->certainty(); 00907 } 00908 word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating, 00909 certainty); 00910 } 00911 LogNewRawChoice(word_choice); 00912 // Ownership of word_choice taken by word here. 00913 LogNewCookedChoice(1, false, word_choice); 00914 } 00915 00916 // Copies the best_choice strings to the correct_text for adaption/training. 00917 void WERD_RES::BestChoiceToCorrectText() { 00918 correct_text.clear(); 00919 ASSERT_HOST(best_choice != NULL); 00920 for (int i = 0; i < best_choice->length(); ++i) { 00921 UNICHAR_ID choice_id = best_choice->unichar_id(i); 00922 const char* blob_choice = uch_set->id_to_unichar(choice_id); 00923 correct_text.push_back(STRING(blob_choice)); 00924 } 00925 } 00926 00927 // Merges 2 adjacent blobs in the result if the permanent callback 00928 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent 00929 // callback box_cb is NULL or returns true, setting the merged blob 00930 // result to the class returned from class_cb. 00931 // Returns true if anything was merged. 00932 bool WERD_RES::ConditionalBlobMerge( 00933 TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb, 00934 TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb) { 00935 ASSERT_HOST(best_choice->length() == 0 || ratings != NULL); 00936 bool modified = false; 00937 for (int i = 0; i + 1 < best_choice->length(); ++i) { 00938 UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i), 00939 best_choice->unichar_id(i+1)); 00940 if (new_id != INVALID_UNICHAR_ID && 00941 (box_cb == NULL || box_cb->Run(box_word->BlobBox(i), 00942 box_word->BlobBox(i + 1)))) { 00943 // Raw choice should not be fixed. 00944 best_choice->set_unichar_id(new_id, i); 00945 modified = true; 00946 MergeAdjacentBlobs(i); 00947 const MATRIX_COORD& coord = best_choice->MatrixCoord(i); 00948 if (!coord.Valid(*ratings)) { 00949 ratings->IncreaseBandSize(coord.row + 1 - coord.col); 00950 } 00951 BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i); 00952 if (FindMatchingChoice(new_id, blob_choices) == NULL) { 00953 // Insert a fake result. 00954 BLOB_CHOICE* blob_choice = new BLOB_CHOICE; 00955 blob_choice->set_unichar_id(new_id); 00956 BLOB_CHOICE_IT bc_it(blob_choices); 00957 bc_it.add_before_then_move(blob_choice); 00958 } 00959 } 00960 } 00961 delete class_cb; 00962 delete box_cb; 00963 return modified; 00964 } 00965 00966 // Merges 2 adjacent blobs in the result (index and index+1) and corrects 00967 // all the data to account for the change. 00968 void WERD_RES::MergeAdjacentBlobs(int index) { 00969 if (reject_map.length() == best_choice->length()) 00970 reject_map.remove_pos(index); 00971 best_choice->remove_unichar_id(index + 1); 00972 rebuild_word->MergeBlobs(index, index + 2); 00973 box_word->MergeBoxes(index, index + 2); 00974 if (index + 1 < best_state.length()) { 00975 best_state[index] += best_state[index + 1]; 00976 best_state.remove(index + 1); 00977 } 00978 } 00979 00980 // TODO(tkielbus) Decide between keeping this behavior here or modifying the 00981 // training data. 00982 00983 // Utility function for fix_quotes 00984 // Return true if the next character in the string (given the UTF8 length in 00985 // bytes) is a quote character. 00986 static int is_simple_quote(const char* signed_str, int length) { 00987 const unsigned char* str = 00988 reinterpret_cast<const unsigned char*>(signed_str); 00989 // Standard 1 byte quotes. 00990 return (length == 1 && (*str == '\'' || *str == '`')) || 00991 // UTF-8 3 bytes curved quotes. 00992 (length == 3 && ((*str == 0xe2 && 00993 *(str + 1) == 0x80 && 00994 *(str + 2) == 0x98) || 00995 (*str == 0xe2 && 00996 *(str + 1) == 0x80 && 00997 *(str + 2) == 0x99))); 00998 } 00999 01000 // Callback helper for fix_quotes returns a double quote if both 01001 // arguments are quote, otherwise INVALID_UNICHAR_ID. 01002 UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) { 01003 const char *ch = uch_set->id_to_unichar(id1); 01004 const char *next_ch = uch_set->id_to_unichar(id2); 01005 if (is_simple_quote(ch, strlen(ch)) && 01006 is_simple_quote(next_ch, strlen(next_ch))) 01007 return uch_set->unichar_to_id("\""); 01008 return INVALID_UNICHAR_ID; 01009 } 01010 01011 // Change pairs of quotes to double quotes. 01012 void WERD_RES::fix_quotes() { 01013 if (!uch_set->contains_unichar("\"") || 01014 !uch_set->get_enabled(uch_set->unichar_to_id("\""))) 01015 return; // Don't create it if it is disallowed. 01016 01017 ConditionalBlobMerge( 01018 NewPermanentTessCallback(this, &WERD_RES::BothQuotes), 01019 NULL); 01020 } 01021 01022 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both 01023 // arguments are hyphen, otherwise INVALID_UNICHAR_ID. 01024 UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) { 01025 const char *ch = uch_set->id_to_unichar(id1); 01026 const char *next_ch = uch_set->id_to_unichar(id2); 01027 if (strlen(ch) == 1 && strlen(next_ch) == 1 && 01028 (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~')) 01029 return uch_set->unichar_to_id("-"); 01030 return INVALID_UNICHAR_ID; 01031 } 01032 01033 // Callback helper for fix_hyphens returns true if box1 and box2 overlap 01034 // (assuming both on the same textline, are in order and a chopped em dash.) 01035 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) { 01036 return box1.right() >= box2.left(); 01037 } 01038 01039 // Change pairs of hyphens to a single hyphen if the bounding boxes touch 01040 // Typically a long dash which has been segmented. 01041 void WERD_RES::fix_hyphens() { 01042 if (!uch_set->contains_unichar("-") || 01043 !uch_set->get_enabled(uch_set->unichar_to_id("-"))) 01044 return; // Don't create it if it is disallowed. 01045 01046 ConditionalBlobMerge( 01047 NewPermanentTessCallback(this, &WERD_RES::BothHyphens), 01048 NewPermanentTessCallback(this, &WERD_RES::HyphenBoxesOverlap)); 01049 } 01050 01051 // Callback helper for merge_tess_fails returns a space if both 01052 // arguments are space, otherwise INVALID_UNICHAR_ID. 01053 UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) { 01054 if (id1 == id2 && id1 == uch_set->unichar_to_id(" ")) 01055 return id1; 01056 else 01057 return INVALID_UNICHAR_ID; 01058 } 01059 01060 // Change pairs of tess failures to a single one 01061 void WERD_RES::merge_tess_fails() { 01062 if (ConditionalBlobMerge( 01063 NewPermanentTessCallback(this, &WERD_RES::BothSpaces), NULL)) { 01064 int len = best_choice->length(); 01065 ASSERT_HOST(reject_map.length() == len); 01066 ASSERT_HOST(box_word->length() == len); 01067 } 01068 } 01069 01070 // Returns true if the collection of count pieces, starting at start, are all 01071 // natural connected components, ie there are no real chops involved. 01072 bool WERD_RES::PiecesAllNatural(int start, int count) const { 01073 // all seams must have no splits. 01074 for (int index = start; index < start + count - 1; ++index) { 01075 if (index >= 0 && index < seam_array.size()) { 01076 SEAM* seam = seam_array[index]; 01077 if (seam != NULL && seam->HasAnySplits()) return false; 01078 } 01079 } 01080 return true; 01081 } 01082 01083 01084 WERD_RES::~WERD_RES () { 01085 Clear(); 01086 } 01087 01088 void WERD_RES::InitNonPointers() { 01089 tess_failed = FALSE; 01090 tess_accepted = FALSE; 01091 tess_would_adapt = FALSE; 01092 done = FALSE; 01093 unlv_crunch_mode = CR_NONE; 01094 small_caps = false; 01095 odd_size = false; 01096 italic = FALSE; 01097 bold = FALSE; 01098 // The fontinfos and tesseract count as non-pointers as they point to 01099 // data owned elsewhere. 01100 fontinfo = NULL; 01101 fontinfo2 = NULL; 01102 tesseract = NULL; 01103 fontinfo_id_count = 0; 01104 fontinfo_id2_count = 0; 01105 x_height = 0.0; 01106 caps_height = 0.0; 01107 baseline_shift = 0.0f; 01108 guessed_x_ht = TRUE; 01109 guessed_caps_ht = TRUE; 01110 combination = FALSE; 01111 part_of_combo = FALSE; 01112 reject_spaces = FALSE; 01113 } 01114 01115 void WERD_RES::InitPointers() { 01116 word = NULL; 01117 bln_boxes = NULL; 01118 blob_row = NULL; 01119 uch_set = NULL; 01120 chopped_word = NULL; 01121 rebuild_word = NULL; 01122 box_word = NULL; 01123 ratings = NULL; 01124 best_choice = NULL; 01125 raw_choice = NULL; 01126 ep_choice = NULL; 01127 blamer_bundle = NULL; 01128 } 01129 01130 void WERD_RES::Clear() { 01131 if (word != NULL && combination) { 01132 delete word; 01133 } 01134 word = NULL; 01135 delete blamer_bundle; 01136 blamer_bundle = NULL; 01137 ClearResults(); 01138 } 01139 01140 void WERD_RES::ClearResults() { 01141 done = false; 01142 fontinfo = NULL; 01143 fontinfo2 = NULL; 01144 fontinfo_id_count = 0; 01145 fontinfo_id2_count = 0; 01146 if (bln_boxes != NULL) { 01147 delete bln_boxes; 01148 bln_boxes = NULL; 01149 } 01150 blob_row = NULL; 01151 if (chopped_word != NULL) { 01152 delete chopped_word; 01153 chopped_word = NULL; 01154 } 01155 if (rebuild_word != NULL) { 01156 delete rebuild_word; 01157 rebuild_word = NULL; 01158 } 01159 if (box_word != NULL) { 01160 delete box_word; 01161 box_word = NULL; 01162 } 01163 best_state.clear(); 01164 correct_text.clear(); 01165 seam_array.delete_data_pointers(); 01166 seam_array.clear(); 01167 blob_widths.clear(); 01168 blob_gaps.clear(); 01169 ClearRatings(); 01170 ClearWordChoices(); 01171 if (blamer_bundle != NULL) blamer_bundle->ClearResults(); 01172 } 01173 void WERD_RES::ClearWordChoices() { 01174 best_choice = NULL; 01175 if (raw_choice != NULL) { 01176 delete raw_choice; 01177 raw_choice = NULL; 01178 } 01179 best_choices.clear(); 01180 if (ep_choice != NULL) { 01181 delete ep_choice; 01182 ep_choice = NULL; 01183 } 01184 } 01185 void WERD_RES::ClearRatings() { 01186 if (ratings != NULL) { 01187 ratings->delete_matrix_pointers(); 01188 delete ratings; 01189 ratings = NULL; 01190 } 01191 } 01192 01193 01194 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const { 01195 return word_res == other.word_res && 01196 row_res == other.row_res && 01197 block_res == other.block_res; 01198 } 01199 01200 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const { 01201 ASSERT_HOST(page_res == other.page_res); 01202 if (other.block_res == NULL) { 01203 // other points to the end of the page. 01204 if (block_res == NULL) 01205 return 0; 01206 return -1; 01207 } 01208 if (block_res == NULL) { 01209 return 1; // we point to the end of the page. 01210 } 01211 if (block_res == other.block_res) { 01212 if (other.row_res == NULL || row_res == NULL) { 01213 // this should only happen if we hit an image block. 01214 return 0; 01215 } 01216 if (row_res == other.row_res) { 01217 // we point to the same block and row. 01218 ASSERT_HOST(other.word_res != NULL && word_res != NULL); 01219 if (word_res == other.word_res) { 01220 // we point to the same word! 01221 return 0; 01222 } 01223 01224 WERD_RES_IT word_res_it(&row_res->word_res_list); 01225 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); 01226 word_res_it.forward()) { 01227 if (word_res_it.data() == word_res) { 01228 return -1; 01229 } else if (word_res_it.data() == other.word_res) { 01230 return 1; 01231 } 01232 } 01233 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); 01234 } 01235 01236 // we both point to the same block, but different rows. 01237 ROW_RES_IT row_res_it(&block_res->row_res_list); 01238 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); 01239 row_res_it.forward()) { 01240 if (row_res_it.data() == row_res) { 01241 return -1; 01242 } else if (row_res_it.data() == other.row_res) { 01243 return 1; 01244 } 01245 } 01246 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); 01247 } 01248 01249 // We point to different blocks. 01250 BLOCK_RES_IT block_res_it(&page_res->block_res_list); 01251 for (block_res_it.mark_cycle_pt(); 01252 !block_res_it.cycled_list(); block_res_it.forward()) { 01253 if (block_res_it.data() == block_res) { 01254 return -1; 01255 } else if (block_res_it.data() == other.block_res) { 01256 return 1; 01257 } 01258 } 01259 // Shouldn't happen... 01260 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); 01261 return 0; 01262 } 01263 01264 // Inserts the new_word as a combination owned by a corresponding WERD_RES 01265 // before the current position. The simple fields of the WERD_RES are copied 01266 // from clone_res and the resulting WERD_RES is returned for further setup 01267 // with best_choice etc. 01268 WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res, 01269 WERD* new_word) { 01270 // Make a WERD_RES for the new_word. 01271 WERD_RES* new_res = new WERD_RES(new_word); 01272 new_res->CopySimpleFields(clone_res); 01273 new_res->combination = true; 01274 // Insert into the appropriate place in the ROW_RES. 01275 WERD_RES_IT wr_it(&row()->word_res_list); 01276 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { 01277 WERD_RES* word = wr_it.data(); 01278 if (word == word_res) 01279 break; 01280 } 01281 ASSERT_HOST(!wr_it.cycled_list()); 01282 wr_it.add_before_then_move(new_res); 01283 if (wr_it.at_first()) { 01284 // This is the new first word, so reset the member iterator so it 01285 // detects the cycled_list state correctly. 01286 ResetWordIterator(); 01287 } 01288 return new_res; 01289 } 01290 01291 // Helper computes the boundaries between blobs in the word. The blob bounds 01292 // are likely very poor, if they come from LSTM, where it only outputs the 01293 // character at one pixel within it, so we find the midpoints between them. 01294 static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs, 01295 GenericVector<int>* blob_ends) { 01296 C_BLOB_IT blob_it(word.word->cblob_list()); 01297 for (int i = 0; i < word.best_state.size(); ++i) { 01298 int length = word.best_state[i]; 01299 // Get the bounding box of the fake blobs 01300 TBOX blob_box = blob_it.data()->bounding_box(); 01301 blob_it.forward(); 01302 for (int b = 1; b < length; ++b) { 01303 blob_box += blob_it.data()->bounding_box(); 01304 blob_it.forward(); 01305 } 01306 // This blob_box is crap, so for now we are only looking for the 01307 // boundaries between them. 01308 int blob_end = MAX_INT32; 01309 if (!blob_it.at_first() || next_word_blobs != NULL) { 01310 if (blob_it.at_first()) 01311 blob_it.set_to_list(next_word_blobs); 01312 blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2; 01313 } 01314 blob_ends->push_back(blob_end); 01315 } 01316 } 01317 01318 // Replaces the current WERD/WERD_RES with the given words. The given words 01319 // contain fake blobs that indicate the position of the characters. These are 01320 // replaced with real blobs from the current word as much as possible. 01321 void PAGE_RES_IT::ReplaceCurrentWord( 01322 tesseract::PointerVector<WERD_RES>* words) { 01323 if (words->empty()) { 01324 DeleteCurrentWord(); 01325 return; 01326 } 01327 WERD_RES* input_word = word(); 01328 // Set the BOL/EOL flags on the words from the input word. 01329 if (input_word->word->flag(W_BOL)) { 01330 (*words)[0]->word->set_flag(W_BOL, true); 01331 } else { 01332 (*words)[0]->word->set_blanks(1); 01333 } 01334 words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL)); 01335 01336 // Move the blobs from the input word to the new set of words. 01337 // If the input word_res is a combination, then the replacements will also be 01338 // combinations, and will own their own words. If the input word_res is not a 01339 // combination, then the final replacements will not be either, (although it 01340 // is allowed for the input words to be combinations) and their words 01341 // will get put on the row list. This maintains the ownership rules. 01342 WERD_IT w_it(row()->row->word_list()); 01343 if (!input_word->combination) { 01344 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 01345 WERD* word = w_it.data(); 01346 if (word == input_word->word) 01347 break; 01348 } 01349 // w_it is now set to the input_word's word. 01350 ASSERT_HOST(!w_it.cycled_list()); 01351 } 01352 // Insert into the appropriate place in the ROW_RES. 01353 WERD_RES_IT wr_it(&row()->word_res_list); 01354 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { 01355 WERD_RES* word = wr_it.data(); 01356 if (word == input_word) 01357 break; 01358 } 01359 ASSERT_HOST(!wr_it.cycled_list()); 01360 // Since we only have an estimate of the bounds between blobs, use the blob 01361 // x-middle as the determiner of where to put the blobs 01362 C_BLOB_IT src_b_it(input_word->word->cblob_list()); 01363 src_b_it.sort(&C_BLOB::SortByXMiddle); 01364 C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list()); 01365 rej_b_it.sort(&C_BLOB::SortByXMiddle); 01366 for (int w = 0; w < words->size(); ++w) { 01367 WERD_RES* word_w = (*words)[w]; 01368 // Compute blob boundaries. 01369 GenericVector<int> blob_ends; 01370 C_BLOB_LIST* next_word_blobs = 01371 w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : NULL; 01372 ComputeBlobEnds(*word_w, next_word_blobs, &blob_ends); 01373 // Delete the fake blobs on the current word. 01374 word_w->word->cblob_list()->clear(); 01375 C_BLOB_IT dest_it(word_w->word->cblob_list()); 01376 // Build the box word as we move the blobs. 01377 tesseract::BoxWord* box_word = new tesseract::BoxWord; 01378 for (int i = 0; i < blob_ends.size(); ++i) { 01379 int end_x = blob_ends[i]; 01380 TBOX blob_box; 01381 // Add the blobs up to end_x. 01382 while (!src_b_it.empty() && 01383 src_b_it.data()->bounding_box().x_middle() < end_x) { 01384 blob_box += src_b_it.data()->bounding_box(); 01385 dest_it.add_after_then_move(src_b_it.extract()); 01386 src_b_it.forward(); 01387 } 01388 while (!rej_b_it.empty() && 01389 rej_b_it.data()->bounding_box().x_middle() < end_x) { 01390 blob_box += rej_b_it.data()->bounding_box(); 01391 dest_it.add_after_then_move(rej_b_it.extract()); 01392 rej_b_it.forward(); 01393 } 01394 // Clip to the previously computed bounds. Although imperfectly accurate, 01395 // it is good enough, and much more complicated to determine where else 01396 // to clip. 01397 if (i > 0 && blob_box.left() < blob_ends[i - 1]) 01398 blob_box.set_left(blob_ends[i - 1]); 01399 if (blob_box.right() > end_x) 01400 blob_box.set_right(end_x); 01401 box_word->InsertBox(i, blob_box); 01402 } 01403 // Fix empty boxes. If a very joined blob sits over multiple characters, 01404 // then we will have some empty boxes from using the middle, so look for 01405 // overlaps. 01406 for (int i = 0; i < box_word->length(); ++i) { 01407 TBOX box = box_word->BlobBox(i); 01408 if (box.null_box()) { 01409 // Nothing has its middle in the bounds of this blob, so use anything 01410 // that overlaps. 01411 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); 01412 dest_it.forward()) { 01413 TBOX blob_box = dest_it.data()->bounding_box(); 01414 if (blob_box.left() < blob_ends[i] && 01415 (i == 0 || blob_box.right() >= blob_ends[i - 1])) { 01416 if (i > 0 && blob_box.left() < blob_ends[i - 1]) 01417 blob_box.set_left(blob_ends[i - 1]); 01418 if (blob_box.right() > blob_ends[i]) 01419 blob_box.set_right(blob_ends[i]); 01420 box_word->ChangeBox(i, blob_box); 01421 break; 01422 } 01423 } 01424 } 01425 } 01426 delete word_w->box_word; 01427 word_w->box_word = box_word; 01428 if (!input_word->combination) { 01429 // Insert word_w->word into the ROW. It doesn't own its word, so the 01430 // ROW needs to own it. 01431 w_it.add_before_stay_put(word_w->word); 01432 word_w->combination = false; 01433 } 01434 (*words)[w] = NULL; // We are taking ownership. 01435 wr_it.add_before_stay_put(word_w); 01436 } 01437 // We have taken ownership of the words. 01438 words->clear(); 01439 // Delete the current word, which has been replaced. We could just call 01440 // DeleteCurrentWord, but that would iterate both lists again, and we know 01441 // we are already in the right place. 01442 if (!input_word->combination) 01443 delete w_it.extract(); 01444 delete wr_it.extract(); 01445 ResetWordIterator(); 01446 } 01447 01448 // Deletes the current WERD_RES and its underlying WERD. 01449 void PAGE_RES_IT::DeleteCurrentWord() { 01450 // Check that this word is as we expect. part_of_combos are NEVER iterated 01451 // by the normal iterator, so we should never be trying to delete them. 01452 ASSERT_HOST(!word_res->part_of_combo); 01453 if (!word_res->combination) { 01454 // Combinations own their own word, so we won't find the word on the 01455 // row's word_list, but it is legitimate to try to delete them. 01456 // Delete word from the ROW when not a combination. 01457 WERD_IT w_it(row()->row->word_list()); 01458 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 01459 if (w_it.data() == word_res->word) { 01460 break; 01461 } 01462 } 01463 ASSERT_HOST(!w_it.cycled_list()); 01464 delete w_it.extract(); 01465 } 01466 // Remove the WERD_RES for the new_word. 01467 // Remove the WORD_RES from the ROW_RES. 01468 WERD_RES_IT wr_it(&row()->word_res_list); 01469 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { 01470 if (wr_it.data() == word_res) { 01471 word_res = NULL; 01472 break; 01473 } 01474 } 01475 ASSERT_HOST(!wr_it.cycled_list()); 01476 delete wr_it.extract(); 01477 ResetWordIterator(); 01478 } 01479 01480 // Makes the current word a fuzzy space if not already fuzzy. Updates 01481 // corresponding part of combo if required. 01482 void PAGE_RES_IT::MakeCurrentWordFuzzy() { 01483 WERD* real_word = word_res->word; 01484 if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) { 01485 real_word->set_flag(W_FUZZY_SP, true); 01486 if (word_res->combination) { 01487 // The next word should be the corresponding part of combo, but we have 01488 // already stepped past it, so find it by search. 01489 WERD_RES_IT wr_it(&row()->word_res_list); 01490 for (wr_it.mark_cycle_pt(); 01491 !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) { 01492 } 01493 wr_it.forward(); 01494 ASSERT_HOST(wr_it.data()->part_of_combo); 01495 real_word = wr_it.data()->word; 01496 ASSERT_HOST(!real_word->flag(W_FUZZY_SP) && 01497 !real_word->flag(W_FUZZY_NON)); 01498 real_word->set_flag(W_FUZZY_SP, true); 01499 } 01500 } 01501 } 01502 01503 /************************************************************************* 01504 * PAGE_RES_IT::restart_page 01505 * 01506 * Set things up at the start of the page 01507 *************************************************************************/ 01508 01509 WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) { 01510 block_res_it.set_to_list(&page_res->block_res_list); 01511 block_res_it.mark_cycle_pt(); 01512 prev_block_res = NULL; 01513 prev_row_res = NULL; 01514 prev_word_res = NULL; 01515 block_res = NULL; 01516 row_res = NULL; 01517 word_res = NULL; 01518 next_block_res = NULL; 01519 next_row_res = NULL; 01520 next_word_res = NULL; 01521 internal_forward(true, empty_ok); 01522 return internal_forward(false, empty_ok); 01523 } 01524 01525 // Recovers from operations on the current word, such as in InsertCloneWord 01526 // and DeleteCurrentWord. 01527 // Resets the word_res_it so that it is one past the next_word_res, as 01528 // it should be after internal_forward. If next_row_res != row_res, 01529 // then the next_word_res is in the next row, so there is no need to do 01530 // anything to word_res_it, but it is still a good idea to reset the pointers 01531 // word_res and prev_word_res, which are still in the current row. 01532 void PAGE_RES_IT::ResetWordIterator() { 01533 if (row_res == next_row_res) { 01534 // Reset the member iterator so it can move forward and detect the 01535 // cycled_list state correctly. 01536 word_res_it.move_to_first(); 01537 for (word_res_it.mark_cycle_pt(); 01538 !word_res_it.cycled_list() && word_res_it.data() != next_word_res; 01539 word_res_it.forward()) { 01540 if (!word_res_it.data()->part_of_combo) { 01541 if (prev_row_res == row_res) prev_word_res = word_res; 01542 word_res = word_res_it.data(); 01543 } 01544 } 01545 ASSERT_HOST(!word_res_it.cycled_list()); 01546 word_res_it.forward(); 01547 } else { 01548 // word_res_it is OK, but reset word_res and prev_word_res if needed. 01549 WERD_RES_IT wr_it(&row_res->word_res_list); 01550 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { 01551 if (!wr_it.data()->part_of_combo) { 01552 if (prev_row_res == row_res) prev_word_res = word_res; 01553 word_res = wr_it.data(); 01554 } 01555 } 01556 } 01557 } 01558 01559 /************************************************************************* 01560 * PAGE_RES_IT::internal_forward 01561 * 01562 * Find the next word on the page. If empty_ok is true, then non-text blocks 01563 * and text blocks with no text are visited as if they contain a single 01564 * imaginary word in a single imaginary row. (word() and row() both return NULL 01565 * in such a block and the return value is NULL.) 01566 * If empty_ok is false, the old behaviour is maintained. Each real word 01567 * is visited and empty and non-text blocks and rows are skipped. 01568 * new_block is used to initialize the iterators for a new block. 01569 * The iterator maintains pointers to block, row and word for the previous, 01570 * current and next words. These are correct, regardless of block/row 01571 * boundaries. NULL values denote start and end of the page. 01572 *************************************************************************/ 01573 01574 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) { 01575 bool new_row = false; 01576 01577 prev_block_res = block_res; 01578 prev_row_res = row_res; 01579 prev_word_res = word_res; 01580 block_res = next_block_res; 01581 row_res = next_row_res; 01582 word_res = next_word_res; 01583 next_block_res = NULL; 01584 next_row_res = NULL; 01585 next_word_res = NULL; 01586 01587 while (!block_res_it.cycled_list()) { 01588 if (new_block) { 01589 new_block = false; 01590 row_res_it.set_to_list(&block_res_it.data()->row_res_list); 01591 row_res_it.mark_cycle_pt(); 01592 if (row_res_it.empty() && empty_ok) { 01593 next_block_res = block_res_it.data(); 01594 break; 01595 } 01596 new_row = true; 01597 } 01598 while (!row_res_it.cycled_list()) { 01599 if (new_row) { 01600 new_row = false; 01601 word_res_it.set_to_list(&row_res_it.data()->word_res_list); 01602 word_res_it.mark_cycle_pt(); 01603 } 01604 // Skip any part_of_combo words. 01605 while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) 01606 word_res_it.forward(); 01607 if (!word_res_it.cycled_list()) { 01608 next_block_res = block_res_it.data(); 01609 next_row_res = row_res_it.data(); 01610 next_word_res = word_res_it.data(); 01611 word_res_it.forward(); 01612 goto foundword; 01613 } 01614 // end of row reached 01615 row_res_it.forward(); 01616 new_row = true; 01617 } 01618 // end of block reached 01619 block_res_it.forward(); 01620 new_block = true; 01621 } 01622 foundword: 01623 // Update prev_word_best_choice pointer. 01624 if (page_res != NULL && page_res->prev_word_best_choice != NULL) { 01625 *page_res->prev_word_best_choice = 01626 (new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice; 01627 } 01628 return word_res; 01629 } 01630 01631 /************************************************************************* 01632 * PAGE_RES_IT::restart_row() 01633 * 01634 * Move to the beginning (leftmost word) of the current row. 01635 *************************************************************************/ 01636 WERD_RES *PAGE_RES_IT::restart_row() { 01637 ROW_RES *row = this->row(); 01638 if (!row) return NULL; 01639 for (restart_page(); this->row() != row; forward()) { 01640 // pass 01641 } 01642 return word(); 01643 } 01644 01645 /************************************************************************* 01646 * PAGE_RES_IT::forward_paragraph 01647 * 01648 * Move to the beginning of the next paragraph, allowing empty blocks. 01649 *************************************************************************/ 01650 01651 WERD_RES *PAGE_RES_IT::forward_paragraph() { 01652 while (block_res == next_block_res && 01653 (next_row_res != NULL && next_row_res->row != NULL && 01654 row_res->row->para() == next_row_res->row->para())) { 01655 internal_forward(false, true); 01656 } 01657 return internal_forward(false, true); 01658 } 01659 01660 /************************************************************************* 01661 * PAGE_RES_IT::forward_block 01662 * 01663 * Move to the beginning of the next block, allowing empty blocks. 01664 *************************************************************************/ 01665 01666 WERD_RES *PAGE_RES_IT::forward_block() { 01667 while (block_res == next_block_res) { 01668 internal_forward(false, true); 01669 } 01670 return internal_forward(false, true); 01671 } 01672 01673 void PAGE_RES_IT::rej_stat_word() { 01674 inT16 chars_in_word; 01675 inT16 rejects_in_word = 0; 01676 01677 chars_in_word = word_res->reject_map.length (); 01678 page_res->char_count += chars_in_word; 01679 block_res->char_count += chars_in_word; 01680 row_res->char_count += chars_in_word; 01681 01682 rejects_in_word = word_res->reject_map.reject_count (); 01683 01684 page_res->rej_count += rejects_in_word; 01685 block_res->rej_count += rejects_in_word; 01686 row_res->rej_count += rejects_in_word; 01687 if (chars_in_word == rejects_in_word) 01688 row_res->whole_word_rej_count += rejects_in_word; 01689 }