|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: applybox.cpp (Formerly applybox.c) 00003 * Description: Re segment rows according to box file data 00004 * Author: Phil Cheatle 00005 * Created: Wed Nov 24 09:11:23 GMT 1993 00006 * 00007 * (C) Copyright 1993, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include <ctype.h> 00025 #include <string.h> 00026 #ifdef __UNIX__ 00027 #include <assert.h> 00028 #include <errno.h> 00029 #endif 00030 #include "allheaders.h" 00031 #include "boxread.h" 00032 #include "chopper.h" 00033 #include "pageres.h" 00034 #include "unichar.h" 00035 #include "unicharset.h" 00036 #include "tesseractclass.h" 00037 #include "genericvector.h" 00038 00040 const int kMaxGroupSize = 4; 00043 const double kMaxXHeightDeviationFraction = 0.125; 00044 00080 namespace tesseract { 00081 00082 static void clear_any_old_text(BLOCK_LIST *block_list) { 00083 BLOCK_IT block_it(block_list); 00084 for (block_it.mark_cycle_pt(); 00085 !block_it.cycled_list(); block_it.forward()) { 00086 ROW_IT row_it(block_it.data()->row_list()); 00087 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00088 WERD_IT word_it(row_it.data()->word_list()); 00089 for (word_it.mark_cycle_pt(); 00090 !word_it.cycled_list(); word_it.forward()) { 00091 word_it.data()->set_text(""); 00092 } 00093 } 00094 } 00095 } 00096 00097 // Applies the box file based on the image name fname, and resegments 00098 // the words in the block_list (page), with: 00099 // blob-mode: one blob per line in the box file, words as input. 00100 // word/line-mode: one blob per space-delimited unit after the #, and one word 00101 // per line in the box file. (See comment above for box file format.) 00102 // If find_segmentation is true, (word/line mode) then the classifier is used 00103 // to re-segment words/lines to match the space-delimited truth string for 00104 // each box. In this case, the input box may be for a word or even a whole 00105 // text line, and the output words will contain multiple blobs corresponding 00106 // to the space-delimited input string. 00107 // With find_segmentation false, no classifier is needed, but the chopper 00108 // can still be used to correctly segment touching characters with the help 00109 // of the input boxes. 00110 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned 00111 // from normal classification, ie. with a word, chopped_word, rebuild_word, 00112 // seam_array, denorm, box_word, and best_state, but NO best_choice or 00113 // raw_choice, as they would require a UNICHARSET, which we aim to avoid. 00114 // Instead, the correct_text member of WERD_RES is set, and this may be later 00115 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords 00116 // is not required before calling ApplyBoxTraining. 00117 PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname, 00118 bool find_segmentation, 00119 BLOCK_LIST *block_list) { 00120 GenericVector<TBOX> boxes; 00121 GenericVector<STRING> texts, full_texts; 00122 if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts, 00123 NULL)) { 00124 return NULL; // Can't do it. 00125 } 00126 00127 int box_count = boxes.size(); 00128 int box_failures = 0; 00129 // Add an empty everything to the end. 00130 boxes.push_back(TBOX()); 00131 texts.push_back(STRING()); 00132 full_texts.push_back(STRING()); 00133 00134 // In word mode, we use the boxes to make a word for each box, but 00135 // in blob mode we use the existing words and maximally chop them first. 00136 PAGE_RES* page_res = find_segmentation ? 00137 NULL : SetupApplyBoxes(boxes, block_list); 00138 clear_any_old_text(block_list); 00139 00140 for (int i = 0; i < boxes.size() - 1; i++) { 00141 bool foundit = false; 00142 if (page_res != NULL) { 00143 if (i == 0) { 00144 foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1], 00145 full_texts[i].string()); 00146 } else { 00147 foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i], 00148 boxes[i + 1], full_texts[i].string()); 00149 } 00150 } else { 00151 foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1], 00152 texts[i].string()); 00153 } 00154 if (!foundit) { 00155 box_failures++; 00156 ReportFailedBox(i, boxes[i], texts[i].string(), 00157 "FAILURE! Couldn't find a matching blob"); 00158 } 00159 } 00160 00161 if (page_res == NULL) { 00162 // In word/line mode, we now maximally chop all the words and resegment 00163 // them with the classifier. 00164 page_res = SetupApplyBoxes(boxes, block_list); 00165 ReSegmentByClassification(page_res); 00166 } 00167 if (applybox_debug > 0) { 00168 tprintf("APPLY_BOXES:\n"); 00169 tprintf(" Boxes read from boxfile: %6d\n", box_count); 00170 if (box_failures > 0) 00171 tprintf(" Boxes failed resegmentation: %6d\n", box_failures); 00172 } 00173 TidyUp(page_res); 00174 return page_res; 00175 } 00176 00177 // Helper computes median xheight in the image. 00178 static double MedianXHeight(BLOCK_LIST *block_list) { 00179 BLOCK_IT block_it(block_list); 00180 STATS xheights(0, block_it.data()->bounding_box().height()); 00181 for (block_it.mark_cycle_pt(); 00182 !block_it.cycled_list(); block_it.forward()) { 00183 ROW_IT row_it(block_it.data()->row_list()); 00184 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00185 xheights.add(IntCastRounded(row_it.data()->x_height()), 1); 00186 } 00187 } 00188 return xheights.median(); 00189 } 00190 00193 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) { 00194 double median_xheight = MedianXHeight(block_list); 00195 double max_deviation = kMaxXHeightDeviationFraction * median_xheight; 00196 // Strip all fuzzy space markers to simplify the PAGE_RES. 00197 BLOCK_IT b_it(block_list); 00198 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00199 BLOCK* block = b_it.data(); 00200 ROW_IT r_it(block->row_list()); 00201 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { 00202 ROW* row = r_it.data(); 00203 float diff = fabs(row->x_height() - median_xheight); 00204 if (diff > max_deviation) { 00205 if (applybox_debug) { 00206 tprintf("row xheight=%g, but median xheight = %g\n", 00207 row->x_height(), median_xheight); 00208 } 00209 row->set_x_height(static_cast<float>(median_xheight)); 00210 } 00211 } 00212 } 00213 } 00214 00217 PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes, 00218 BLOCK_LIST *block_list) { 00219 PreenXHeights(block_list); 00220 // Strip all fuzzy space markers to simplify the PAGE_RES. 00221 BLOCK_IT b_it(block_list); 00222 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00223 BLOCK* block = b_it.data(); 00224 ROW_IT r_it(block->row_list()); 00225 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { 00226 ROW* row = r_it.data(); 00227 WERD_IT w_it(row->word_list()); 00228 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00229 WERD* word = w_it.data(); 00230 if (word->cblob_list()->empty()) { 00231 delete w_it.extract(); 00232 } else { 00233 word->set_flag(W_FUZZY_SP, false); 00234 word->set_flag(W_FUZZY_NON, false); 00235 } 00236 } 00237 } 00238 } 00239 PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL); 00240 PAGE_RES_IT pr_it(page_res); 00241 WERD_RES* word_res; 00242 while ((word_res = pr_it.word()) != NULL) { 00243 MaximallyChopWord(boxes, pr_it.block()->block, 00244 pr_it.row()->row, word_res); 00245 pr_it.forward(); 00246 } 00247 return page_res; 00248 } 00249 00253 void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes, 00254 BLOCK* block, ROW* row, 00255 WERD_RES* word_res) { 00256 if (!word_res->SetupForRecognition(unicharset, this, BestPix(), 00257 tessedit_ocr_engine_mode, NULL, 00258 classify_bln_numeric_mode, 00259 textord_use_cjk_fp_model, 00260 poly_allow_detailed_fx, 00261 row, block)) { 00262 word_res->CloneChoppedToRebuild(); 00263 return; 00264 } 00265 if (chop_debug) { 00266 tprintf("Maximally chopping word at:"); 00267 word_res->word->bounding_box().print(); 00268 } 00269 GenericVector<BLOB_CHOICE*> blob_choices; 00270 ASSERT_HOST(!word_res->chopped_word->blobs.empty()); 00271 float rating = static_cast<float>(MAX_INT8); 00272 for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) { 00273 // The rating and certainty are not quite arbitrary. Since 00274 // select_blob_to_chop uses the worst certainty to choose, they all have 00275 // to be different, so starting with MAX_INT8, subtract 1/8 for each blob 00276 // in here, and then divide by e each time they are chopped, which 00277 // should guarantee a set of unequal values for the whole tree of blobs 00278 // produced, however much chopping is required. The chops are thus only 00279 // limited by the ability of the chopper to find suitable chop points, 00280 // and not by the value of the certainties. 00281 BLOB_CHOICE* choice = 00282 new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE); 00283 blob_choices.push_back(choice); 00284 rating -= 0.125f; 00285 } 00286 const double e = exp(1.0); // The base of natural logs. 00287 int blob_number; 00288 int right_chop_index = 0; 00289 if (!assume_fixed_pitch_char_segment) { 00290 // We only chop if the language is not fixed pitch like CJK. 00291 SEAM* seam = NULL; 00292 while ((seam = chop_one_blob(boxes, blob_choices, word_res, 00293 &blob_number)) != NULL) { 00294 word_res->InsertSeam(blob_number, seam); 00295 BLOB_CHOICE* left_choice = blob_choices[blob_number]; 00296 rating = left_choice->rating() / e; 00297 left_choice->set_rating(rating); 00298 left_choice->set_certainty(-rating); 00299 // combine confidence w/ serial # 00300 BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index, 00301 rating - 0.125f, -rating, -1, 00302 0.0f, 0.0f, 0.0f, BCC_FAKE); 00303 blob_choices.insert(right_choice, blob_number + 1); 00304 } 00305 } 00306 word_res->CloneChoppedToRebuild(); 00307 word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]); 00308 } 00309 00321 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) { 00322 int overlap_area = box1.intersection(box2).area(); 00323 double miss_metric = box1.area()- overlap_area; 00324 miss_metric /= box1.area(); 00325 miss_metric *= box2.area() - overlap_area; 00326 miss_metric /= box2.area(); 00327 return miss_metric; 00328 } 00329 00340 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, 00341 const TBOX& box, const TBOX& next_box, 00342 const char* correct_text) { 00343 if (applybox_debug > 1) { 00344 tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text); 00345 } 00346 PAGE_RES_IT page_res_it(page_res); 00347 WERD_RES* word_res; 00348 for (word_res = page_res_it.word(); word_res != NULL; 00349 word_res = page_res_it.forward()) { 00350 if (!word_res->box_word->bounding_box().major_overlap(box)) 00351 continue; 00352 if (applybox_debug > 1) { 00353 tprintf("Checking word box:"); 00354 word_res->box_word->bounding_box().print(); 00355 } 00356 int word_len = word_res->box_word->length(); 00357 for (int i = 0; i < word_len; ++i) { 00358 TBOX char_box = TBOX(); 00359 int blob_count = 0; 00360 for (blob_count = 0; i + blob_count < word_len; ++blob_count) { 00361 TBOX blob_box = word_res->box_word->BlobBox(i + blob_count); 00362 if (!blob_box.major_overlap(box)) 00363 break; 00364 if (word_res->correct_text[i + blob_count].length() > 0) 00365 break; // Blob is claimed already. 00366 double current_box_miss_metric = BoxMissMetric(blob_box, box); 00367 double next_box_miss_metric = BoxMissMetric(blob_box, next_box); 00368 if (applybox_debug > 2) { 00369 tprintf("Checking blob:"); 00370 blob_box.print(); 00371 tprintf("Current miss metric = %g, next = %g\n", 00372 current_box_miss_metric, next_box_miss_metric); 00373 } 00374 if (current_box_miss_metric > next_box_miss_metric) 00375 break; // Blob is a better match for next box. 00376 char_box += blob_box; 00377 } 00378 if (blob_count > 0) { 00379 if (applybox_debug > 1) { 00380 tprintf("Index [%d, %d) seem good.\n", i, i + blob_count); 00381 } 00382 if (!char_box.almost_equal(box, 3) && 00383 (box.x_gap(next_box) < -3 || 00384 (prev_box != NULL && prev_box->x_gap(box) < -3))) { 00385 return false; 00386 } 00387 // We refine just the box_word, best_state and correct_text here. 00388 // The rebuild_word is made in TidyUp. 00389 // blob_count blobs are put together to match the box. Merge the 00390 // box_word boxes, save the blob_count in the state and the text. 00391 word_res->box_word->MergeBoxes(i, i + blob_count); 00392 word_res->best_state[i] = blob_count; 00393 word_res->correct_text[i] = correct_text; 00394 if (applybox_debug > 2) { 00395 tprintf("%d Blobs match: blob box:", blob_count); 00396 word_res->box_word->BlobBox(i).print(); 00397 tprintf("Matches box:"); 00398 box.print(); 00399 tprintf("With next box:"); 00400 next_box.print(); 00401 } 00402 // Eliminated best_state and correct_text entries for the consumed 00403 // blobs. 00404 for (int j = 1; j < blob_count; ++j) { 00405 word_res->best_state.remove(i + 1); 00406 word_res->correct_text.remove(i + 1); 00407 } 00408 // Assume that no box spans multiple source words, so we are done with 00409 // this box. 00410 if (applybox_debug > 1) { 00411 tprintf("Best state = "); 00412 for (int j = 0; j < word_res->best_state.size(); ++j) { 00413 tprintf("%d ", word_res->best_state[j]); 00414 } 00415 tprintf("\n"); 00416 tprintf("Correct text = [[ "); 00417 for (int j = 0; j < word_res->correct_text.size(); ++j) { 00418 tprintf("%s ", word_res->correct_text[j].string()); 00419 } 00420 tprintf("]]\n"); 00421 } 00422 return true; 00423 } 00424 } 00425 } 00426 if (applybox_debug > 0) { 00427 tprintf("FAIL!\n"); 00428 } 00429 return false; // Failure. 00430 } 00431 00438 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, 00439 const TBOX& box, const TBOX& next_box, 00440 const char* correct_text) { 00441 if (applybox_debug > 1) { 00442 tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); 00443 } 00444 WERD* new_word = NULL; 00445 BLOCK_IT b_it(block_list); 00446 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00447 BLOCK* block = b_it.data(); 00448 if (!box.major_overlap(block->bounding_box())) 00449 continue; 00450 ROW_IT r_it(block->row_list()); 00451 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { 00452 ROW* row = r_it.data(); 00453 if (!box.major_overlap(row->bounding_box())) 00454 continue; 00455 WERD_IT w_it(row->word_list()); 00456 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00457 WERD* word = w_it.data(); 00458 if (applybox_debug > 2) { 00459 tprintf("Checking word:"); 00460 word->bounding_box().print(); 00461 } 00462 if (word->text() != NULL && word->text()[0] != '\0') 00463 continue; // Ignore words that are already done. 00464 if (!box.major_overlap(word->bounding_box())) 00465 continue; 00466 C_BLOB_IT blob_it(word->cblob_list()); 00467 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); 00468 blob_it.forward()) { 00469 C_BLOB* blob = blob_it.data(); 00470 TBOX blob_box = blob->bounding_box(); 00471 if (!blob_box.major_overlap(box)) 00472 continue; 00473 double current_box_miss_metric = BoxMissMetric(blob_box, box); 00474 double next_box_miss_metric = BoxMissMetric(blob_box, next_box); 00475 if (applybox_debug > 2) { 00476 tprintf("Checking blob:"); 00477 blob_box.print(); 00478 tprintf("Current miss metric = %g, next = %g\n", 00479 current_box_miss_metric, next_box_miss_metric); 00480 } 00481 if (current_box_miss_metric > next_box_miss_metric) 00482 continue; // Blob is a better match for next box. 00483 if (applybox_debug > 2) { 00484 tprintf("Blob match: blob:"); 00485 blob_box.print(); 00486 tprintf("Matches box:"); 00487 box.print(); 00488 tprintf("With next box:"); 00489 next_box.print(); 00490 } 00491 if (new_word == NULL) { 00492 // Make a new word with a single blob. 00493 new_word = word->shallow_copy(); 00494 new_word->set_text(correct_text); 00495 w_it.add_to_end(new_word); 00496 } 00497 C_BLOB_IT new_blob_it(new_word->cblob_list()); 00498 new_blob_it.add_to_end(blob_it.extract()); 00499 } 00500 } 00501 } 00502 } 00503 if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); 00504 return new_word != NULL; 00505 } 00506 00509 void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) { 00510 PAGE_RES_IT pr_it(page_res); 00511 WERD_RES* word_res; 00512 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00513 WERD* word = word_res->word; 00514 if (word->text() == NULL || word->text()[0] == '\0') 00515 continue; // Ignore words that have no text. 00516 // Convert the correct text to a vector of UNICHAR_ID 00517 GenericVector<UNICHAR_ID> target_text; 00518 if (!ConvertStringToUnichars(word->text(), &target_text)) { 00519 tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", 00520 word->text()); 00521 pr_it.DeleteCurrentWord(); 00522 continue; 00523 } 00524 if (!FindSegmentation(target_text, word_res)) { 00525 tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", 00526 word->text()); 00527 pr_it.DeleteCurrentWord(); 00528 continue; 00529 } 00530 } 00531 } 00532 00535 bool Tesseract::ConvertStringToUnichars(const char* utf8, 00536 GenericVector<UNICHAR_ID>* class_ids) { 00537 for (int step = 0; *utf8 != '\0'; utf8 += step) { 00538 const char* next_space = strchr(utf8, ' '); 00539 if (next_space == NULL) 00540 next_space = utf8 + strlen(utf8); 00541 step = next_space - utf8; 00542 UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step); 00543 if (class_id == INVALID_UNICHAR_ID) { 00544 return false; 00545 } 00546 while (utf8[step] == ' ') 00547 ++step; 00548 class_ids->push_back(class_id); 00549 } 00550 return true; 00551 } 00552 00559 bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, 00560 WERD_RES* word_res) { 00561 // Classify all required combinations of blobs and save results in choices. 00562 int word_length = word_res->box_word->length(); 00563 GenericVector<BLOB_CHOICE_LIST*>* choices = 00564 new GenericVector<BLOB_CHOICE_LIST*>[word_length]; 00565 for (int i = 0; i < word_length; ++i) { 00566 for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { 00567 BLOB_CHOICE_LIST* match_result = classify_piece( 00568 word_res->seam_array, i, i + j - 1, "Applybox", 00569 word_res->chopped_word, word_res->blamer_bundle); 00570 if (applybox_debug > 2) { 00571 tprintf("%d+%d:", i, j); 00572 print_ratings_list("Segment:", match_result, unicharset); 00573 } 00574 choices[i].push_back(match_result); 00575 } 00576 } 00577 // Search the segmentation graph for the target text. Must be an exact 00578 // match. Using wildcards makes it difficult to find the correct 00579 // segmentation even when it is there. 00580 word_res->best_state.clear(); 00581 GenericVector<int> search_segmentation; 00582 float best_rating = 0.0f; 00583 SearchForText(choices, 0, word_length, target_text, 0, 0.0f, 00584 &search_segmentation, &best_rating, &word_res->best_state); 00585 for (int i = 0; i < word_length; ++i) 00586 choices[i].delete_data_pointers(); 00587 delete [] choices; 00588 if (word_res->best_state.empty()) { 00589 // Build the original segmentation and if it is the same length as the 00590 // truth, assume it will do. 00591 int blob_count = 1; 00592 for (int s = 0; s < word_res->seam_array.size(); ++s) { 00593 SEAM* seam = word_res->seam_array[s]; 00594 if (!seam->HasAnySplits()) { 00595 word_res->best_state.push_back(blob_count); 00596 blob_count = 1; 00597 } else { 00598 ++blob_count; 00599 } 00600 } 00601 word_res->best_state.push_back(blob_count); 00602 if (word_res->best_state.size() != target_text.size()) { 00603 word_res->best_state.clear(); // No good. Original segmentation bad size. 00604 return false; 00605 } 00606 } 00607 word_res->correct_text.clear(); 00608 for (int i = 0; i < target_text.size(); ++i) { 00609 word_res->correct_text.push_back( 00610 STRING(unicharset.id_to_unichar(target_text[i]))); 00611 } 00612 return true; 00613 } 00614 00629 void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, 00630 int choices_pos, int choices_length, 00631 const GenericVector<UNICHAR_ID>& target_text, 00632 int text_index, 00633 float rating, GenericVector<int>* segmentation, 00634 float* best_rating, 00635 GenericVector<int>* best_segmentation) { 00636 const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs(); 00637 for (int length = 1; length <= choices[choices_pos].size(); ++length) { 00638 // Rating of matching choice or worst choice if no match. 00639 float choice_rating = 0.0f; 00640 // Find the corresponding best BLOB_CHOICE. 00641 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]); 00642 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 00643 choice_it.forward()) { 00644 BLOB_CHOICE* choice = choice_it.data(); 00645 choice_rating = choice->rating(); 00646 UNICHAR_ID class_id = choice->unichar_id(); 00647 if (class_id == target_text[text_index]) { 00648 break; 00649 } 00650 // Search ambigs table. 00651 if (class_id < table.size() && table[class_id] != NULL) { 00652 AmbigSpec_IT spec_it(table[class_id]); 00653 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); 00654 spec_it.forward()) { 00655 const AmbigSpec *ambig_spec = spec_it.data(); 00656 // We'll only do 1-1. 00657 if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID && 00658 ambig_spec->correct_ngram_id == target_text[text_index]) 00659 break; 00660 } 00661 if (!spec_it.cycled_list()) 00662 break; // Found an ambig. 00663 } 00664 } 00665 if (choice_it.cycled_list()) 00666 continue; // No match. 00667 segmentation->push_back(length); 00668 if (choices_pos + length == choices_length && 00669 text_index + 1 == target_text.size()) { 00670 // This is a complete match. If the rating is good record a new best. 00671 if (applybox_debug > 2) { 00672 tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n", 00673 rating + choice_rating, *best_rating, segmentation->size(), 00674 best_segmentation->size()); 00675 } 00676 if (best_segmentation->empty() || rating + choice_rating < *best_rating) { 00677 *best_segmentation = *segmentation; 00678 *best_rating = rating + choice_rating; 00679 } 00680 } else if (choices_pos + length < choices_length && 00681 text_index + 1 < target_text.size()) { 00682 if (applybox_debug > 3) { 00683 tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", 00684 target_text[text_index], 00685 unicharset.id_to_unichar(target_text[text_index]), 00686 choice_it.data()->unichar_id() == target_text[text_index] 00687 ? "Match" : "Ambig", 00688 choices_pos, length); 00689 } 00690 SearchForText(choices, choices_pos + length, choices_length, target_text, 00691 text_index + 1, rating + choice_rating, segmentation, 00692 best_rating, best_segmentation); 00693 if (applybox_debug > 3) { 00694 tprintf("End recursion for %d=%s\n", target_text[text_index], 00695 unicharset.id_to_unichar(target_text[text_index])); 00696 } 00697 } 00698 segmentation->truncate(segmentation->size() - 1); 00699 } 00700 } 00701 00706 void Tesseract::TidyUp(PAGE_RES* page_res) { 00707 int ok_blob_count = 0; 00708 int bad_blob_count = 0; 00709 int ok_word_count = 0; 00710 int unlabelled_words = 0; 00711 PAGE_RES_IT pr_it(page_res); 00712 WERD_RES* word_res; 00713 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00714 int ok_in_word = 0; 00715 int blob_count = word_res->correct_text.size(); 00716 WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count); 00717 word_choice->set_permuter(TOP_CHOICE_PERM); 00718 for (int c = 0; c < blob_count; ++c) { 00719 if (word_res->correct_text[c].length() > 0) { 00720 ++ok_in_word; 00721 } 00722 // Since we only need a fake word_res->best_choice, the actual 00723 // unichar_ids do not matter. Which is fortunate, since TidyUp() 00724 // can be called while training Tesseract, at the stage where 00725 // unicharset is not meaningful yet. 00726 word_choice->append_unichar_id_space_allocated( 00727 INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f); 00728 } 00729 if (ok_in_word > 0) { 00730 ok_blob_count += ok_in_word; 00731 bad_blob_count += word_res->correct_text.size() - ok_in_word; 00732 word_res->LogNewRawChoice(word_choice); 00733 word_res->LogNewCookedChoice(1, false, word_choice); 00734 } else { 00735 ++unlabelled_words; 00736 if (applybox_debug > 0) { 00737 tprintf("APPLY_BOXES: Unlabelled word at :"); 00738 word_res->word->bounding_box().print(); 00739 } 00740 pr_it.DeleteCurrentWord(); 00741 delete word_choice; 00742 } 00743 } 00744 pr_it.restart_page(); 00745 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00746 // Denormalize back to a BoxWord. 00747 word_res->RebuildBestState(); 00748 word_res->SetupBoxWord(); 00749 word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row()); 00750 word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row()); 00751 } 00752 if (applybox_debug > 0) { 00753 tprintf(" Found %d good blobs.\n", ok_blob_count); 00754 if (bad_blob_count > 0) { 00755 tprintf(" Leaving %d unlabelled blobs in %d words.\n", 00756 bad_blob_count, ok_word_count); 00757 } 00758 if (unlabelled_words > 0) 00759 tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words); 00760 } 00761 } 00762 00764 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, 00765 const char *box_ch, const char *err_msg) { 00766 tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", 00767 boxfile_lineno + 1, box_ch, 00768 box.left(), box.bottom(), box.right(), box.top(), err_msg); 00769 } 00770 00772 void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) { 00773 PAGE_RES_IT pr_it(page_res); 00774 for (WERD_RES *word_res = pr_it.word(); word_res != NULL; 00775 word_res = pr_it.forward()) { 00776 WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set, 00777 word_res->correct_text.size()); 00778 for (int i = 0; i < word_res->correct_text.size(); ++i) { 00779 // The part before the first space is the real ground truth, and the 00780 // rest is the bounding box location and page number. 00781 GenericVector<STRING> tokens; 00782 word_res->correct_text[i].split(' ', &tokens); 00783 UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string()); 00784 choice->append_unichar_id_space_allocated(char_id, 00785 word_res->best_state[i], 00786 0.0f, 0.0f); 00787 } 00788 word_res->ClearWordChoices(); 00789 word_res->LogNewRawChoice(choice); 00790 word_res->LogNewCookedChoice(1, false, choice); 00791 } 00792 } 00793 00796 void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) { 00797 PAGE_RES_IT pr_it(page_res); 00798 int word_count = 0; 00799 for (WERD_RES *word_res = pr_it.word(); word_res != NULL; 00800 word_res = pr_it.forward()) { 00801 LearnWord(fontname.string(), word_res); 00802 ++word_count; 00803 } 00804 tprintf("Generated training data for %d words\n", word_count); 00805 } 00806 00807 00808 } // namespace tesseract