|
tesseract 3.04.01
|
00001 /****************************************************************** 00002 * File: control.cpp (Formerly control.c) 00003 * Description: Module-independent matcher controller. 00004 * Author: Ray Smith 00005 * Created: Thu Apr 23 11:09:58 BST 1992 00006 * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle 00007 * 00008 * (C) Copyright 1992, Hewlett-Packard Ltd. 00009 ** Licensed under the Apache License, Version 2.0 (the "License"); 00010 ** you may not use this file except in compliance with the License. 00011 ** You may obtain a copy of the License at 00012 ** http://www.apache.org/licenses/LICENSE-2.0 00013 ** Unless required by applicable law or agreed to in writing, software 00014 ** distributed under the License is distributed on an "AS IS" BASIS, 00015 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 ** See the License for the specific language governing permissions and 00017 ** limitations under the License. 00018 * 00019 **********************************************************************/ 00020 00021 // Include automatically generated configuration file if running autoconf. 00022 #ifdef HAVE_CONFIG_H 00023 #include "config_auto.h" 00024 #endif 00025 00026 #include <string.h> 00027 #include <math.h> 00028 #ifdef __UNIX__ 00029 #include <assert.h> 00030 #include <unistd.h> 00031 #include <errno.h> 00032 #endif 00033 #include <ctype.h> 00034 #include "ocrclass.h" 00035 #include "werdit.h" 00036 #include "drawfx.h" 00037 #include "tessbox.h" 00038 #include "tessvars.h" 00039 #include "pgedit.h" 00040 #include "reject.h" 00041 #include "fixspace.h" 00042 #include "docqual.h" 00043 #include "control.h" 00044 #include "output.h" 00045 #include "callcpp.h" 00046 #include "globals.h" 00047 #include "sorthelper.h" 00048 #include "tesseractclass.h" 00049 00050 #define MIN_FONT_ROW_COUNT 8 00051 #define MAX_XHEIGHT_DIFF 3 00052 00053 const char* const kBackUpConfigFile = "tempconfigdata.config"; 00054 // Multiple of x-height to make a repeated word have spaces in it. 00055 const double kRepcharGapThreshold = 0.5; 00056 // Min believable x-height for any text when refitting as a fraction of 00057 // original x-height 00058 const double kMinRefitXHeightFraction = 0.5; 00059 00060 00067 namespace tesseract { 00068 void Tesseract::recog_pseudo_word(PAGE_RES* page_res, 00069 TBOX &selection_box) { 00070 PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box); 00071 if (it != NULL) { 00072 recog_interactive(it); 00073 it->DeleteCurrentWord(); 00074 delete it; 00075 } 00076 } 00077 00078 00084 BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) { 00085 inT16 char_qual; 00086 inT16 good_char_qual; 00087 00088 WordData word_data(*pr_it); 00089 SetupWordPassN(2, &word_data); 00090 classify_word_and_language(2, pr_it, &word_data); 00091 if (tessedit_debug_quality_metrics) { 00092 WERD_RES* word_res = pr_it->word(); 00093 word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual); 00094 tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; " 00095 "char_quality: %d; good_char_quality: %d\n", 00096 word_res->reject_map.length(), 00097 word_blob_quality(word_res, pr_it->row()->row), 00098 word_outline_errs(word_res), char_qual, good_char_qual); 00099 } 00100 return TRUE; 00101 } 00102 00103 // Helper function to check for a target word and handle it appropriately. 00104 // Inspired by Jetsoft's requirement to process only single words on pass2 00105 // and beyond. 00106 // If word_config is not null: 00107 // If the word_box and target_word_box overlap, read the word_config file 00108 // else reset to previous config data. 00109 // return true. 00110 // else 00111 // If the word_box and target_word_box overlap or pass <= 1, return true. 00112 // Note that this function uses a fixed temporary file for storing the previous 00113 // configs, so it is neither thread-safe, nor process-safe, but the assumption 00114 // is that it will only be used for one debug window at a time. 00115 // 00116 // Since this function is used for debugging (and not to change OCR results) 00117 // set only debug params from the word config file. 00118 bool Tesseract::ProcessTargetWord(const TBOX& word_box, 00119 const TBOX& target_word_box, 00120 const char* word_config, 00121 int pass) { 00122 if (word_config != NULL) { 00123 if (word_box.major_overlap(target_word_box)) { 00124 if (backup_config_file_ == NULL) { 00125 backup_config_file_ = kBackUpConfigFile; 00126 FILE* config_fp = fopen(backup_config_file_, "wb"); 00127 ParamUtils::PrintParams(config_fp, params()); 00128 fclose(config_fp); 00129 ParamUtils::ReadParamsFile(word_config, 00130 SET_PARAM_CONSTRAINT_DEBUG_ONLY, 00131 params()); 00132 } 00133 } else { 00134 if (backup_config_file_ != NULL) { 00135 ParamUtils::ReadParamsFile(backup_config_file_, 00136 SET_PARAM_CONSTRAINT_DEBUG_ONLY, 00137 params()); 00138 backup_config_file_ = NULL; 00139 } 00140 } 00141 } else if (pass > 1 && !word_box.major_overlap(target_word_box)) { 00142 return false; 00143 } 00144 return true; 00145 } 00146 00148 void Tesseract::SetupAllWordsPassN(int pass_n, 00149 const TBOX* target_word_box, 00150 const char* word_config, 00151 PAGE_RES* page_res, 00152 GenericVector<WordData>* words) { 00153 // Prepare all the words. 00154 PAGE_RES_IT page_res_it(page_res); 00155 for (page_res_it.restart_page(); page_res_it.word() != NULL; 00156 page_res_it.forward()) { 00157 if (target_word_box == NULL || 00158 ProcessTargetWord(page_res_it.word()->word->bounding_box(), 00159 *target_word_box, word_config, 1)) { 00160 words->push_back(WordData(page_res_it)); 00161 } 00162 } 00163 // Setup all the words for recognition with polygonal approximation. 00164 for (int w = 0; w < words->size(); ++w) { 00165 SetupWordPassN(pass_n, &(*words)[w]); 00166 if (w > 0) (*words)[w].prev_word = &(*words)[w - 1]; 00167 } 00168 } 00169 00170 // Sets up the single word ready for whichever engine is to be run. 00171 void Tesseract::SetupWordPassN(int pass_n, WordData* word) { 00172 if (pass_n == 1 || !word->word->done) { 00173 if (pass_n == 1) { 00174 word->word->SetupForRecognition(unicharset, this, BestPix(), 00175 tessedit_ocr_engine_mode, NULL, 00176 classify_bln_numeric_mode, 00177 textord_use_cjk_fp_model, 00178 poly_allow_detailed_fx, 00179 word->row, word->block); 00180 } else if (pass_n == 2) { 00181 // TODO(rays) Should we do this on pass1 too? 00182 word->word->caps_height = 0.0; 00183 if (word->word->x_height == 0.0f) 00184 word->word->x_height = word->row->x_height(); 00185 } 00186 word->lang_words.truncate(0); 00187 for (int s = 0; s <= sub_langs_.size(); ++s) { 00188 // The sub_langs_.size() entry is for the master language. 00189 Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this; 00190 WERD_RES* word_res = new WERD_RES; 00191 word_res->InitForRetryRecognition(*word->word); 00192 word->lang_words.push_back(word_res); 00193 // Cube doesn't get setup for pass2. 00194 if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) { 00195 word_res->SetupForRecognition( 00196 lang_t->unicharset, lang_t, BestPix(), 00197 lang_t->tessedit_ocr_engine_mode, NULL, 00198 lang_t->classify_bln_numeric_mode, 00199 lang_t->textord_use_cjk_fp_model, 00200 lang_t->poly_allow_detailed_fx, word->row, word->block); 00201 } 00202 } 00203 } 00204 } 00205 00206 // Runs word recognition on all the words. 00207 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, 00208 PAGE_RES_IT* pr_it, 00209 GenericVector<WordData>* words) { 00210 // TODO(rays) Before this loop can be parallelized (it would yield a massive 00211 // speed-up) all remaining member globals need to be converted to local/heap 00212 // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be 00213 // added. The results will be significantly different with adaption on, and 00214 // deterioration will need investigation. 00215 pr_it->restart_page(); 00216 for (int w = 0; w < words->size(); ++w) { 00217 WordData* word = &(*words)[w]; 00218 if (w > 0) word->prev_word = &(*words)[w - 1]; 00219 if (monitor != NULL) { 00220 monitor->ocr_alive = TRUE; 00221 if (pass_n == 1) 00222 monitor->progress = 30 + 50 * w / words->size(); 00223 else 00224 monitor->progress = 80 + 10 * w / words->size(); 00225 if (monitor->deadline_exceeded() || 00226 (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, 00227 words->size()))) { 00228 // Timeout. Fake out the rest of the words. 00229 for (; w < words->size(); ++w) { 00230 (*words)[w].word->SetupFake(unicharset); 00231 } 00232 return false; 00233 } 00234 } 00235 if (word->word->tess_failed) { 00236 int s; 00237 for (s = 0; s < word->lang_words.size() && 00238 word->lang_words[s]->tess_failed; ++s) {} 00239 // If all are failed, skip it. Image words are skipped by this test. 00240 if (s > word->lang_words.size()) continue; 00241 } 00242 // Sync pr_it with the wth WordData. 00243 while (pr_it->word() != NULL && pr_it->word() != word->word) 00244 pr_it->forward(); 00245 ASSERT_HOST(pr_it->word() != NULL); 00246 bool make_next_word_fuzzy = false; 00247 if (ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) { 00248 // Needs to be setup again to see the new outlines in the chopped_word. 00249 SetupWordPassN(pass_n, word); 00250 } 00251 00252 classify_word_and_language(pass_n, pr_it, word); 00253 if (tessedit_dump_choices || debug_noise_removal) { 00254 tprintf("Pass%d: %s [%s]\n", pass_n, 00255 word->word->best_choice->unichar_string().string(), 00256 word->word->best_choice->debug_string().string()); 00257 } 00258 pr_it->forward(); 00259 if (make_next_word_fuzzy && pr_it->word() != NULL) { 00260 pr_it->MakeCurrentWordFuzzy(); 00261 } 00262 } 00263 return true; 00264 } 00265 00287 bool Tesseract::recog_all_words(PAGE_RES* page_res, 00288 ETEXT_DESC* monitor, 00289 const TBOX* target_word_box, 00290 const char* word_config, 00291 int dopasses) { 00292 PAGE_RES_IT page_res_it(page_res); 00293 00294 if (tessedit_minimal_rej_pass1) { 00295 tessedit_test_adaption.set_value (TRUE); 00296 tessedit_minimal_rejection.set_value (TRUE); 00297 } 00298 00299 if (dopasses==0 || dopasses==1) { 00300 page_res_it.restart_page(); 00301 // ****************** Pass 1 ******************* 00302 00303 // If the adaptive classifier is full switch to one we prepared earlier, 00304 // ie on the previous page. If the current adaptive classifier is non-empty, 00305 // prepare a backup starting at this page, in case it fills up. Do all this 00306 // independently for each language. 00307 if (AdaptiveClassifierIsFull()) { 00308 SwitchAdaptiveClassifier(); 00309 } else if (!AdaptiveClassifierIsEmpty()) { 00310 StartBackupAdaptiveClassifier(); 00311 } 00312 // Now check the sub-langs as well. 00313 for (int i = 0; i < sub_langs_.size(); ++i) { 00314 if (sub_langs_[i]->AdaptiveClassifierIsFull()) { 00315 sub_langs_[i]->SwitchAdaptiveClassifier(); 00316 } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) { 00317 sub_langs_[i]->StartBackupAdaptiveClassifier(); 00318 } 00319 } 00320 // Set up all words ready for recognition, so that if parallelism is on 00321 // all the input and output classes are ready to run the classifier. 00322 GenericVector<WordData> words; 00323 SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words); 00324 if (tessedit_parallelize) { 00325 PrerecAllWordsPar(words); 00326 } 00327 00328 stats_.word_count = words.size(); 00329 00330 stats_.dict_words = 0; 00331 stats_.doc_blob_quality = 0; 00332 stats_.doc_outline_errs = 0; 00333 stats_.doc_char_quality = 0; 00334 stats_.good_char_count = 0; 00335 stats_.doc_good_char_quality = 0; 00336 00337 most_recently_used_ = this; 00338 // Run pass 1 word recognition. 00339 if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false; 00340 // Pass 1 post-processing. 00341 for (page_res_it.restart_page(); page_res_it.word() != NULL; 00342 page_res_it.forward()) { 00343 if (page_res_it.word()->word->flag(W_REP_CHAR)) { 00344 fix_rep_char(&page_res_it); 00345 continue; 00346 } 00347 00348 // Count dict words. 00349 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) 00350 ++(stats_.dict_words); 00351 00352 // Update misadaption log (we only need to do it on pass 1, since 00353 // adaption only happens on this pass). 00354 if (page_res_it.word()->blamer_bundle != NULL && 00355 page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) { 00356 page_res->misadaption_log.push_back( 00357 page_res_it.word()->blamer_bundle->misadaption_debug()); 00358 } 00359 } 00360 } 00361 00362 if (dopasses == 1) return true; 00363 00364 // ****************** Pass 2 ******************* 00365 if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && 00366 AnyTessLang()) { 00367 page_res_it.restart_page(); 00368 GenericVector<WordData> words; 00369 SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words); 00370 if (tessedit_parallelize) { 00371 PrerecAllWordsPar(words); 00372 } 00373 most_recently_used_ = this; 00374 // Run pass 2 word recognition. 00375 if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false; 00376 } 00377 00378 // The next passes can only be run if tesseract has been used, as cube 00379 // doesn't set all the necessary outputs in WERD_RES. 00380 if (AnyTessLang()) { 00381 // ****************** Pass 3 ******************* 00382 // Fix fuzzy spaces. 00383 set_global_loc_code(LOC_FUZZY_SPACE); 00384 00385 if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces 00386 && !tessedit_word_for_word && !right_to_left()) 00387 fix_fuzzy_spaces(monitor, stats_.word_count, page_res); 00388 00389 // ****************** Pass 4 ******************* 00390 if (tessedit_enable_dict_correction) dictionary_correction_pass(page_res); 00391 if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res); 00392 00393 // ****************** Pass 5,6 ******************* 00394 rejection_passes(page_res, monitor, target_word_box, word_config); 00395 00396 #ifndef NO_CUBE_BUILD 00397 // ****************** Pass 7 ******************* 00398 // Cube combiner. 00399 // If cube is loaded and its combiner is present, run it. 00400 if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { 00401 run_cube_combiner(page_res); 00402 } 00403 #endif 00404 00405 // ****************** Pass 8 ******************* 00406 font_recognition_pass(page_res); 00407 00408 // ****************** Pass 9 ******************* 00409 // Check the correctness of the final results. 00410 blamer_pass(page_res); 00411 script_pos_pass(page_res); 00412 } 00413 00414 // Write results pass. 00415 set_global_loc_code(LOC_WRITE_RESULTS); 00416 // This is now redundant, but retained commented so show how to obtain 00417 // bounding boxes and style information. 00418 00419 // changed by jetsoft 00420 // needed for dll to output memory structure 00421 if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) 00422 output_pass(page_res_it, target_word_box); 00423 // end jetsoft 00424 PageSegMode pageseg_mode = static_cast<PageSegMode>( 00425 static_cast<int>(tessedit_pageseg_mode)); 00426 textord_.CleanupSingleRowResult(pageseg_mode, page_res); 00427 00428 // Remove empty words, as these mess up the result iterators. 00429 for (page_res_it.restart_page(); page_res_it.word() != NULL; 00430 page_res_it.forward()) { 00431 WERD_RES* word = page_res_it.word(); 00432 if (word->best_choice == NULL || word->best_choice->length() == 0) 00433 page_res_it.DeleteCurrentWord(); 00434 } 00435 00436 if (monitor != NULL) { 00437 monitor->progress = 100; 00438 } 00439 return true; 00440 } 00441 00442 void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { 00443 PAGE_RES_IT word_it(page_res); 00444 00445 WERD_RES *w_prev = NULL; 00446 WERD_RES *w = word_it.word(); 00447 while (1) { 00448 w_prev = w; 00449 while (word_it.forward() != NULL && 00450 (!word_it.word() || word_it.word()->part_of_combo)) { 00451 // advance word_it, skipping over parts of combos 00452 } 00453 if (!word_it.word()) break; 00454 w = word_it.word(); 00455 if (!w || !w_prev || w->uch_set != w_prev->uch_set) { 00456 continue; 00457 } 00458 if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) { 00459 if (tessedit_bigram_debug) { 00460 tprintf("Skipping because one of the words is W_REP_CHAR\n"); 00461 } 00462 continue; 00463 } 00464 // Two words sharing the same language model, excellent! 00465 GenericVector<WERD_CHOICE *> overrides_word1; 00466 GenericVector<WERD_CHOICE *> overrides_word2; 00467 00468 STRING orig_w1_str = w_prev->best_choice->unichar_string(); 00469 STRING orig_w2_str = w->best_choice->unichar_string(); 00470 WERD_CHOICE prev_best(w->uch_set); 00471 { 00472 int w1start, w1end; 00473 w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end); 00474 prev_best = w_prev->best_choice->shallow_copy(w1start, w1end); 00475 } 00476 WERD_CHOICE this_best(w->uch_set); 00477 { 00478 int w2start, w2end; 00479 w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end); 00480 this_best = w->best_choice->shallow_copy(w2start, w2end); 00481 } 00482 00483 if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) { 00484 if (tessedit_bigram_debug) { 00485 tprintf("Top choice \"%s %s\" verified by bigram model.\n", 00486 orig_w1_str.string(), orig_w2_str.string()); 00487 } 00488 continue; 00489 } 00490 if (tessedit_bigram_debug > 2) { 00491 tprintf("Examining alt choices for \"%s %s\".\n", 00492 orig_w1_str.string(), orig_w2_str.string()); 00493 } 00494 if (tessedit_bigram_debug > 1) { 00495 if (!w_prev->best_choices.singleton()) { 00496 w_prev->PrintBestChoices(); 00497 } 00498 if (!w->best_choices.singleton()) { 00499 w->PrintBestChoices(); 00500 } 00501 } 00502 float best_rating = 0.0; 00503 int best_idx = 0; 00504 WERD_CHOICE_IT prev_it(&w_prev->best_choices); 00505 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) { 00506 WERD_CHOICE *p1 = prev_it.data(); 00507 WERD_CHOICE strip1(w->uch_set); 00508 { 00509 int p1start, p1end; 00510 p1->GetNonSuperscriptSpan(&p1start, &p1end); 00511 strip1 = p1->shallow_copy(p1start, p1end); 00512 } 00513 WERD_CHOICE_IT w_it(&w->best_choices); 00514 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00515 WERD_CHOICE *p2 = w_it.data(); 00516 WERD_CHOICE strip2(w->uch_set); 00517 { 00518 int p2start, p2end; 00519 p2->GetNonSuperscriptSpan(&p2start, &p2end); 00520 strip2 = p2->shallow_copy(p2start, p2end); 00521 } 00522 if (w->tesseract->getDict().valid_bigram(strip1, strip2)) { 00523 overrides_word1.push_back(p1); 00524 overrides_word2.push_back(p2); 00525 if (overrides_word1.size() == 1 || 00526 p1->rating() + p2->rating() < best_rating) { 00527 best_rating = p1->rating() + p2->rating(); 00528 best_idx = overrides_word1.size() - 1; 00529 } 00530 } 00531 } 00532 } 00533 if (overrides_word1.size() >= 1) { 00534 // Excellent, we have some bigram matches. 00535 if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, 00536 *overrides_word1[best_idx]) && 00537 EqualIgnoringCaseAndTerminalPunct(*w->best_choice, 00538 *overrides_word2[best_idx])) { 00539 if (tessedit_bigram_debug > 1) { 00540 tprintf("Top choice \"%s %s\" verified (sans case) by bigram " 00541 "model.\n", orig_w1_str.string(), orig_w2_str.string()); 00542 } 00543 continue; 00544 } 00545 STRING new_w1_str = overrides_word1[best_idx]->unichar_string(); 00546 STRING new_w2_str = overrides_word2[best_idx]->unichar_string(); 00547 if (new_w1_str != orig_w1_str) { 00548 w_prev->ReplaceBestChoice(overrides_word1[best_idx]); 00549 } 00550 if (new_w2_str != orig_w2_str) { 00551 w->ReplaceBestChoice(overrides_word2[best_idx]); 00552 } 00553 if (tessedit_bigram_debug > 0) { 00554 STRING choices_description; 00555 int num_bigram_choices 00556 = overrides_word1.size() * overrides_word2.size(); 00557 if (num_bigram_choices == 1) { 00558 choices_description = "This was the unique bigram choice."; 00559 } else { 00560 if (tessedit_bigram_debug > 1) { 00561 STRING bigrams_list; 00562 const int kMaxChoicesToPrint = 20; 00563 for (int i = 0; i < overrides_word1.size() && 00564 i < kMaxChoicesToPrint; i++) { 00565 if (i > 0) { bigrams_list += ", "; } 00566 WERD_CHOICE *p1 = overrides_word1[i]; 00567 WERD_CHOICE *p2 = overrides_word2[i]; 00568 bigrams_list += p1->unichar_string() + " " + p2->unichar_string(); 00569 if (i == kMaxChoicesToPrint) { 00570 bigrams_list += " ..."; 00571 } 00572 } 00573 choices_description = "There were many choices: {"; 00574 choices_description += bigrams_list; 00575 choices_description += "}"; 00576 } else { 00577 choices_description.add_str_int("There were ", num_bigram_choices); 00578 choices_description += " compatible bigrams."; 00579 } 00580 } 00581 tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", 00582 orig_w1_str.string(), orig_w2_str.string(), 00583 new_w1_str.string(), new_w2_str.string(), 00584 choices_description.string()); 00585 } 00586 } 00587 } 00588 } 00589 00590 void Tesseract::rejection_passes(PAGE_RES* page_res, 00591 ETEXT_DESC* monitor, 00592 const TBOX* target_word_box, 00593 const char* word_config) { 00594 PAGE_RES_IT page_res_it(page_res); 00595 // ****************** Pass 5 ******************* 00596 // Gather statistics on rejects. 00597 int word_index = 0; 00598 while (!tessedit_test_adaption && page_res_it.word() != NULL) { 00599 set_global_loc_code(LOC_MM_ADAPT); 00600 WERD_RES* word = page_res_it.word(); 00601 word_index++; 00602 if (monitor != NULL) { 00603 monitor->ocr_alive = TRUE; 00604 monitor->progress = 95 + 5 * word_index / stats_.word_count; 00605 } 00606 if (word->rebuild_word == NULL) { 00607 // Word was not processed by tesseract. 00608 page_res_it.forward(); 00609 continue; 00610 } 00611 check_debug_pt(word, 70); 00612 00613 // changed by jetsoft 00614 // specific to its needs to extract one word when need 00615 if (target_word_box && 00616 !ProcessTargetWord(word->word->bounding_box(), 00617 *target_word_box, word_config, 4)) { 00618 page_res_it.forward(); 00619 continue; 00620 } 00621 // end jetsoft 00622 00623 page_res_it.rej_stat_word(); 00624 int chars_in_word = word->reject_map.length(); 00625 int rejects_in_word = word->reject_map.reject_count(); 00626 00627 int blob_quality = word_blob_quality(word, page_res_it.row()->row); 00628 stats_.doc_blob_quality += blob_quality; 00629 int outline_errs = word_outline_errs(word); 00630 stats_.doc_outline_errs += outline_errs; 00631 inT16 all_char_quality; 00632 inT16 accepted_all_char_quality; 00633 word_char_quality(word, page_res_it.row()->row, 00634 &all_char_quality, &accepted_all_char_quality); 00635 stats_.doc_char_quality += all_char_quality; 00636 uinT8 permuter_type = word->best_choice->permuter(); 00637 if ((permuter_type == SYSTEM_DAWG_PERM) || 00638 (permuter_type == FREQ_DAWG_PERM) || 00639 (permuter_type == USER_DAWG_PERM)) { 00640 stats_.good_char_count += chars_in_word - rejects_in_word; 00641 stats_.doc_good_char_quality += accepted_all_char_quality; 00642 } 00643 check_debug_pt(word, 80); 00644 if (tessedit_reject_bad_qual_wds && 00645 (blob_quality == 0) && (outline_errs >= chars_in_word)) 00646 word->reject_map.rej_word_bad_quality(); 00647 check_debug_pt(word, 90); 00648 page_res_it.forward(); 00649 } 00650 00651 if (tessedit_debug_quality_metrics) { 00652 tprintf 00653 ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f" 00654 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n", 00655 page_res->char_count, page_res->rej_count, 00656 page_res->rej_count / static_cast<float>(page_res->char_count), 00657 stats_.doc_blob_quality, 00658 stats_.doc_blob_quality / static_cast<float>(page_res->char_count), 00659 stats_.doc_outline_errs, 00660 stats_.doc_outline_errs / static_cast<float>(page_res->char_count), 00661 stats_.doc_char_quality, 00662 stats_.doc_char_quality / static_cast<float>(page_res->char_count), 00663 stats_.doc_good_char_quality, 00664 (stats_.good_char_count > 0) ? 00665 (stats_.doc_good_char_quality / 00666 static_cast<float>(stats_.good_char_count)) : 0.0); 00667 } 00668 BOOL8 good_quality_doc = 00669 ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= 00670 quality_rej_pc) && 00671 (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= 00672 quality_blob_pc) && 00673 (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= 00674 quality_outline_pc) && 00675 (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= 00676 quality_char_pc); 00677 00678 // ****************** Pass 6 ******************* 00679 // Do whole document or whole block rejection pass 00680 if (!tessedit_test_adaption) { 00681 set_global_loc_code(LOC_DOC_BLK_REJ); 00682 quality_based_rejection(page_res_it, good_quality_doc); 00683 } 00684 } 00685 00686 void Tesseract::blamer_pass(PAGE_RES* page_res) { 00687 if (!wordrec_run_blamer) return; 00688 PAGE_RES_IT page_res_it(page_res); 00689 for (page_res_it.restart_page(); page_res_it.word() != NULL; 00690 page_res_it.forward()) { 00691 WERD_RES *word = page_res_it.word(); 00692 BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word); 00693 page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++; 00694 } 00695 tprintf("Blame reasons:\n"); 00696 for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) { 00697 tprintf("%s %d\n", BlamerBundle::IncorrectReasonName( 00698 static_cast<IncorrectResultReason>(bl)), 00699 page_res->blame_reasons[bl]); 00700 } 00701 if (page_res->misadaption_log.length() > 0) { 00702 tprintf("Misadaption log:\n"); 00703 for (int i = 0; i < page_res->misadaption_log.length(); ++i) { 00704 tprintf("%s\n", page_res->misadaption_log[i].string()); 00705 } 00706 } 00707 } 00708 00709 // Sets script positions and detects smallcaps on all output words. 00710 void Tesseract::script_pos_pass(PAGE_RES* page_res) { 00711 PAGE_RES_IT page_res_it(page_res); 00712 for (page_res_it.restart_page(); page_res_it.word() != NULL; 00713 page_res_it.forward()) { 00714 WERD_RES* word = page_res_it.word(); 00715 if (word->word->flag(W_REP_CHAR)) { 00716 page_res_it.forward(); 00717 continue; 00718 } 00719 float x_height = page_res_it.block()->block->x_height(); 00720 float word_x_height = word->x_height; 00721 if (word_x_height < word->best_choice->min_x_height() || 00722 word_x_height > word->best_choice->max_x_height()) { 00723 word_x_height = (word->best_choice->min_x_height() + 00724 word->best_choice->max_x_height()) / 2.0f; 00725 } 00726 // Test for small caps. Word capheight must be close to block xheight, 00727 // and word must contain no lower case letters, and at least one upper case. 00728 double small_cap_xheight = x_height * kXHeightCapRatio; 00729 double small_cap_delta = (x_height - small_cap_xheight) / 2.0; 00730 if (word->uch_set->script_has_xheight() && 00731 small_cap_xheight - small_cap_delta <= word_x_height && 00732 word_x_height <= small_cap_xheight + small_cap_delta) { 00733 // Scan for upper/lower. 00734 int num_upper = 0; 00735 int num_lower = 0; 00736 for (int i = 0; i < word->best_choice->length(); ++i) { 00737 if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) 00738 ++num_upper; 00739 else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) 00740 ++num_lower; 00741 } 00742 if (num_upper > 0 && num_lower == 0) 00743 word->small_caps = true; 00744 } 00745 word->SetScriptPositions(); 00746 } 00747 } 00748 00749 // Factored helper considers the indexed word and updates all the pointed 00750 // values. 00751 static void EvaluateWord(const PointerVector<WERD_RES>& words, int index, 00752 float* rating, float* certainty, bool* bad, 00753 bool* valid_permuter, int* right, int* next_left) { 00754 *right = -MAX_INT32; 00755 *next_left = MAX_INT32; 00756 if (index < words.size()) { 00757 WERD_CHOICE* choice = words[index]->best_choice; 00758 if (choice == NULL) { 00759 *bad = true; 00760 } else { 00761 *rating += choice->rating(); 00762 *certainty = MIN(*certainty, choice->certainty()); 00763 if (!Dict::valid_word_permuter(choice->permuter(), false)) 00764 *valid_permuter = false; 00765 } 00766 *right = words[index]->word->bounding_box().right(); 00767 if (index + 1 < words.size()) 00768 *next_left = words[index + 1]->word->bounding_box().left(); 00769 } else { 00770 *valid_permuter = false; 00771 *bad = true; 00772 } 00773 } 00774 00775 // Helper chooses the best combination of words, transferring good ones from 00776 // new_words to best_words. To win, a new word must have (better rating and 00777 // certainty) or (better permuter status and rating within rating ratio and 00778 // certainty within certainty margin) than current best. 00779 // All the new_words are consumed (moved to best_words or deleted.) 00780 // The return value is the number of new_words used minus the number of 00781 // best_words that remain in the output. 00782 static int SelectBestWords(double rating_ratio, 00783 double certainty_margin, 00784 bool debug, 00785 PointerVector<WERD_RES>* new_words, 00786 PointerVector<WERD_RES>* best_words) { 00787 // Process the smallest groups of words that have an overlapping word 00788 // boundary at the end. 00789 GenericVector<WERD_RES*> out_words; 00790 // Index into each word vector (best, new). 00791 int b = 0, n = 0; 00792 int num_best = 0, num_new = 0; 00793 while (b < best_words->size() || n < new_words->size()) { 00794 // Start of the current run in each. 00795 int start_b = b, start_n = n; 00796 // Rating of the current run in each. 00797 float b_rating = 0.0f, n_rating = 0.0f; 00798 // Certainty of the current run in each. 00799 float b_certainty = 0.0f, n_certainty = 0.0f; 00800 // True if any word is missing its best choice. 00801 bool b_bad = false, n_bad = false; 00802 // True if all words have a valid permuter. 00803 bool b_valid_permuter = true, n_valid_permuter = true; 00804 00805 while (b < best_words->size() || n < new_words->size()) { 00806 int b_right = -MAX_INT32; 00807 int next_b_left = MAX_INT32; 00808 EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad, 00809 &b_valid_permuter, &b_right, &next_b_left); 00810 int n_right = -MAX_INT32; 00811 int next_n_left = MAX_INT32; 00812 EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad, 00813 &n_valid_permuter, &n_right, &next_n_left); 00814 if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) { 00815 // The word breaks overlap. [start_b,b] and [start_n, n] match. 00816 break; 00817 } 00818 // Keep searching for the matching word break. 00819 if ((b_right < n_right && b < best_words->size()) || 00820 n == new_words->size()) 00821 ++b; 00822 else 00823 ++n; 00824 } 00825 bool new_better = false; 00826 if (!n_bad && (b_bad || (n_certainty > b_certainty && 00827 n_rating < b_rating) || 00828 (!b_valid_permuter && n_valid_permuter && 00829 n_rating < b_rating * rating_ratio && 00830 n_certainty > b_certainty - certainty_margin))) { 00831 // New is better. 00832 for (int i = start_n; i <= n; ++i) { 00833 out_words.push_back((*new_words)[i]); 00834 (*new_words)[i] = NULL; 00835 ++num_new; 00836 } 00837 new_better = true; 00838 } else if (!b_bad) { 00839 // Current best is better. 00840 for (int i = start_b; i <= b; ++i) { 00841 out_words.push_back((*best_words)[i]); 00842 (*best_words)[i] = NULL; 00843 ++num_best; 00844 } 00845 } 00846 int end_b = b < best_words->size() ? b + 1 : b; 00847 int end_n = n < new_words->size() ? n + 1 : n; 00848 if (debug) { 00849 tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g" 00850 " valid dict: %d v %d\n", 00851 end_n - start_n, new_better ? "better" : "worse", 00852 end_b - start_b, n_rating, b_rating, 00853 n_certainty, b_certainty, n_valid_permuter, b_valid_permuter); 00854 } 00855 // Move on to the next group. 00856 b = end_b; 00857 n = end_n; 00858 } 00859 // Transfer from out_words to best_words. 00860 best_words->clear(); 00861 for (int i = 0; i < out_words.size(); ++i) 00862 best_words->push_back(out_words[i]); 00863 return num_new - num_best; 00864 } 00865 00866 // Helper to recognize the word using the given (language-specific) tesseract. 00867 // Returns positive if this recognizer found more new best words than the 00868 // number kept from best_words. 00869 int Tesseract::RetryWithLanguage(const WordData& word_data, 00870 WordRecognizer recognizer, 00871 WERD_RES** in_word, 00872 PointerVector<WERD_RES>* best_words) { 00873 bool debug = classify_debug_level || cube_debug_level; 00874 if (debug) { 00875 tprintf("Trying word using lang %s, oem %d\n", 00876 lang.string(), static_cast<int>(tessedit_ocr_engine_mode)); 00877 } 00878 // Run the recognizer on the word. 00879 PointerVector<WERD_RES> new_words; 00880 (this->*recognizer)(word_data, in_word, &new_words); 00881 if (new_words.empty()) { 00882 // Transfer input word to new_words, as the classifier must have put 00883 // the result back in the input. 00884 new_words.push_back(*in_word); 00885 *in_word = NULL; 00886 } 00887 if (debug) { 00888 for (int i = 0; i < new_words.size(); ++i) 00889 new_words[i]->DebugTopChoice("Lang result"); 00890 } 00891 // Initial version is a bit of a hack based on better certainty and rating 00892 // (to reduce false positives from cube) or a dictionary vs non-dictionary 00893 // word. 00894 return SelectBestWords(classify_max_rating_ratio, 00895 classify_max_certainty_margin, 00896 debug, &new_words, best_words); 00897 } 00898 00899 // Helper returns true if all the words are acceptable. 00900 static bool WordsAcceptable(const PointerVector<WERD_RES>& words) { 00901 for (int w = 0; w < words.size(); ++w) { 00902 if (words[w]->tess_failed || !words[w]->tess_accepted) return false; 00903 } 00904 return true; 00905 } 00906 00907 // Moves good-looking "noise"/diacritics from the reject list to the main 00908 // blob list on the current word. Returns true if anything was done, and 00909 // sets make_next_word_fuzzy if blob(s) were added to the end of the word. 00910 bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it, 00911 bool* make_next_word_fuzzy) { 00912 *make_next_word_fuzzy = false; 00913 WERD* real_word = pr_it->word()->word; 00914 if (real_word->rej_cblob_list()->empty() || 00915 real_word->cblob_list()->empty() || 00916 real_word->rej_cblob_list()->length() > noise_maxperword) 00917 return false; 00918 real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle); 00919 // Get the noise outlines into a vector with matching bool map. 00920 GenericVector<C_OUTLINE*> outlines; 00921 real_word->GetNoiseOutlines(&outlines); 00922 GenericVector<bool> word_wanted; 00923 GenericVector<bool> overlapped_any_blob; 00924 GenericVector<C_BLOB*> target_blobs; 00925 AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, 00926 &word_wanted, &overlapped_any_blob, 00927 &target_blobs); 00928 // Filter the outlines that overlapped any blob and put them into the word 00929 // now. This simplifies the remaining task and also makes it more accurate 00930 // as it has more completed blobs to work on. 00931 GenericVector<bool> wanted; 00932 GenericVector<C_BLOB*> wanted_blobs; 00933 GenericVector<C_OUTLINE*> wanted_outlines; 00934 int num_overlapped = 0; 00935 int num_overlapped_used = 0; 00936 for (int i = 0; i < overlapped_any_blob.size(); ++i) { 00937 if (overlapped_any_blob[i]) { 00938 ++num_overlapped; 00939 if (word_wanted[i]) ++num_overlapped_used; 00940 wanted.push_back(word_wanted[i]); 00941 wanted_blobs.push_back(target_blobs[i]); 00942 wanted_outlines.push_back(outlines[i]); 00943 outlines[i] = NULL; 00944 } 00945 } 00946 real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL); 00947 AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, 00948 &target_blobs); 00949 int non_overlapped = 0; 00950 int non_overlapped_used = 0; 00951 for (int i = 0; i < word_wanted.size(); ++i) { 00952 if (word_wanted[i]) ++non_overlapped_used; 00953 if (outlines[i] != NULL) ++non_overlapped_used; 00954 } 00955 if (debug_noise_removal) { 00956 tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:", 00957 num_overlapped_used, num_overlapped, non_overlapped_used, 00958 non_overlapped); 00959 real_word->bounding_box().print(); 00960 } 00961 // Now we have decided which outlines we want, put them into the real_word. 00962 if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, 00963 make_next_word_fuzzy)) { 00964 pr_it->MakeCurrentWordFuzzy(); 00965 } 00966 // TODO(rays) Parts of combos have a deep copy of the real word, and need 00967 // to have their noise outlines moved/assigned in the same way!! 00968 return num_overlapped_used != 0 || non_overlapped_used != 0; 00969 } 00970 00971 // Attempts to put noise/diacritic outlines into the blobs that they overlap. 00972 // Input: a set of noisy outlines that probably belong to the real_word. 00973 // Output: word_wanted indicates which outlines are to be assigned to a blob, 00974 // target_blobs indicates which to assign to, and overlapped_any_blob is 00975 // true for all outlines that overlapped a blob. 00976 void Tesseract::AssignDiacriticsToOverlappingBlobs( 00977 const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word, 00978 PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted, 00979 GenericVector<bool>* overlapped_any_blob, 00980 GenericVector<C_BLOB*>* target_blobs) { 00981 GenericVector<bool> blob_wanted; 00982 word_wanted->init_to_size(outlines.size(), false); 00983 overlapped_any_blob->init_to_size(outlines.size(), false); 00984 target_blobs->init_to_size(outlines.size(), NULL); 00985 // For each real blob, find the outlines that seriously overlap it. 00986 // A single blob could be several merged characters, so there can be quite 00987 // a few outlines overlapping, and the full engine needs to be used to chop 00988 // and join to get a sensible result. 00989 C_BLOB_IT blob_it(real_word->cblob_list()); 00990 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00991 C_BLOB* blob = blob_it.data(); 00992 TBOX blob_box = blob->bounding_box(); 00993 blob_wanted.init_to_size(outlines.size(), false); 00994 int num_blob_outlines = 0; 00995 for (int i = 0; i < outlines.size(); ++i) { 00996 if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && 00997 !(*word_wanted)[i]) { 00998 blob_wanted[i] = true; 00999 (*overlapped_any_blob)[i] = true; 01000 ++num_blob_outlines; 01001 } 01002 } 01003 if (debug_noise_removal) { 01004 tprintf("%d noise outlines overlap blob at:", num_blob_outlines); 01005 blob_box.print(); 01006 } 01007 // If any outlines overlap the blob, and not too many, classify the blob 01008 // (using the full engine, languages and all), and choose the maximal 01009 // combination of outlines that doesn't hurt the end-result classification 01010 // by too much. Mark them as wanted. 01011 if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) { 01012 if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, 01013 outlines, num_blob_outlines, 01014 &blob_wanted)) { 01015 for (int i = 0; i < blob_wanted.size(); ++i) { 01016 if (blob_wanted[i]) { 01017 // Claim the outline and record where it is going. 01018 (*word_wanted)[i] = true; 01019 (*target_blobs)[i] = blob; 01020 } 01021 } 01022 } 01023 } 01024 } 01025 } 01026 01027 // Attempts to assign non-overlapping outlines to their nearest blobs or 01028 // make new blobs out of them. 01029 void Tesseract::AssignDiacriticsToNewBlobs( 01030 const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word, 01031 PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted, 01032 GenericVector<C_BLOB*>* target_blobs) { 01033 GenericVector<bool> blob_wanted; 01034 word_wanted->init_to_size(outlines.size(), false); 01035 target_blobs->init_to_size(outlines.size(), NULL); 01036 // Check for outlines that need to be turned into stand-alone blobs. 01037 for (int i = 0; i < outlines.size(); ++i) { 01038 if (outlines[i] == NULL) continue; 01039 // Get a set of adjacent outlines that don't overlap any existing blob. 01040 blob_wanted.init_to_size(outlines.size(), false); 01041 int num_blob_outlines = 0; 01042 TBOX total_ol_box(outlines[i]->bounding_box()); 01043 while (i < outlines.size() && outlines[i] != NULL) { 01044 blob_wanted[i] = true; 01045 total_ol_box += outlines[i]->bounding_box(); 01046 ++i; 01047 ++num_blob_outlines; 01048 } 01049 // Find the insertion point. 01050 C_BLOB_IT blob_it(real_word->cblob_list()); 01051 while (!blob_it.at_last() && 01052 blob_it.data_relative(1)->bounding_box().left() <= 01053 total_ol_box.left()) { 01054 blob_it.forward(); 01055 } 01056 // Choose which combination of them we actually want and where to put 01057 // them. 01058 if (debug_noise_removal) 01059 tprintf("Num blobless outlines = %d\n", num_blob_outlines); 01060 C_BLOB* left_blob = blob_it.data(); 01061 TBOX left_box = left_blob->bounding_box(); 01062 C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1); 01063 if ((left_box.x_overlap(total_ol_box) || right_blob == NULL || 01064 !right_blob->bounding_box().x_overlap(total_ol_box)) && 01065 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, 01066 outlines, num_blob_outlines, 01067 &blob_wanted)) { 01068 if (debug_noise_removal) tprintf("Added to left blob\n"); 01069 for (int j = 0; j < blob_wanted.size(); ++j) { 01070 if (blob_wanted[j]) { 01071 (*word_wanted)[j] = true; 01072 (*target_blobs)[j] = left_blob; 01073 } 01074 } 01075 } else if (right_blob != NULL && 01076 (!left_box.x_overlap(total_ol_box) || 01077 right_blob->bounding_box().x_overlap(total_ol_box)) && 01078 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, 01079 right_blob, outlines, 01080 num_blob_outlines, &blob_wanted)) { 01081 if (debug_noise_removal) tprintf("Added to right blob\n"); 01082 for (int j = 0; j < blob_wanted.size(); ++j) { 01083 if (blob_wanted[j]) { 01084 (*word_wanted)[j] = true; 01085 (*target_blobs)[j] = right_blob; 01086 } 01087 } 01088 } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL, 01089 outlines, num_blob_outlines, 01090 &blob_wanted)) { 01091 if (debug_noise_removal) tprintf("Fitted between blobs\n"); 01092 for (int j = 0; j < blob_wanted.size(); ++j) { 01093 if (blob_wanted[j]) { 01094 (*word_wanted)[j] = true; 01095 (*target_blobs)[j] = NULL; 01096 } 01097 } 01098 } 01099 } 01100 } 01101 01102 // Starting with ok_outlines set to indicate which outlines overlap the blob, 01103 // chooses the optimal set (approximately) and returns true if any outlines 01104 // are desired, in which case ok_outlines indicates which ones. 01105 bool Tesseract::SelectGoodDiacriticOutlines( 01106 int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob, 01107 const GenericVector<C_OUTLINE*>& outlines, int num_outlines, 01108 GenericVector<bool>* ok_outlines) { 01109 STRING best_str; 01110 float target_cert = certainty_threshold; 01111 if (blob != NULL) { 01112 float target_c2; 01113 target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2); 01114 if (debug_noise_removal) { 01115 tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(), 01116 target_cert, target_c2); 01117 blob->bounding_box().print(); 01118 } 01119 target_cert -= (target_cert - certainty_threshold) * noise_cert_factor; 01120 } 01121 GenericVector<bool> test_outlines = *ok_outlines; 01122 // Start with all the outlines in. 01123 STRING all_str; 01124 GenericVector<bool> best_outlines = *ok_outlines; 01125 float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, 01126 pr_it, blob, &all_str); 01127 if (debug_noise_removal) { 01128 TBOX ol_box; 01129 for (int i = 0; i < test_outlines.size(); ++i) { 01130 if (test_outlines[i]) ol_box += outlines[i]->bounding_box(); 01131 } 01132 tprintf("All Noise blob classified as %s=%g, delta=%g at:", 01133 all_str.string(), best_cert, best_cert - target_cert); 01134 ol_box.print(); 01135 } 01136 // Iteratively zero out the bit that improves the certainty the most, until 01137 // we get past the threshold, have zero bits, or fail to improve. 01138 int best_index = 0; // To zero out. 01139 while (num_outlines > 1 && best_index >= 0 && 01140 (blob == NULL || best_cert < target_cert || blob != NULL)) { 01141 // Find the best bit to zero out. 01142 best_index = -1; 01143 for (int i = 0; i < outlines.size(); ++i) { 01144 if (test_outlines[i]) { 01145 test_outlines[i] = false; 01146 STRING str; 01147 float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, 01148 pr_it, blob, &str); 01149 if (debug_noise_removal) { 01150 TBOX ol_box; 01151 for (int j = 0; j < outlines.size(); ++j) { 01152 if (test_outlines[j]) ol_box += outlines[j]->bounding_box(); 01153 tprintf("%d", test_outlines[j]); 01154 } 01155 tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(), 01156 cert, cert - target_cert); 01157 ol_box.print(); 01158 } 01159 if (cert > best_cert) { 01160 best_cert = cert; 01161 best_index = i; 01162 best_outlines = test_outlines; 01163 } 01164 test_outlines[i] = true; 01165 } 01166 } 01167 if (best_index >= 0) { 01168 test_outlines[best_index] = false; 01169 --num_outlines; 01170 } 01171 } 01172 if (best_cert >= target_cert) { 01173 // Save the best combination. 01174 *ok_outlines = best_outlines; 01175 if (debug_noise_removal) { 01176 tprintf("%s noise combination ", blob ? "Adding" : "New"); 01177 for (int i = 0; i < best_outlines.size(); ++i) { 01178 tprintf("%d", best_outlines[i]); 01179 } 01180 tprintf(" yields certainty %g, beating target of %g\n", best_cert, 01181 target_cert); 01182 } 01183 return true; 01184 } 01185 return false; 01186 } 01187 01188 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes 01189 // the inclusion of the outlines, and returns the certainty of the raw choice. 01190 float Tesseract::ClassifyBlobPlusOutlines( 01191 const GenericVector<bool>& ok_outlines, 01192 const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it, 01193 C_BLOB* blob, STRING* best_str) { 01194 C_OUTLINE_IT ol_it; 01195 C_OUTLINE* first_to_keep = NULL; 01196 if (blob != NULL) { 01197 // Add the required outlines to the blob. 01198 ol_it.set_to_list(blob->out_list()); 01199 first_to_keep = ol_it.data(); 01200 } 01201 for (int i = 0; i < ok_outlines.size(); ++i) { 01202 if (ok_outlines[i]) { 01203 // This outline is to be added. 01204 if (blob == NULL) { 01205 blob = new C_BLOB(outlines[i]); 01206 ol_it.set_to_list(blob->out_list()); 01207 } else { 01208 ol_it.add_before_stay_put(outlines[i]); 01209 } 01210 } 01211 } 01212 float c2; 01213 float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2); 01214 ol_it.move_to_first(); 01215 if (first_to_keep == NULL) { 01216 // We created blob. Empty its outlines and delete it. 01217 for (; !ol_it.empty(); ol_it.forward()) ol_it.extract(); 01218 delete blob; 01219 cert = -c2; 01220 } else { 01221 // Remove the outlines that we put in. 01222 for (; ol_it.data() != first_to_keep; ol_it.forward()) { 01223 ol_it.extract(); 01224 } 01225 } 01226 return cert; 01227 } 01228 01229 // Classifies the given blob (part of word_data->word->word) as an individual 01230 // word, using languages, chopper etc, returning only the certainty of the 01231 // best raw choice, and undoing all the work done to fake out the word. 01232 float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, 01233 C_BLOB* blob, STRING* best_str, float* c2) { 01234 WERD* real_word = pr_it->word()->word; 01235 WERD* word = real_word->ConstructFromSingleBlob( 01236 real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob)); 01237 WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word); 01238 // Get a new iterator that points to the new word. 01239 PAGE_RES_IT it(pr_it->page_res); 01240 while (it.word() != word_res && it.word() != NULL) it.forward(); 01241 ASSERT_HOST(it.word() == word_res); 01242 WordData wd(it); 01243 // Force full initialization. 01244 SetupWordPassN(1, &wd); 01245 classify_word_and_language(pass_n, &it, &wd); 01246 if (debug_noise_removal) { 01247 tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, 01248 wd.row->x_height(), wd.word->raw_choice->min_x_height(), 01249 wd.word->raw_choice->max_x_height()); 01250 } 01251 float cert = wd.word->raw_choice->certainty(); 01252 float rat = wd.word->raw_choice->rating(); 01253 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f; 01254 *best_str = wd.word->raw_choice->unichar_string(); 01255 it.DeleteCurrentWord(); 01256 pr_it->ResetWordIterator(); 01257 return cert; 01258 } 01259 01260 // Generic function for classifying a word. Can be used either for pass1 or 01261 // pass2 according to the function passed to recognizer. 01262 // word_data holds the word to be recognized, and its block and row, and 01263 // pr_it points to the word as well, in case we are running LSTM and it wants 01264 // to output multiple words. 01265 // Recognizes in the current language, and if successful that is all. 01266 // If recognition was not successful, tries all available languages until 01267 // it gets a successful result or runs out of languages. Keeps the best result. 01268 void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, 01269 WordData* word_data) { 01270 WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1 01271 : &Tesseract::classify_word_pass2; 01272 // Best result so far. 01273 PointerVector<WERD_RES> best_words; 01274 // Points to the best result. May be word or in lang_words. 01275 WERD_RES* word = word_data->word; 01276 clock_t start_t = clock(); 01277 if (classify_debug_level || cube_debug_level) { 01278 tprintf("%s word with lang %s at:", 01279 word->done ? "Already done" : "Processing", 01280 most_recently_used_->lang.string()); 01281 word->word->bounding_box().print(); 01282 } 01283 if (word->done) { 01284 // If done on pass1, leave it as-is. 01285 if (!word->tess_failed) 01286 most_recently_used_ = word->tesseract; 01287 return; 01288 } 01289 int sub = sub_langs_.size(); 01290 if (most_recently_used_ != this) { 01291 // Get the index of the most_recently_used_. 01292 for (sub = 0; sub < sub_langs_.size() && 01293 most_recently_used_ != sub_langs_[sub]; ++sub) {} 01294 } 01295 most_recently_used_->RetryWithLanguage( 01296 *word_data, recognizer, &word_data->lang_words[sub], &best_words); 01297 Tesseract* best_lang_tess = most_recently_used_; 01298 if (!WordsAcceptable(best_words)) { 01299 // Try all the other languages to see if they are any better. 01300 if (most_recently_used_ != this && 01301 this->RetryWithLanguage(*word_data, recognizer, 01302 &word_data->lang_words[sub_langs_.size()], 01303 &best_words) > 0) { 01304 best_lang_tess = this; 01305 } 01306 for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); 01307 ++i) { 01308 if (most_recently_used_ != sub_langs_[i] && 01309 sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, 01310 &word_data->lang_words[i], 01311 &best_words) > 0) { 01312 best_lang_tess = sub_langs_[i]; 01313 } 01314 } 01315 } 01316 most_recently_used_ = best_lang_tess; 01317 if (!best_words.empty()) { 01318 if (best_words.size() == 1 && !best_words[0]->combination) { 01319 // Move the best single result to the main word. 01320 word_data->word->ConsumeWordResults(best_words[0]); 01321 } else { 01322 // Words came from LSTM, and must be moved to the PAGE_RES properly. 01323 word_data->word = best_words.back(); 01324 pr_it->ReplaceCurrentWord(&best_words); 01325 } 01326 ASSERT_HOST(word_data->word->box_word != NULL); 01327 } else { 01328 tprintf("no best words!!\n"); 01329 } 01330 clock_t ocr_t = clock(); 01331 if (tessedit_timing_debug) { 01332 tprintf("%s (ocr took %.2f sec)\n", 01333 word->best_choice->unichar_string().string(), 01334 static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC); 01335 } 01336 } 01337 01344 void Tesseract::classify_word_pass1(const WordData& word_data, 01345 WERD_RES** in_word, 01346 PointerVector<WERD_RES>* out_words) { 01347 ROW* row = word_data.row; 01348 BLOCK* block = word_data.block; 01349 prev_word_best_choice_ = word_data.prev_word != NULL 01350 ? word_data.prev_word->word->best_choice : NULL; 01351 #ifndef NO_CUBE_BUILD 01352 // If we only intend to run cube - run it and return. 01353 if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { 01354 cube_word_pass1(block, row, *in_word); 01355 return; 01356 } 01357 #endif 01358 WERD_RES* word = *in_word; 01359 match_word_pass_n(1, word, row, block); 01360 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { 01361 word->tess_would_adapt = AdaptableWord(word); 01362 bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode); 01363 01364 if (adapt_ok) { 01365 // Send word to adaptive classifier for training. 01366 word->BestChoiceToCorrectText(); 01367 LearnWord(NULL, word); 01368 // Mark misadaptions if running blamer. 01369 if (word->blamer_bundle != NULL) { 01370 word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, 01371 wordrec_debug_blamer); 01372 } 01373 } 01374 01375 if (tessedit_enable_doc_dict && !word->IsAmbiguous()) 01376 tess_add_doc_word(word->best_choice); 01377 } 01378 } 01379 01380 // Helper to report the result of the xheight fix. 01381 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, 01382 WERD_RES* word, WERD_RES* new_word) { 01383 tprintf("New XHT Match:%s = %s ", 01384 word->best_choice->unichar_string().string(), 01385 word->best_choice->debug_string().string()); 01386 word->reject_map.print(debug_fp); 01387 tprintf(" -> %s = %s ", 01388 new_word->best_choice->unichar_string().string(), 01389 new_word->best_choice->debug_string().string()); 01390 new_word->reject_map.print(debug_fp); 01391 tprintf(" %s->%s %s %s\n", 01392 word->guessed_x_ht ? "GUESS" : "CERT", 01393 new_word->guessed_x_ht ? "GUESS" : "CERT", 01394 new_x_ht > 0.1 ? "STILL DOUBT" : "OK", 01395 accept_new_word ? "ACCEPTED" : ""); 01396 } 01397 01398 // Run the x-height fix-up, based on min/max top/bottom information in 01399 // unicharset. 01400 // Returns true if the word was changed. 01401 // See the comment in fixxht.cpp for a description of the overall process. 01402 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) { 01403 int original_misfits = CountMisfitTops(word); 01404 if (original_misfits == 0) 01405 return false; 01406 float baseline_shift = 0.0f; 01407 float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift); 01408 if (baseline_shift != 0.0f) { 01409 // Try the shift on its own first. 01410 if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, 01411 word, block, row)) 01412 return false; 01413 original_misfits = CountMisfitTops(word); 01414 if (original_misfits > 0) { 01415 float new_baseline_shift; 01416 // Now recompute the new x_height. 01417 new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift); 01418 if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { 01419 // No test of return value here, as we are definitely making a change 01420 // to the word by shifting the baseline. 01421 TestNewNormalization(original_misfits, baseline_shift, new_x_ht, 01422 word, block, row); 01423 } 01424 } 01425 return true; 01426 } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { 01427 return TestNewNormalization(original_misfits, 0.0f, new_x_ht, 01428 word, block, row); 01429 } else { 01430 return false; 01431 } 01432 } 01433 01434 // Runs recognition with the test baseline shift and x-height and returns true 01435 // if there was an improvement in recognition result. 01436 bool Tesseract::TestNewNormalization(int original_misfits, 01437 float baseline_shift, float new_x_ht, 01438 WERD_RES *word, BLOCK* block, ROW *row) { 01439 bool accept_new_x_ht = false; 01440 WERD_RES new_x_ht_word(word->word); 01441 if (word->blamer_bundle != NULL) { 01442 new_x_ht_word.blamer_bundle = new BlamerBundle(); 01443 new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle)); 01444 } 01445 new_x_ht_word.x_height = new_x_ht; 01446 new_x_ht_word.baseline_shift = baseline_shift; 01447 new_x_ht_word.caps_height = 0.0; 01448 new_x_ht_word.SetupForRecognition( 01449 unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL, 01450 classify_bln_numeric_mode, textord_use_cjk_fp_model, 01451 poly_allow_detailed_fx, row, block); 01452 match_word_pass_n(2, &new_x_ht_word, row, block); 01453 if (!new_x_ht_word.tess_failed) { 01454 int new_misfits = CountMisfitTops(&new_x_ht_word); 01455 if (debug_x_ht_level >= 1) { 01456 tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", 01457 original_misfits, word->x_height, 01458 new_misfits, new_x_ht); 01459 tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", 01460 word->best_choice->rating(), word->best_choice->certainty(), 01461 new_x_ht_word.best_choice->rating(), 01462 new_x_ht_word.best_choice->certainty()); 01463 } 01464 // The misfits must improve and either the rating or certainty. 01465 accept_new_x_ht = new_misfits < original_misfits && 01466 (new_x_ht_word.best_choice->certainty() > 01467 word->best_choice->certainty() || 01468 new_x_ht_word.best_choice->rating() < 01469 word->best_choice->rating()); 01470 if (debug_x_ht_level >= 1) { 01471 ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word); 01472 } 01473 } 01474 if (accept_new_x_ht) { 01475 word->ConsumeWordResults(&new_x_ht_word); 01476 return true; 01477 } 01478 return false; 01479 } 01480 01487 void Tesseract::classify_word_pass2(const WordData& word_data, 01488 WERD_RES** in_word, 01489 PointerVector<WERD_RES>* out_words) { 01490 // Return if we do not want to run Tesseract. 01491 if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY && 01492 tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED && 01493 word_data.word->best_choice != NULL) 01494 return; 01495 if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { 01496 return; 01497 } 01498 ROW* row = word_data.row; 01499 BLOCK* block = word_data.block; 01500 WERD_RES* word = *in_word; 01501 prev_word_best_choice_ = word_data.prev_word != NULL 01502 ? word_data.prev_word->word->best_choice : NULL; 01503 01504 set_global_subloc_code(SUBLOC_NORM); 01505 check_debug_pt(word, 30); 01506 if (!word->done) { 01507 word->caps_height = 0.0; 01508 if (word->x_height == 0.0f) 01509 word->x_height = row->x_height(); 01510 match_word_pass_n(2, word, row, block); 01511 check_debug_pt(word, 40); 01512 } 01513 01514 SubAndSuperscriptFix(word); 01515 01516 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { 01517 if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() && 01518 block->classify_rotation().y() == 0.0f) { 01519 // Use the tops and bottoms since they are available. 01520 TrainedXheightFix(word, block, row); 01521 } 01522 01523 set_global_subloc_code(SUBLOC_NORM); 01524 } 01525 #ifndef GRAPHICS_DISABLED 01526 if (tessedit_display_outwords) { 01527 if (fx_win == NULL) 01528 create_fx_win(); 01529 clear_fx_win(); 01530 word->rebuild_word->plot(fx_win); 01531 TBOX wbox = word->rebuild_word->bounding_box(); 01532 fx_win->ZoomToRectangle(wbox.left(), wbox.top(), 01533 wbox.right(), wbox.bottom()); 01534 ScrollView::Update(); 01535 } 01536 #endif 01537 set_global_subloc_code(SUBLOC_NORM); 01538 check_debug_pt(word, 50); 01539 } 01540 01541 01548 void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, 01549 ROW *row, BLOCK* block) { 01550 if (word->tess_failed) return; 01551 tess_segment_pass_n(pass_n, word); 01552 01553 if (!word->tess_failed) { 01554 if (!word->word->flag (W_REP_CHAR)) { 01555 word->fix_quotes(); 01556 if (tessedit_fix_hyphens) 01557 word->fix_hyphens(); 01558 /* Don't trust fix_quotes! - though I think I've fixed the bug */ 01559 if (word->best_choice->length() != word->box_word->length()) { 01560 tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;" 01561 " #Blobs=%d\n", 01562 word->best_choice->debug_string().string(), 01563 word->best_choice->length(), 01564 word->box_word->length()); 01565 01566 } 01567 word->tess_accepted = tess_acceptable_word(word); 01568 01569 // Also sets word->done flag 01570 make_reject_map(word, row, pass_n); 01571 } 01572 } 01573 set_word_fonts(word); 01574 01575 ASSERT_HOST(word->raw_choice != NULL); 01576 } 01577 01578 // Helper to return the best rated BLOB_CHOICE in the whole word that matches 01579 // the given char_id, or NULL if none can be found. 01580 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id, 01581 WERD_RES* word_res) { 01582 // Find the corresponding best BLOB_CHOICE from any position in the word_res. 01583 BLOB_CHOICE* best_choice = NULL; 01584 for (int i = 0; i < word_res->best_choice->length(); ++i) { 01585 BLOB_CHOICE* choice = FindMatchingChoice(char_id, 01586 word_res->GetBlobChoices(i)); 01587 if (choice != NULL) { 01588 if (best_choice == NULL || choice->rating() < best_choice->rating()) 01589 best_choice = choice; 01590 } 01591 } 01592 return best_choice; 01593 } 01594 01595 // Helper to insert blob_choice in each location in the leader word if there is 01596 // no matching BLOB_CHOICE there already, and correct any incorrect results 01597 // in the best_choice. 01598 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice, 01599 WERD_RES* word_res) { 01600 WERD_CHOICE* word = word_res->best_choice; 01601 for (int i = 0; i < word_res->best_choice->length(); ++i) { 01602 BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(), 01603 word_res->GetBlobChoices(i)); 01604 if (choice == NULL) { 01605 BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i)); 01606 choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice)); 01607 } 01608 } 01609 // Correct any incorrect results in word. 01610 for (int i = 0; i < word->length(); ++i) { 01611 if (word->unichar_id(i) != blob_choice->unichar_id()) 01612 word->set_unichar_id(blob_choice->unichar_id(), i); 01613 } 01614 } 01615 01623 void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) { 01624 WERD_RES *word_res = page_res_it->word(); 01625 const WERD_CHOICE &word = *(word_res->best_choice); 01626 01627 // Find the frequency of each unique character in the word. 01628 SortHelper<UNICHAR_ID> rep_ch(word.length()); 01629 for (int i = 0; i < word.length(); ++i) { 01630 rep_ch.Add(word.unichar_id(i), 1); 01631 } 01632 01633 // Find the most frequent result. 01634 UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char 01635 int max_count = rep_ch.MaxCount(&maxch_id); 01636 // Find the best exemplar of a classifier result for maxch_id. 01637 BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res); 01638 if (best_choice == NULL) { 01639 tprintf("Failed to find a choice for %s, occurring %d times\n", 01640 word_res->uch_set->debug_str(maxch_id).string(), max_count); 01641 return; 01642 } 01643 word_res->done = TRUE; 01644 01645 // Measure the mean space. 01646 int gap_count = 0; 01647 WERD* werd = word_res->word; 01648 C_BLOB_IT blob_it(werd->cblob_list()); 01649 C_BLOB* prev_blob = blob_it.data(); 01650 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) { 01651 C_BLOB* blob = blob_it.data(); 01652 int gap = blob->bounding_box().left(); 01653 gap -= prev_blob->bounding_box().right(); 01654 ++gap_count; 01655 prev_blob = blob; 01656 } 01657 // Just correct existing classification. 01658 CorrectRepcharChoices(best_choice, word_res); 01659 word_res->reject_map.initialise(word.length()); 01660 } 01661 01662 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string( 01663 const UNICHARSET& char_set, const char *s, const char *lengths) { 01664 int i = 0; 01665 int offset = 0; 01666 int leading_punct_count; 01667 int upper_count = 0; 01668 int hyphen_pos = -1; 01669 ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE; 01670 01671 if (strlen (lengths) > 20) 01672 return word_type; 01673 01674 /* Single Leading punctuation char*/ 01675 01676 if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset])) 01677 offset += lengths[i++]; 01678 leading_punct_count = i; 01679 01680 /* Initial cap */ 01681 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) { 01682 offset += lengths[i++]; 01683 upper_count++; 01684 } 01685 if (upper_count > 1) { 01686 word_type = AC_UPPER_CASE; 01687 } else { 01688 /* Lower case word, possibly with an initial cap */ 01689 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) { 01690 offset += lengths[i++]; 01691 } 01692 if (i - leading_punct_count < quality_min_initial_alphas_reqd) 01693 goto not_a_word; 01694 /* 01695 Allow a single hyphen in a lower case word 01696 - don't trust upper case - I've seen several cases of "H" -> "I-I" 01697 */ 01698 if (lengths[i] == 1 && s[offset] == '-') { 01699 hyphen_pos = i; 01700 offset += lengths[i++]; 01701 if (s[offset] != '\0') { 01702 while ((s[offset] != '\0') && 01703 char_set.get_islower(s + offset, lengths[i])) { 01704 offset += lengths[i++]; 01705 } 01706 if (i < hyphen_pos + 3) 01707 goto not_a_word; 01708 } 01709 } else { 01710 /* Allow "'s" in NON hyphenated lower case words */ 01711 if (lengths[i] == 1 && (s[offset] == '\'') && 01712 lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) { 01713 offset += lengths[i++]; 01714 offset += lengths[i++]; 01715 } 01716 } 01717 if (upper_count > 0) 01718 word_type = AC_INITIAL_CAP; 01719 else 01720 word_type = AC_LOWER_CASE; 01721 } 01722 01723 /* Up to two different, constrained trailing punctuation chars */ 01724 if (lengths[i] == 1 && s[offset] != '\0' && 01725 STRING(chs_trailing_punct1).contains(s[offset])) 01726 offset += lengths[i++]; 01727 if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && 01728 s[offset - lengths[i - 1]] != s[offset] && 01729 STRING(chs_trailing_punct2).contains (s[offset])) 01730 offset += lengths[i++]; 01731 01732 if (s[offset] != '\0') 01733 word_type = AC_UNACCEPTABLE; 01734 01735 not_a_word: 01736 01737 if (word_type == AC_UNACCEPTABLE) { 01738 /* Look for abbreviation string */ 01739 i = 0; 01740 offset = 0; 01741 if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) { 01742 word_type = AC_UC_ABBREV; 01743 while (s[offset] != '\0' && 01744 char_set.get_isupper(s + offset, lengths[i]) && 01745 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { 01746 offset += lengths[i++]; 01747 offset += lengths[i++]; 01748 } 01749 } 01750 else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) { 01751 word_type = AC_LC_ABBREV; 01752 while (s[offset] != '\0' && 01753 char_set.get_islower(s + offset, lengths[i]) && 01754 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { 01755 offset += lengths[i++]; 01756 offset += lengths[i++]; 01757 } 01758 } 01759 if (s[offset] != '\0') 01760 word_type = AC_UNACCEPTABLE; 01761 } 01762 01763 return word_type; 01764 } 01765 01766 BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) { 01767 BOOL8 show_map_detail = FALSE; 01768 inT16 i; 01769 01770 if (!test_pt) 01771 return FALSE; 01772 01773 tessedit_rejection_debug.set_value (FALSE); 01774 debug_x_ht_level.set_value(0); 01775 01776 if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) { 01777 if (location < 0) 01778 return TRUE; // For breakpoint use 01779 tessedit_rejection_debug.set_value (TRUE); 01780 debug_x_ht_level.set_value(2); 01781 tprintf ("\n\nTESTWD::"); 01782 switch (location) { 01783 case 0: 01784 tprintf ("classify_word_pass1 start\n"); 01785 word->word->print(); 01786 break; 01787 case 10: 01788 tprintf ("make_reject_map: initial map"); 01789 break; 01790 case 20: 01791 tprintf ("make_reject_map: after NN"); 01792 break; 01793 case 30: 01794 tprintf ("classify_word_pass2 - START"); 01795 break; 01796 case 40: 01797 tprintf ("classify_word_pass2 - Pre Xht"); 01798 break; 01799 case 50: 01800 tprintf ("classify_word_pass2 - END"); 01801 show_map_detail = TRUE; 01802 break; 01803 case 60: 01804 tprintf ("fixspace"); 01805 break; 01806 case 70: 01807 tprintf ("MM pass START"); 01808 break; 01809 case 80: 01810 tprintf ("MM pass END"); 01811 break; 01812 case 90: 01813 tprintf ("After Poor quality rejection"); 01814 break; 01815 case 100: 01816 tprintf ("unrej_good_quality_words - START"); 01817 break; 01818 case 110: 01819 tprintf ("unrej_good_quality_words - END"); 01820 break; 01821 case 120: 01822 tprintf ("Write results pass"); 01823 show_map_detail = TRUE; 01824 break; 01825 } 01826 if (word->best_choice != NULL) { 01827 tprintf(" \"%s\" ", word->best_choice->unichar_string().string()); 01828 word->reject_map.print(debug_fp); 01829 tprintf("\n"); 01830 if (show_map_detail) { 01831 tprintf("\"%s\"\n", word->best_choice->unichar_string().string()); 01832 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { 01833 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); 01834 word->reject_map[i].full_print(debug_fp); 01835 } 01836 } 01837 } else { 01838 tprintf("null best choice\n"); 01839 } 01840 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); 01841 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); 01842 return TRUE; 01843 } else { 01844 return FALSE; 01845 } 01846 } 01847 01853 static void find_modal_font( //good chars in word 01854 STATS *fonts, //font stats 01855 inT16 *font_out, //output font 01856 inT8 *font_count //output count 01857 ) { 01858 inT16 font; //font index 01859 inT32 count; //pile couat 01860 01861 if (fonts->get_total () > 0) { 01862 font = (inT16) fonts->mode (); 01863 *font_out = font; 01864 count = fonts->pile_count (font); 01865 *font_count = count < MAX_INT8 ? count : MAX_INT8; 01866 fonts->add (font, -*font_count); 01867 } 01868 else { 01869 *font_out = -1; 01870 *font_count = 0; 01871 } 01872 } 01873 01879 void Tesseract::set_word_fonts(WERD_RES *word) { 01880 // Don't try to set the word fonts for a cube word, as the configs 01881 // will be meaningless. 01882 if (word->chopped_word == NULL) return; 01883 ASSERT_HOST(word->best_choice != NULL); 01884 01885 int fontinfo_size = get_fontinfo_table().size(); 01886 if (fontinfo_size == 0) return; 01887 GenericVector<int> font_total_score; 01888 font_total_score.init_to_size(fontinfo_size, 0); 01889 01890 word->italic = 0; 01891 word->bold = 0; 01892 // Compute the font scores for the word 01893 if (tessedit_debug_fonts) { 01894 tprintf("Examining fonts in %s\n", 01895 word->best_choice->debug_string().string()); 01896 } 01897 for (int b = 0; b < word->best_choice->length(); ++b) { 01898 BLOB_CHOICE* choice = word->GetBlobChoice(b); 01899 if (choice == NULL) continue; 01900 const GenericVector<ScoredFont>& fonts = choice->fonts(); 01901 for (int f = 0; f < fonts.size(); ++f) { 01902 int fontinfo_id = fonts[f].fontinfo_id; 01903 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) { 01904 font_total_score[fontinfo_id] += fonts[f].score; 01905 } 01906 } 01907 } 01908 // Find the top and 2nd choice for the word. 01909 int score1 = 0, score2 = 0; 01910 inT16 font_id1 = -1, font_id2 = -1; 01911 for (int f = 0; f < fontinfo_size; ++f) { 01912 if (tessedit_debug_fonts && font_total_score[f] > 0) { 01913 tprintf("Font %s, total score = %d\n", 01914 fontinfo_table_.get(f).name, font_total_score[f]); 01915 } 01916 if (font_total_score[f] > score1) { 01917 score2 = score1; 01918 font_id2 = font_id1; 01919 score1 = font_total_score[f]; 01920 font_id1 = f; 01921 } else if (font_total_score[f] > score2) { 01922 score2 = font_total_score[f]; 01923 font_id2 = f; 01924 } 01925 } 01926 word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL; 01927 word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL; 01928 // Each score has a limit of MAX_UINT16, so divide by that to get the number 01929 // of "votes" for that font, ie number of perfect scores. 01930 word->fontinfo_id_count = ClipToRange(score1 / MAX_UINT16, 1, MAX_INT8); 01931 word->fontinfo_id2_count = ClipToRange(score2 / MAX_UINT16, 0, MAX_INT8); 01932 if (score1 > 0) { 01933 FontInfo fi = fontinfo_table_.get(font_id1); 01934 if (tessedit_debug_fonts) { 01935 if (word->fontinfo_id2_count > 0) { 01936 tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", 01937 fi.name, word->fontinfo_id_count, 01938 fontinfo_table_.get(font_id2).name, 01939 word->fontinfo_id2_count); 01940 } else { 01941 tprintf("Word modal font=%s, score=%d. No 2nd choice\n", 01942 fi.name, word->fontinfo_id_count); 01943 } 01944 } 01945 word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count; 01946 word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count; 01947 } 01948 } 01949 01950 01957 void Tesseract::font_recognition_pass(PAGE_RES* page_res) { 01958 PAGE_RES_IT page_res_it(page_res); 01959 WERD_RES *word; // current word 01960 STATS doc_fonts(0, font_table_size_); // font counters 01961 01962 // Gather font id statistics. 01963 for (page_res_it.restart_page(); page_res_it.word() != NULL; 01964 page_res_it.forward()) { 01965 word = page_res_it.word(); 01966 if (word->fontinfo != NULL) { 01967 doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count); 01968 } 01969 if (word->fontinfo2 != NULL) { 01970 doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count); 01971 } 01972 } 01973 inT16 doc_font; // modal font 01974 inT8 doc_font_count; // modal font 01975 find_modal_font(&doc_fonts, &doc_font, &doc_font_count); 01976 if (doc_font_count == 0) 01977 return; 01978 // Get the modal font pointer. 01979 const FontInfo* modal_font = NULL; 01980 for (page_res_it.restart_page(); page_res_it.word() != NULL; 01981 page_res_it.forward()) { 01982 word = page_res_it.word(); 01983 if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) { 01984 modal_font = word->fontinfo; 01985 break; 01986 } 01987 if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) { 01988 modal_font = word->fontinfo2; 01989 break; 01990 } 01991 } 01992 ASSERT_HOST(modal_font != NULL); 01993 01994 // Assign modal font to weak words. 01995 for (page_res_it.restart_page(); page_res_it.word() != NULL; 01996 page_res_it.forward()) { 01997 word = page_res_it.word(); 01998 int length = word->best_choice->length(); 01999 02000 int count = word->fontinfo_id_count; 02001 if (!(count == length || (length > 3 && count >= length * 3 / 4))) { 02002 word->fontinfo = modal_font; 02003 // Counts only get 1 as it came from the doc. 02004 word->fontinfo_id_count = 1; 02005 word->italic = modal_font->is_italic() ? 1 : -1; 02006 word->bold = modal_font->is_bold() ? 1 : -1; 02007 } 02008 } 02009 } 02010 02011 // If a word has multiple alternates check if the best choice is in the 02012 // dictionary. If not, replace it with an alternate that exists in the 02013 // dictionary. 02014 void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) { 02015 PAGE_RES_IT word_it(page_res); 02016 for (WERD_RES* word = word_it.word(); word != NULL; 02017 word = word_it.forward()) { 02018 if (word->best_choices.singleton()) 02019 continue; // There are no alternates. 02020 02021 WERD_CHOICE* best = word->best_choice; 02022 if (word->tesseract->getDict().valid_word(*best) != 0) 02023 continue; // The best choice is in the dictionary. 02024 02025 WERD_CHOICE_IT choice_it(&word->best_choices); 02026 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 02027 choice_it.forward()) { 02028 WERD_CHOICE* alternate = choice_it.data(); 02029 if (word->tesseract->getDict().valid_word(*alternate)) { 02030 // The alternate choice is in the dictionary. 02031 if (tessedit_bigram_debug) { 02032 tprintf("Dictionary correction replaces best choice '%s' with '%s'\n", 02033 best->unichar_string().string(), 02034 alternate->unichar_string().string()); 02035 } 02036 // Replace the 'best' choice with a better choice. 02037 word->ReplaceBestChoice(alternate); 02038 break; 02039 } 02040 } 02041 } 02042 } 02043 02044 } // namespace tesseract