tesseract 3.04.01

ccmain/control.cpp

Go to the documentation of this file.
00001 /******************************************************************
00002  * File:        control.cpp  (Formerly control.c)
00003  * Description: Module-independent matcher controller.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Thu Apr 23 11:09:58 BST 1992
00006  * ReHacked:    Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
00007  *
00008  * (C) Copyright 1992, Hewlett-Packard Ltd.
00009  ** Licensed under the Apache License, Version 2.0 (the "License");
00010  ** you may not use this file except in compliance with the License.
00011  ** You may obtain a copy of the License at
00012  ** http://www.apache.org/licenses/LICENSE-2.0
00013  ** Unless required by applicable law or agreed to in writing, software
00014  ** distributed under the License is distributed on an "AS IS" BASIS,
00015  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016  ** See the License for the specific language governing permissions and
00017  ** limitations under the License.
00018  *
00019  **********************************************************************/
00020 
00021 // Include automatically generated configuration file if running autoconf.
00022 #ifdef HAVE_CONFIG_H
00023 #include "config_auto.h"
00024 #endif
00025 
00026 #include <string.h>
00027 #include <math.h>
00028 #ifdef __UNIX__
00029 #include <assert.h>
00030 #include <unistd.h>
00031 #include <errno.h>
00032 #endif
00033 #include <ctype.h>
00034 #include "ocrclass.h"
00035 #include "werdit.h"
00036 #include "drawfx.h"
00037 #include "tessbox.h"
00038 #include "tessvars.h"
00039 #include "pgedit.h"
00040 #include "reject.h"
00041 #include "fixspace.h"
00042 #include "docqual.h"
00043 #include "control.h"
00044 #include "output.h"
00045 #include "callcpp.h"
00046 #include "globals.h"
00047 #include "sorthelper.h"
00048 #include "tesseractclass.h"
00049 
00050 #define MIN_FONT_ROW_COUNT  8
00051 #define MAX_XHEIGHT_DIFF  3
00052 
00053 const char* const kBackUpConfigFile = "tempconfigdata.config";
00054 // Multiple of x-height to make a repeated word have spaces in it.
00055 const double kRepcharGapThreshold = 0.5;
00056 // Min believable x-height for any text when refitting as a fraction of
00057 // original x-height
00058 const double kMinRefitXHeightFraction = 0.5;
00059 
00060 
00067 namespace tesseract {
00068 void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
00069                                   TBOX &selection_box) {
00070   PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
00071   if (it != NULL) {
00072     recog_interactive(it);
00073     it->DeleteCurrentWord();
00074     delete it;
00075   }
00076 }
00077 
00078 
00084 BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
00085   inT16 char_qual;
00086   inT16 good_char_qual;
00087 
00088   WordData word_data(*pr_it);
00089   SetupWordPassN(2, &word_data);
00090   classify_word_and_language(2, pr_it, &word_data);
00091   if (tessedit_debug_quality_metrics) {
00092     WERD_RES* word_res = pr_it->word();
00093     word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
00094     tprintf("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; "
00095             "char_quality: %d; good_char_quality: %d\n",
00096             word_res->reject_map.length(),
00097             word_blob_quality(word_res, pr_it->row()->row),
00098             word_outline_errs(word_res), char_qual, good_char_qual);
00099   }
00100   return TRUE;
00101 }
00102 
00103 // Helper function to check for a target word and handle it appropriately.
00104 // Inspired by Jetsoft's requirement to process only single words on pass2
00105 // and beyond.
00106 // If word_config is not null:
00107 //   If the word_box and target_word_box overlap, read the word_config file
00108 //   else reset to previous config data.
00109 //   return true.
00110 // else
00111 //   If the word_box and target_word_box overlap or pass <= 1, return true.
00112 // Note that this function uses a fixed temporary file for storing the previous
00113 // configs, so it is neither thread-safe, nor process-safe, but the assumption
00114 // is that it will only be used for one debug window at a time.
00115 //
00116 // Since this function is used for debugging (and not to change OCR results)
00117 // set only debug params from the word config file.
00118 bool Tesseract::ProcessTargetWord(const TBOX& word_box,
00119                                   const TBOX& target_word_box,
00120                                   const char* word_config,
00121                                   int pass) {
00122   if (word_config != NULL) {
00123     if (word_box.major_overlap(target_word_box)) {
00124       if (backup_config_file_ == NULL) {
00125         backup_config_file_ = kBackUpConfigFile;
00126         FILE* config_fp = fopen(backup_config_file_, "wb");
00127         ParamUtils::PrintParams(config_fp, params());
00128         fclose(config_fp);
00129         ParamUtils::ReadParamsFile(word_config,
00130                                    SET_PARAM_CONSTRAINT_DEBUG_ONLY,
00131                                    params());
00132       }
00133     } else {
00134       if (backup_config_file_ != NULL) {
00135         ParamUtils::ReadParamsFile(backup_config_file_,
00136                                    SET_PARAM_CONSTRAINT_DEBUG_ONLY,
00137                                    params());
00138         backup_config_file_ = NULL;
00139       }
00140     }
00141   } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
00142     return false;
00143   }
00144   return true;
00145 }
00146 
00148 void Tesseract::SetupAllWordsPassN(int pass_n,
00149                                    const TBOX* target_word_box,
00150                                    const char* word_config,
00151                                    PAGE_RES* page_res,
00152                                    GenericVector<WordData>* words) {
00153   // Prepare all the words.
00154   PAGE_RES_IT page_res_it(page_res);
00155   for (page_res_it.restart_page(); page_res_it.word() != NULL;
00156        page_res_it.forward()) {
00157     if (target_word_box == NULL ||
00158         ProcessTargetWord(page_res_it.word()->word->bounding_box(),
00159                           *target_word_box, word_config, 1)) {
00160       words->push_back(WordData(page_res_it));
00161     }
00162   }
00163   // Setup all the words for recognition with polygonal approximation.
00164   for (int w = 0; w < words->size(); ++w) {
00165     SetupWordPassN(pass_n, &(*words)[w]);
00166     if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
00167   }
00168 }
00169 
00170 // Sets up the single word ready for whichever engine is to be run.
00171 void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
00172   if (pass_n == 1 || !word->word->done) {
00173     if (pass_n == 1) {
00174       word->word->SetupForRecognition(unicharset, this, BestPix(),
00175                                       tessedit_ocr_engine_mode, NULL,
00176                                       classify_bln_numeric_mode,
00177                                       textord_use_cjk_fp_model,
00178                                       poly_allow_detailed_fx,
00179                                       word->row, word->block);
00180     } else if (pass_n == 2) {
00181       // TODO(rays) Should we do this on pass1 too?
00182       word->word->caps_height = 0.0;
00183       if (word->word->x_height == 0.0f)
00184         word->word->x_height = word->row->x_height();
00185     }
00186     word->lang_words.truncate(0);
00187     for (int s = 0; s <= sub_langs_.size(); ++s) {
00188       // The sub_langs_.size() entry is for the master language.
00189       Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
00190       WERD_RES* word_res = new WERD_RES;
00191       word_res->InitForRetryRecognition(*word->word);
00192       word->lang_words.push_back(word_res);
00193       // Cube doesn't get setup for pass2.
00194       if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
00195         word_res->SetupForRecognition(
00196               lang_t->unicharset, lang_t, BestPix(),
00197               lang_t->tessedit_ocr_engine_mode, NULL,
00198               lang_t->classify_bln_numeric_mode,
00199               lang_t->textord_use_cjk_fp_model,
00200               lang_t->poly_allow_detailed_fx, word->row, word->block);
00201       }
00202     }
00203   }
00204 }
00205 
00206 // Runs word recognition on all the words.
00207 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
00208                                    PAGE_RES_IT* pr_it,
00209                                    GenericVector<WordData>* words) {
00210   // TODO(rays) Before this loop can be parallelized (it would yield a massive
00211   // speed-up) all remaining member globals need to be converted to local/heap
00212   // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
00213   // added. The results will be significantly different with adaption on, and
00214   // deterioration will need investigation.
00215   pr_it->restart_page();
00216   for (int w = 0; w < words->size(); ++w) {
00217     WordData* word = &(*words)[w];
00218     if (w > 0) word->prev_word = &(*words)[w - 1];
00219     if (monitor != NULL) {
00220       monitor->ocr_alive = TRUE;
00221       if (pass_n == 1)
00222         monitor->progress = 30 + 50 * w / words->size();
00223       else
00224         monitor->progress = 80 + 10 * w / words->size();
00225       if (monitor->deadline_exceeded() ||
00226           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
00227                                                          words->size()))) {
00228         // Timeout. Fake out the rest of the words.
00229         for (; w < words->size(); ++w) {
00230           (*words)[w].word->SetupFake(unicharset);
00231         }
00232         return false;
00233       }
00234     }
00235     if (word->word->tess_failed) {
00236       int s;
00237       for (s = 0; s < word->lang_words.size() &&
00238            word->lang_words[s]->tess_failed; ++s) {}
00239       // If all are failed, skip it. Image words are skipped by this test.
00240       if (s > word->lang_words.size()) continue;
00241     }
00242     // Sync pr_it with the wth WordData.
00243     while (pr_it->word() != NULL && pr_it->word() != word->word)
00244       pr_it->forward();
00245     ASSERT_HOST(pr_it->word() != NULL);
00246     bool make_next_word_fuzzy = false;
00247     if (ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
00248       // Needs to be setup again to see the new outlines in the chopped_word.
00249       SetupWordPassN(pass_n, word);
00250     }
00251 
00252     classify_word_and_language(pass_n, pr_it, word);
00253     if (tessedit_dump_choices || debug_noise_removal) {
00254       tprintf("Pass%d: %s [%s]\n", pass_n,
00255               word->word->best_choice->unichar_string().string(),
00256               word->word->best_choice->debug_string().string());
00257     }
00258     pr_it->forward();
00259     if (make_next_word_fuzzy && pr_it->word() != NULL) {
00260       pr_it->MakeCurrentWordFuzzy();
00261     }
00262   }
00263   return true;
00264 }
00265 
00287 bool Tesseract::recog_all_words(PAGE_RES* page_res,
00288                                 ETEXT_DESC* monitor,
00289                                 const TBOX* target_word_box,
00290                                 const char* word_config,
00291                                 int dopasses) {
00292   PAGE_RES_IT page_res_it(page_res);
00293 
00294   if (tessedit_minimal_rej_pass1) {
00295     tessedit_test_adaption.set_value (TRUE);
00296     tessedit_minimal_rejection.set_value (TRUE);
00297   }
00298 
00299   if (dopasses==0 || dopasses==1) {
00300     page_res_it.restart_page();
00301     // ****************** Pass 1 *******************
00302 
00303     // If the adaptive classifier is full switch to one we prepared earlier,
00304     // ie on the previous page. If the current adaptive classifier is non-empty,
00305     // prepare a backup starting at this page, in case it fills up. Do all this
00306     // independently for each language.
00307     if (AdaptiveClassifierIsFull()) {
00308       SwitchAdaptiveClassifier();
00309     } else if (!AdaptiveClassifierIsEmpty()) {
00310       StartBackupAdaptiveClassifier();
00311     }
00312     // Now check the sub-langs as well.
00313     for (int i = 0; i < sub_langs_.size(); ++i) {
00314       if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
00315         sub_langs_[i]->SwitchAdaptiveClassifier();
00316       } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
00317         sub_langs_[i]->StartBackupAdaptiveClassifier();
00318       }
00319     }
00320     // Set up all words ready for recognition, so that if parallelism is on
00321     // all the input and output classes are ready to run the classifier.
00322     GenericVector<WordData> words;
00323     SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
00324     if (tessedit_parallelize) {
00325       PrerecAllWordsPar(words);
00326     }
00327 
00328     stats_.word_count = words.size();
00329 
00330     stats_.dict_words = 0;
00331     stats_.doc_blob_quality = 0;
00332     stats_.doc_outline_errs = 0;
00333     stats_.doc_char_quality = 0;
00334     stats_.good_char_count = 0;
00335     stats_.doc_good_char_quality = 0;
00336 
00337     most_recently_used_ = this;
00338     // Run pass 1 word recognition.
00339     if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
00340     // Pass 1 post-processing.
00341     for (page_res_it.restart_page(); page_res_it.word() != NULL;
00342          page_res_it.forward()) {
00343       if (page_res_it.word()->word->flag(W_REP_CHAR)) {
00344         fix_rep_char(&page_res_it);
00345         continue;
00346       }
00347 
00348       // Count dict words.
00349       if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
00350         ++(stats_.dict_words);
00351 
00352       // Update misadaption log (we only need to do it on pass 1, since
00353       // adaption only happens on this pass).
00354       if (page_res_it.word()->blamer_bundle != NULL &&
00355           page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
00356         page_res->misadaption_log.push_back(
00357             page_res_it.word()->blamer_bundle->misadaption_debug());
00358       }
00359     }
00360   }
00361 
00362   if (dopasses == 1) return true;
00363 
00364   // ****************** Pass 2 *******************
00365   if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
00366       AnyTessLang()) {
00367     page_res_it.restart_page();
00368     GenericVector<WordData> words;
00369     SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
00370     if (tessedit_parallelize) {
00371       PrerecAllWordsPar(words);
00372     }
00373     most_recently_used_ = this;
00374     // Run pass 2 word recognition.
00375     if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
00376   }
00377 
00378   // The next passes can only be run if tesseract has been used, as cube
00379   // doesn't set all the necessary outputs in WERD_RES.
00380   if (AnyTessLang()) {
00381     // ****************** Pass 3 *******************
00382     // Fix fuzzy spaces.
00383     set_global_loc_code(LOC_FUZZY_SPACE);
00384 
00385     if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
00386         && !tessedit_word_for_word && !right_to_left())
00387       fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
00388 
00389     // ****************** Pass 4 *******************
00390     if (tessedit_enable_dict_correction) dictionary_correction_pass(page_res);
00391     if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res);
00392 
00393     // ****************** Pass 5,6 *******************
00394     rejection_passes(page_res, monitor, target_word_box, word_config);
00395 
00396 #ifndef NO_CUBE_BUILD
00397     // ****************** Pass 7 *******************
00398     // Cube combiner.
00399     // If cube is loaded and its combiner is present, run it.
00400     if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
00401       run_cube_combiner(page_res);
00402     }
00403 #endif
00404 
00405     // ****************** Pass 8 *******************
00406     font_recognition_pass(page_res);
00407 
00408     // ****************** Pass 9 *******************
00409     // Check the correctness of the final results.
00410     blamer_pass(page_res);
00411     script_pos_pass(page_res);
00412   }
00413 
00414   // Write results pass.
00415   set_global_loc_code(LOC_WRITE_RESULTS);
00416   // This is now redundant, but retained commented so show how to obtain
00417   // bounding boxes and style information.
00418 
00419   // changed by jetsoft
00420   // needed for dll to output memory structure
00421   if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
00422     output_pass(page_res_it, target_word_box);
00423   // end jetsoft
00424   PageSegMode pageseg_mode = static_cast<PageSegMode>(
00425       static_cast<int>(tessedit_pageseg_mode));
00426   textord_.CleanupSingleRowResult(pageseg_mode, page_res);
00427 
00428   // Remove empty words, as these mess up the result iterators.
00429   for (page_res_it.restart_page(); page_res_it.word() != NULL;
00430        page_res_it.forward()) {
00431     WERD_RES* word = page_res_it.word();
00432     if (word->best_choice == NULL || word->best_choice->length() == 0)
00433       page_res_it.DeleteCurrentWord();
00434   }
00435 
00436   if (monitor != NULL) {
00437     monitor->progress = 100;
00438   }
00439   return true;
00440 }
00441 
00442 void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
00443   PAGE_RES_IT word_it(page_res);
00444 
00445   WERD_RES *w_prev = NULL;
00446   WERD_RES *w = word_it.word();
00447   while (1) {
00448     w_prev = w;
00449     while (word_it.forward() != NULL &&
00450            (!word_it.word() || word_it.word()->part_of_combo)) {
00451       // advance word_it, skipping over parts of combos
00452     }
00453     if (!word_it.word()) break;
00454     w = word_it.word();
00455     if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
00456       continue;
00457     }
00458     if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
00459       if (tessedit_bigram_debug) {
00460         tprintf("Skipping because one of the words is W_REP_CHAR\n");
00461       }
00462       continue;
00463     }
00464     // Two words sharing the same language model, excellent!
00465     GenericVector<WERD_CHOICE *> overrides_word1;
00466     GenericVector<WERD_CHOICE *> overrides_word2;
00467 
00468     STRING orig_w1_str = w_prev->best_choice->unichar_string();
00469     STRING orig_w2_str = w->best_choice->unichar_string();
00470     WERD_CHOICE prev_best(w->uch_set);
00471     {
00472       int w1start, w1end;
00473       w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
00474       prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
00475     }
00476     WERD_CHOICE this_best(w->uch_set);
00477     {
00478       int w2start, w2end;
00479       w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
00480       this_best = w->best_choice->shallow_copy(w2start, w2end);
00481     }
00482 
00483     if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
00484       if (tessedit_bigram_debug) {
00485         tprintf("Top choice \"%s %s\" verified by bigram model.\n",
00486                 orig_w1_str.string(), orig_w2_str.string());
00487       }
00488       continue;
00489     }
00490     if (tessedit_bigram_debug > 2) {
00491       tprintf("Examining alt choices for \"%s %s\".\n",
00492               orig_w1_str.string(), orig_w2_str.string());
00493     }
00494     if (tessedit_bigram_debug > 1) {
00495       if (!w_prev->best_choices.singleton()) {
00496         w_prev->PrintBestChoices();
00497       }
00498       if (!w->best_choices.singleton()) {
00499         w->PrintBestChoices();
00500       }
00501     }
00502     float best_rating = 0.0;
00503     int best_idx = 0;
00504     WERD_CHOICE_IT prev_it(&w_prev->best_choices);
00505     for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
00506       WERD_CHOICE *p1 = prev_it.data();
00507       WERD_CHOICE strip1(w->uch_set);
00508       {
00509         int p1start, p1end;
00510         p1->GetNonSuperscriptSpan(&p1start, &p1end);
00511         strip1 = p1->shallow_copy(p1start, p1end);
00512       }
00513       WERD_CHOICE_IT w_it(&w->best_choices);
00514       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
00515         WERD_CHOICE *p2 = w_it.data();
00516         WERD_CHOICE strip2(w->uch_set);
00517         {
00518           int p2start, p2end;
00519           p2->GetNonSuperscriptSpan(&p2start, &p2end);
00520           strip2 = p2->shallow_copy(p2start, p2end);
00521         }
00522         if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
00523           overrides_word1.push_back(p1);
00524           overrides_word2.push_back(p2);
00525           if (overrides_word1.size() == 1 ||
00526               p1->rating() + p2->rating() < best_rating) {
00527             best_rating = p1->rating() + p2->rating();
00528             best_idx = overrides_word1.size() - 1;
00529           }
00530         }
00531       }
00532     }
00533     if (overrides_word1.size() >= 1) {
00534       // Excellent, we have some bigram matches.
00535       if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice,
00536                                             *overrides_word1[best_idx]) &&
00537           EqualIgnoringCaseAndTerminalPunct(*w->best_choice,
00538                                             *overrides_word2[best_idx])) {
00539         if (tessedit_bigram_debug > 1) {
00540           tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
00541                   "model.\n", orig_w1_str.string(), orig_w2_str.string());
00542         }
00543         continue;
00544       }
00545       STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
00546       STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
00547       if (new_w1_str != orig_w1_str) {
00548         w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
00549       }
00550       if (new_w2_str != orig_w2_str) {
00551         w->ReplaceBestChoice(overrides_word2[best_idx]);
00552       }
00553       if (tessedit_bigram_debug > 0) {
00554         STRING choices_description;
00555         int num_bigram_choices
00556             = overrides_word1.size() * overrides_word2.size();
00557         if (num_bigram_choices == 1) {
00558           choices_description = "This was the unique bigram choice.";
00559         } else {
00560           if (tessedit_bigram_debug > 1) {
00561             STRING bigrams_list;
00562             const int kMaxChoicesToPrint = 20;
00563             for (int i = 0; i < overrides_word1.size() &&
00564                  i < kMaxChoicesToPrint; i++) {
00565               if (i > 0) { bigrams_list += ", "; }
00566               WERD_CHOICE *p1 = overrides_word1[i];
00567               WERD_CHOICE *p2 = overrides_word2[i];
00568               bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
00569               if (i == kMaxChoicesToPrint) {
00570                 bigrams_list += " ...";
00571               }
00572             }
00573             choices_description = "There were many choices: {";
00574             choices_description += bigrams_list;
00575             choices_description += "}";
00576           } else {
00577             choices_description.add_str_int("There were ", num_bigram_choices);
00578             choices_description += " compatible bigrams.";
00579           }
00580         }
00581         tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
00582                 orig_w1_str.string(), orig_w2_str.string(),
00583                 new_w1_str.string(), new_w2_str.string(),
00584                 choices_description.string());
00585       }
00586     }
00587   }
00588 }
00589 
00590 void Tesseract::rejection_passes(PAGE_RES* page_res,
00591                                  ETEXT_DESC* monitor,
00592                                  const TBOX* target_word_box,
00593                                  const char* word_config) {
00594   PAGE_RES_IT page_res_it(page_res);
00595   // ****************** Pass 5 *******************
00596   // Gather statistics on rejects.
00597   int word_index = 0;
00598   while (!tessedit_test_adaption && page_res_it.word() != NULL) {
00599     set_global_loc_code(LOC_MM_ADAPT);
00600     WERD_RES* word = page_res_it.word();
00601     word_index++;
00602     if (monitor != NULL) {
00603       monitor->ocr_alive = TRUE;
00604       monitor->progress = 95 + 5 * word_index / stats_.word_count;
00605     }
00606     if (word->rebuild_word == NULL) {
00607       // Word was not processed by tesseract.
00608       page_res_it.forward();
00609       continue;
00610     }
00611     check_debug_pt(word, 70);
00612 
00613     // changed by jetsoft
00614     // specific to its needs to extract one word when need
00615     if (target_word_box &&
00616         !ProcessTargetWord(word->word->bounding_box(),
00617                            *target_word_box, word_config, 4)) {
00618       page_res_it.forward();
00619       continue;
00620     }
00621     // end jetsoft
00622 
00623     page_res_it.rej_stat_word();
00624     int chars_in_word = word->reject_map.length();
00625     int rejects_in_word = word->reject_map.reject_count();
00626 
00627     int blob_quality = word_blob_quality(word, page_res_it.row()->row);
00628     stats_.doc_blob_quality += blob_quality;
00629     int outline_errs = word_outline_errs(word);
00630     stats_.doc_outline_errs += outline_errs;
00631     inT16 all_char_quality;
00632     inT16 accepted_all_char_quality;
00633     word_char_quality(word, page_res_it.row()->row,
00634                       &all_char_quality, &accepted_all_char_quality);
00635     stats_.doc_char_quality += all_char_quality;
00636     uinT8 permuter_type = word->best_choice->permuter();
00637     if ((permuter_type == SYSTEM_DAWG_PERM) ||
00638         (permuter_type == FREQ_DAWG_PERM) ||
00639         (permuter_type == USER_DAWG_PERM)) {
00640       stats_.good_char_count += chars_in_word - rejects_in_word;
00641       stats_.doc_good_char_quality += accepted_all_char_quality;
00642     }
00643     check_debug_pt(word, 80);
00644     if (tessedit_reject_bad_qual_wds &&
00645         (blob_quality == 0) && (outline_errs >= chars_in_word))
00646       word->reject_map.rej_word_bad_quality();
00647     check_debug_pt(word, 90);
00648     page_res_it.forward();
00649   }
00650 
00651   if (tessedit_debug_quality_metrics) {
00652     tprintf
00653       ("QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f"
00654        " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
00655       page_res->char_count, page_res->rej_count,
00656       page_res->rej_count / static_cast<float>(page_res->char_count),
00657       stats_.doc_blob_quality,
00658       stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
00659       stats_.doc_outline_errs,
00660       stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
00661       stats_.doc_char_quality,
00662       stats_.doc_char_quality / static_cast<float>(page_res->char_count),
00663       stats_.doc_good_char_quality,
00664       (stats_.good_char_count > 0) ?
00665       (stats_.doc_good_char_quality /
00666        static_cast<float>(stats_.good_char_count)) : 0.0);
00667   }
00668   BOOL8 good_quality_doc =
00669     ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
00670      quality_rej_pc) &&
00671     (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
00672      quality_blob_pc) &&
00673     (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
00674      quality_outline_pc) &&
00675     (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
00676      quality_char_pc);
00677 
00678   // ****************** Pass 6 *******************
00679   // Do whole document or whole block rejection pass
00680   if (!tessedit_test_adaption) {
00681     set_global_loc_code(LOC_DOC_BLK_REJ);
00682     quality_based_rejection(page_res_it, good_quality_doc);
00683   }
00684 }
00685 
00686 void Tesseract::blamer_pass(PAGE_RES* page_res) {
00687   if (!wordrec_run_blamer) return;
00688   PAGE_RES_IT page_res_it(page_res);
00689   for (page_res_it.restart_page(); page_res_it.word() != NULL;
00690       page_res_it.forward()) {
00691     WERD_RES *word = page_res_it.word();
00692     BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
00693     page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;
00694   }
00695   tprintf("Blame reasons:\n");
00696   for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
00697     tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(
00698         static_cast<IncorrectResultReason>(bl)),
00699         page_res->blame_reasons[bl]);
00700   }
00701   if (page_res->misadaption_log.length() > 0) {
00702     tprintf("Misadaption log:\n");
00703     for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
00704       tprintf("%s\n", page_res->misadaption_log[i].string());
00705     }
00706   }
00707 }
00708 
00709 // Sets script positions and detects smallcaps on all output words.
00710 void Tesseract::script_pos_pass(PAGE_RES* page_res) {
00711   PAGE_RES_IT page_res_it(page_res);
00712   for (page_res_it.restart_page(); page_res_it.word() != NULL;
00713       page_res_it.forward()) {
00714     WERD_RES* word = page_res_it.word();
00715      if (word->word->flag(W_REP_CHAR)) {
00716       page_res_it.forward();
00717       continue;
00718     }
00719     float x_height = page_res_it.block()->block->x_height();
00720     float word_x_height = word->x_height;
00721     if (word_x_height < word->best_choice->min_x_height() ||
00722         word_x_height > word->best_choice->max_x_height()) {
00723       word_x_height = (word->best_choice->min_x_height() +
00724           word->best_choice->max_x_height()) / 2.0f;
00725     }
00726     // Test for small caps. Word capheight must be close to block xheight,
00727     // and word must contain no lower case letters, and at least one upper case.
00728     double small_cap_xheight = x_height * kXHeightCapRatio;
00729     double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
00730     if (word->uch_set->script_has_xheight() &&
00731         small_cap_xheight - small_cap_delta <= word_x_height &&
00732         word_x_height <= small_cap_xheight + small_cap_delta) {
00733       // Scan for upper/lower.
00734       int num_upper = 0;
00735       int num_lower = 0;
00736       for (int i = 0; i < word->best_choice->length(); ++i) {
00737         if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
00738           ++num_upper;
00739         else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
00740           ++num_lower;
00741       }
00742       if (num_upper > 0 && num_lower == 0)
00743         word->small_caps = true;
00744     }
00745     word->SetScriptPositions();
00746   }
00747 }
00748 
00749 // Factored helper considers the indexed word and updates all the pointed
00750 // values.
00751 static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
00752                          float* rating, float* certainty, bool* bad,
00753                          bool* valid_permuter, int* right, int* next_left) {
00754   *right = -MAX_INT32;
00755   *next_left = MAX_INT32;
00756   if (index < words.size()) {
00757     WERD_CHOICE* choice = words[index]->best_choice;
00758     if (choice == NULL) {
00759       *bad = true;
00760     } else {
00761       *rating += choice->rating();
00762       *certainty = MIN(*certainty, choice->certainty());
00763       if (!Dict::valid_word_permuter(choice->permuter(), false))
00764         *valid_permuter = false;
00765     }
00766     *right = words[index]->word->bounding_box().right();
00767     if (index + 1 < words.size())
00768       *next_left = words[index + 1]->word->bounding_box().left();
00769   } else {
00770     *valid_permuter = false;
00771     *bad = true;
00772   }
00773 }
00774 
00775 // Helper chooses the best combination of words, transferring good ones from
00776 // new_words to best_words. To win, a new word must have (better rating and
00777 // certainty) or (better permuter status and rating within rating ratio and
00778 // certainty within certainty margin) than current best.
00779 // All the new_words are consumed (moved to best_words or deleted.)
00780 // The return value is the number of new_words used minus the number of
00781 // best_words that remain in the output.
00782 static int SelectBestWords(double rating_ratio,
00783                            double certainty_margin,
00784                            bool debug,
00785                            PointerVector<WERD_RES>* new_words,
00786                            PointerVector<WERD_RES>* best_words) {
00787   // Process the smallest groups of words that have an overlapping word
00788   // boundary at the end.
00789   GenericVector<WERD_RES*> out_words;
00790   // Index into each word vector (best, new).
00791   int b = 0, n = 0;
00792   int num_best = 0, num_new = 0;
00793   while (b < best_words->size() || n < new_words->size()) {
00794     // Start of the current run in each.
00795     int start_b = b, start_n = n;
00796     // Rating of the current run in each.
00797     float b_rating = 0.0f, n_rating = 0.0f;
00798     // Certainty of the current run in each.
00799     float b_certainty = 0.0f, n_certainty = 0.0f;
00800     // True if any word is missing its best choice.
00801     bool b_bad = false, n_bad = false;
00802     // True if all words have a valid permuter.
00803     bool b_valid_permuter = true, n_valid_permuter = true;
00804 
00805     while (b < best_words->size() || n < new_words->size()) {
00806       int b_right = -MAX_INT32;
00807       int next_b_left = MAX_INT32;
00808       EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad,
00809                    &b_valid_permuter, &b_right, &next_b_left);
00810       int n_right = -MAX_INT32;
00811       int next_n_left = MAX_INT32;
00812       EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad,
00813                    &n_valid_permuter, &n_right, &next_n_left);
00814       if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
00815         // The word breaks overlap. [start_b,b] and [start_n, n] match.
00816         break;
00817       }
00818       // Keep searching for the matching word break.
00819       if ((b_right < n_right && b < best_words->size()) ||
00820           n == new_words->size())
00821         ++b;
00822       else
00823         ++n;
00824     }
00825     bool new_better = false;
00826     if (!n_bad && (b_bad || (n_certainty > b_certainty &&
00827                              n_rating < b_rating) ||
00828                             (!b_valid_permuter && n_valid_permuter &&
00829                              n_rating < b_rating * rating_ratio &&
00830                              n_certainty > b_certainty - certainty_margin))) {
00831       // New is better.
00832       for (int i = start_n; i <= n; ++i) {
00833         out_words.push_back((*new_words)[i]);
00834         (*new_words)[i] = NULL;
00835         ++num_new;
00836       }
00837       new_better = true;
00838     } else if (!b_bad) {
00839       // Current best is better.
00840       for (int i = start_b; i <= b; ++i) {
00841         out_words.push_back((*best_words)[i]);
00842         (*best_words)[i] = NULL;
00843         ++num_best;
00844       }
00845     }
00846     int end_b = b < best_words->size() ? b + 1 : b;
00847     int end_n = n < new_words->size() ? n + 1 : n;
00848     if (debug) {
00849       tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
00850               " valid dict: %d v %d\n",
00851               end_n - start_n, new_better ? "better" : "worse",
00852               end_b - start_b, n_rating, b_rating,
00853               n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
00854     }
00855     // Move on to the next group.
00856     b = end_b;
00857     n = end_n;
00858   }
00859   // Transfer from out_words to best_words.
00860   best_words->clear();
00861   for (int i = 0; i < out_words.size(); ++i)
00862     best_words->push_back(out_words[i]);
00863   return num_new - num_best;
00864 }
00865 
00866 // Helper to recognize the word using the given (language-specific) tesseract.
00867 // Returns positive if this recognizer found more new best words than the
00868 // number kept from best_words.
00869 int Tesseract::RetryWithLanguage(const WordData& word_data,
00870                                  WordRecognizer recognizer,
00871                                  WERD_RES** in_word,
00872                                  PointerVector<WERD_RES>* best_words) {
00873   bool debug = classify_debug_level || cube_debug_level;
00874   if (debug) {
00875     tprintf("Trying word using lang %s, oem %d\n",
00876             lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
00877   }
00878   // Run the recognizer on the word.
00879   PointerVector<WERD_RES> new_words;
00880   (this->*recognizer)(word_data, in_word, &new_words);
00881   if (new_words.empty()) {
00882     // Transfer input word to new_words, as the classifier must have put
00883     // the result back in the input.
00884     new_words.push_back(*in_word);
00885     *in_word = NULL;
00886   }
00887   if (debug) {
00888     for (int i = 0; i < new_words.size(); ++i)
00889       new_words[i]->DebugTopChoice("Lang result");
00890   }
00891   // Initial version is a bit of a hack based on better certainty and rating
00892   // (to reduce false positives from cube) or a dictionary vs non-dictionary
00893   // word.
00894   return SelectBestWords(classify_max_rating_ratio,
00895                          classify_max_certainty_margin,
00896                          debug, &new_words, best_words);
00897 }
00898 
00899 // Helper returns true if all the words are acceptable.
00900 static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
00901   for (int w = 0; w < words.size(); ++w) {
00902     if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
00903   }
00904   return true;
00905 }
00906 
00907 // Moves good-looking "noise"/diacritics from the reject list to the main
00908 // blob list on the current word. Returns true if anything was done, and
00909 // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
00910 bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
00911                                    bool* make_next_word_fuzzy) {
00912   *make_next_word_fuzzy = false;
00913   WERD* real_word = pr_it->word()->word;
00914   if (real_word->rej_cblob_list()->empty() ||
00915       real_word->cblob_list()->empty() ||
00916       real_word->rej_cblob_list()->length() > noise_maxperword)
00917     return false;
00918   real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
00919   // Get the noise outlines into a vector with matching bool map.
00920   GenericVector<C_OUTLINE*> outlines;
00921   real_word->GetNoiseOutlines(&outlines);
00922   GenericVector<bool> word_wanted;
00923   GenericVector<bool> overlapped_any_blob;
00924   GenericVector<C_BLOB*> target_blobs;
00925   AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
00926                                      &word_wanted, &overlapped_any_blob,
00927                                      &target_blobs);
00928   // Filter the outlines that overlapped any blob and put them into the word
00929   // now. This simplifies the remaining task and also makes it more accurate
00930   // as it has more completed blobs to work on.
00931   GenericVector<bool> wanted;
00932   GenericVector<C_BLOB*> wanted_blobs;
00933   GenericVector<C_OUTLINE*> wanted_outlines;
00934   int num_overlapped = 0;
00935   int num_overlapped_used = 0;
00936   for (int i = 0; i < overlapped_any_blob.size(); ++i) {
00937     if (overlapped_any_blob[i]) {
00938       ++num_overlapped;
00939       if (word_wanted[i]) ++num_overlapped_used;
00940       wanted.push_back(word_wanted[i]);
00941       wanted_blobs.push_back(target_blobs[i]);
00942       wanted_outlines.push_back(outlines[i]);
00943       outlines[i] = NULL;
00944     }
00945   }
00946   real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
00947   AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
00948                              &target_blobs);
00949   int non_overlapped = 0;
00950   int non_overlapped_used = 0;
00951   for (int i = 0; i < word_wanted.size(); ++i) {
00952     if (word_wanted[i]) ++non_overlapped_used;
00953     if (outlines[i] != NULL) ++non_overlapped_used;
00954   }
00955   if (debug_noise_removal) {
00956     tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
00957             num_overlapped_used, num_overlapped, non_overlapped_used,
00958             non_overlapped);
00959     real_word->bounding_box().print();
00960   }
00961   // Now we have decided which outlines we want, put them into the real_word.
00962   if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
00963                                      make_next_word_fuzzy)) {
00964     pr_it->MakeCurrentWordFuzzy();
00965   }
00966   // TODO(rays) Parts of combos have a deep copy of the real word, and need
00967   // to have their noise outlines moved/assigned in the same way!!
00968   return num_overlapped_used != 0 || non_overlapped_used != 0;
00969 }
00970 
00971 // Attempts to put noise/diacritic outlines into the blobs that they overlap.
00972 // Input: a set of noisy outlines that probably belong to the real_word.
00973 // Output: word_wanted indicates which outlines are to be assigned to a blob,
00974 //   target_blobs indicates which to assign to, and overlapped_any_blob is
00975 //   true for all outlines that overlapped a blob.
00976 void Tesseract::AssignDiacriticsToOverlappingBlobs(
00977     const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
00978     PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
00979     GenericVector<bool>* overlapped_any_blob,
00980     GenericVector<C_BLOB*>* target_blobs) {
00981   GenericVector<bool> blob_wanted;
00982   word_wanted->init_to_size(outlines.size(), false);
00983   overlapped_any_blob->init_to_size(outlines.size(), false);
00984   target_blobs->init_to_size(outlines.size(), NULL);
00985   // For each real blob, find the outlines that seriously overlap it.
00986   // A single blob could be several merged characters, so there can be quite
00987   // a few outlines overlapping, and the full engine needs to be used to chop
00988   // and join to get a sensible result.
00989   C_BLOB_IT blob_it(real_word->cblob_list());
00990   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00991     C_BLOB* blob = blob_it.data();
00992     TBOX blob_box = blob->bounding_box();
00993     blob_wanted.init_to_size(outlines.size(), false);
00994     int num_blob_outlines = 0;
00995     for (int i = 0; i < outlines.size(); ++i) {
00996       if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
00997           !(*word_wanted)[i]) {
00998         blob_wanted[i] = true;
00999         (*overlapped_any_blob)[i] = true;
01000         ++num_blob_outlines;
01001       }
01002     }
01003     if (debug_noise_removal) {
01004       tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
01005       blob_box.print();
01006     }
01007     // If any outlines overlap the blob, and not too many, classify the blob
01008     // (using the full engine, languages and all), and choose the maximal
01009     // combination of outlines that doesn't hurt the end-result classification
01010     // by too much. Mark them as wanted.
01011     if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
01012       if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
01013                                       outlines, num_blob_outlines,
01014                                       &blob_wanted)) {
01015         for (int i = 0; i < blob_wanted.size(); ++i) {
01016           if (blob_wanted[i]) {
01017             // Claim the outline and record where it is going.
01018             (*word_wanted)[i] = true;
01019             (*target_blobs)[i] = blob;
01020           }
01021         }
01022       }
01023     }
01024   }
01025 }
01026 
01027 // Attempts to assign non-overlapping outlines to their nearest blobs or
01028 // make new blobs out of them.
01029 void Tesseract::AssignDiacriticsToNewBlobs(
01030     const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
01031     PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
01032     GenericVector<C_BLOB*>* target_blobs) {
01033   GenericVector<bool> blob_wanted;
01034   word_wanted->init_to_size(outlines.size(), false);
01035   target_blobs->init_to_size(outlines.size(), NULL);
01036   // Check for outlines that need to be turned into stand-alone blobs.
01037   for (int i = 0; i < outlines.size(); ++i) {
01038     if (outlines[i] == NULL) continue;
01039     // Get a set of adjacent outlines that don't overlap any existing blob.
01040     blob_wanted.init_to_size(outlines.size(), false);
01041     int num_blob_outlines = 0;
01042     TBOX total_ol_box(outlines[i]->bounding_box());
01043     while (i < outlines.size() && outlines[i] != NULL) {
01044       blob_wanted[i] = true;
01045       total_ol_box += outlines[i]->bounding_box();
01046       ++i;
01047       ++num_blob_outlines;
01048     }
01049     // Find the insertion point.
01050     C_BLOB_IT blob_it(real_word->cblob_list());
01051     while (!blob_it.at_last() &&
01052            blob_it.data_relative(1)->bounding_box().left() <=
01053                total_ol_box.left()) {
01054       blob_it.forward();
01055     }
01056     // Choose which combination of them we actually want and where to put
01057     // them.
01058     if (debug_noise_removal)
01059       tprintf("Num blobless outlines = %d\n", num_blob_outlines);
01060     C_BLOB* left_blob = blob_it.data();
01061     TBOX left_box = left_blob->bounding_box();
01062     C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
01063     if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||
01064          !right_blob->bounding_box().x_overlap(total_ol_box)) &&
01065         SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
01066                                     outlines, num_blob_outlines,
01067                                     &blob_wanted)) {
01068       if (debug_noise_removal) tprintf("Added to left blob\n");
01069       for (int j = 0; j < blob_wanted.size(); ++j) {
01070         if (blob_wanted[j]) {
01071           (*word_wanted)[j] = true;
01072           (*target_blobs)[j] = left_blob;
01073         }
01074       }
01075     } else if (right_blob != NULL &&
01076                (!left_box.x_overlap(total_ol_box) ||
01077                 right_blob->bounding_box().x_overlap(total_ol_box)) &&
01078                SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,
01079                                            right_blob, outlines,
01080                                            num_blob_outlines, &blob_wanted)) {
01081       if (debug_noise_removal) tprintf("Added to right blob\n");
01082       for (int j = 0; j < blob_wanted.size(); ++j) {
01083         if (blob_wanted[j]) {
01084           (*word_wanted)[j] = true;
01085           (*target_blobs)[j] = right_blob;
01086         }
01087       }
01088     } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
01089                                            outlines, num_blob_outlines,
01090                                            &blob_wanted)) {
01091       if (debug_noise_removal) tprintf("Fitted between blobs\n");
01092       for (int j = 0; j < blob_wanted.size(); ++j) {
01093         if (blob_wanted[j]) {
01094           (*word_wanted)[j] = true;
01095           (*target_blobs)[j] = NULL;
01096         }
01097       }
01098     }
01099   }
01100 }
01101 
01102 // Starting with ok_outlines set to indicate which outlines overlap the blob,
01103 // chooses the optimal set (approximately) and returns true if any outlines
01104 // are desired, in which case ok_outlines indicates which ones.
01105 bool Tesseract::SelectGoodDiacriticOutlines(
01106     int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
01107     const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
01108     GenericVector<bool>* ok_outlines) {
01109   STRING best_str;
01110   float target_cert = certainty_threshold;
01111   if (blob != NULL) {
01112     float target_c2;
01113     target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
01114     if (debug_noise_removal) {
01115       tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
01116               target_cert, target_c2);
01117       blob->bounding_box().print();
01118     }
01119     target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
01120   }
01121   GenericVector<bool> test_outlines = *ok_outlines;
01122   // Start with all the outlines in.
01123   STRING all_str;
01124   GenericVector<bool> best_outlines = *ok_outlines;
01125   float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
01126                                              pr_it, blob, &all_str);
01127   if (debug_noise_removal) {
01128     TBOX ol_box;
01129     for (int i = 0; i < test_outlines.size(); ++i) {
01130       if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
01131     }
01132     tprintf("All Noise blob classified as %s=%g, delta=%g at:",
01133             all_str.string(), best_cert, best_cert - target_cert);
01134     ol_box.print();
01135   }
01136   // Iteratively zero out the bit that improves the certainty the most, until
01137   // we get past the threshold, have zero bits, or fail to improve.
01138   int best_index = 0;  // To zero out.
01139   while (num_outlines > 1 && best_index >= 0 &&
01140          (blob == NULL || best_cert < target_cert || blob != NULL)) {
01141     // Find the best bit to zero out.
01142     best_index = -1;
01143     for (int i = 0; i < outlines.size(); ++i) {
01144       if (test_outlines[i]) {
01145         test_outlines[i] = false;
01146         STRING str;
01147         float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
01148                                               pr_it, blob, &str);
01149         if (debug_noise_removal) {
01150           TBOX ol_box;
01151           for (int j = 0; j < outlines.size(); ++j) {
01152             if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
01153             tprintf("%d", test_outlines[j]);
01154           }
01155           tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
01156                   cert, cert - target_cert);
01157           ol_box.print();
01158         }
01159         if (cert > best_cert) {
01160           best_cert = cert;
01161           best_index = i;
01162           best_outlines = test_outlines;
01163         }
01164         test_outlines[i] = true;
01165       }
01166     }
01167     if (best_index >= 0) {
01168       test_outlines[best_index] = false;
01169       --num_outlines;
01170     }
01171   }
01172   if (best_cert >= target_cert) {
01173     // Save the best combination.
01174     *ok_outlines = best_outlines;
01175     if (debug_noise_removal) {
01176       tprintf("%s noise combination ", blob ? "Adding" : "New");
01177       for (int i = 0; i < best_outlines.size(); ++i) {
01178         tprintf("%d", best_outlines[i]);
01179       }
01180       tprintf(" yields certainty %g, beating target of %g\n", best_cert,
01181               target_cert);
01182     }
01183     return true;
01184   }
01185   return false;
01186 }
01187 
01188 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
01189 // the inclusion of the outlines, and returns the certainty of the raw choice.
01190 float Tesseract::ClassifyBlobPlusOutlines(
01191     const GenericVector<bool>& ok_outlines,
01192     const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
01193     C_BLOB* blob, STRING* best_str) {
01194   C_OUTLINE_IT ol_it;
01195   C_OUTLINE* first_to_keep = NULL;
01196   if (blob != NULL) {
01197     // Add the required outlines to the blob.
01198     ol_it.set_to_list(blob->out_list());
01199     first_to_keep = ol_it.data();
01200   }
01201   for (int i = 0; i < ok_outlines.size(); ++i) {
01202     if (ok_outlines[i]) {
01203       // This outline is to be added.
01204       if (blob == NULL) {
01205         blob = new C_BLOB(outlines[i]);
01206         ol_it.set_to_list(blob->out_list());
01207       } else {
01208         ol_it.add_before_stay_put(outlines[i]);
01209       }
01210     }
01211   }
01212   float c2;
01213   float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
01214   ol_it.move_to_first();
01215   if (first_to_keep == NULL) {
01216     // We created blob. Empty its outlines and delete it.
01217     for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
01218     delete blob;
01219     cert = -c2;
01220   } else {
01221     // Remove the outlines that we put in.
01222     for (; ol_it.data() != first_to_keep; ol_it.forward()) {
01223       ol_it.extract();
01224     }
01225   }
01226   return cert;
01227 }
01228 
01229 // Classifies the given blob (part of word_data->word->word) as an individual
01230 // word, using languages, chopper etc, returning only the certainty of the
01231 // best raw choice, and undoing all the work done to fake out the word.
01232 float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it,
01233                                     C_BLOB* blob, STRING* best_str, float* c2) {
01234   WERD* real_word = pr_it->word()->word;
01235   WERD* word = real_word->ConstructFromSingleBlob(
01236       real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
01237   WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
01238   // Get a new iterator that points to the new word.
01239   PAGE_RES_IT it(pr_it->page_res);
01240   while (it.word() != word_res && it.word() != NULL) it.forward();
01241   ASSERT_HOST(it.word() == word_res);
01242   WordData wd(it);
01243   // Force full initialization.
01244   SetupWordPassN(1, &wd);
01245   classify_word_and_language(pass_n, &it, &wd);
01246   if (debug_noise_removal) {
01247     tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
01248             wd.row->x_height(), wd.word->raw_choice->min_x_height(),
01249             wd.word->raw_choice->max_x_height());
01250   }
01251   float cert = wd.word->raw_choice->certainty();
01252   float rat = wd.word->raw_choice->rating();
01253   *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
01254   *best_str = wd.word->raw_choice->unichar_string();
01255   it.DeleteCurrentWord();
01256   pr_it->ResetWordIterator();
01257   return cert;
01258 }
01259 
01260 // Generic function for classifying a word. Can be used either for pass1 or
01261 // pass2 according to the function passed to recognizer.
01262 // word_data holds the word to be recognized, and its block and row, and
01263 // pr_it points to the word as well, in case we are running LSTM and it wants
01264 // to output multiple words.
01265 // Recognizes in the current language, and if successful that is all.
01266 // If recognition was not successful, tries all available languages until
01267 // it gets a successful result or runs out of languages. Keeps the best result.
01268 void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
01269                                            WordData* word_data) {
01270   WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
01271                                           : &Tesseract::classify_word_pass2;
01272   // Best result so far.
01273   PointerVector<WERD_RES> best_words;
01274   // Points to the best result. May be word or in lang_words.
01275   WERD_RES* word = word_data->word;
01276   clock_t start_t = clock();
01277   if (classify_debug_level || cube_debug_level) {
01278     tprintf("%s word with lang %s at:",
01279             word->done ? "Already done" : "Processing",
01280             most_recently_used_->lang.string());
01281     word->word->bounding_box().print();
01282   }
01283   if (word->done) {
01284     // If done on pass1, leave it as-is.
01285     if (!word->tess_failed)
01286       most_recently_used_ = word->tesseract;
01287     return;
01288   }
01289   int sub = sub_langs_.size();
01290   if (most_recently_used_ != this) {
01291     // Get the index of the most_recently_used_.
01292     for (sub = 0; sub < sub_langs_.size() &&
01293          most_recently_used_ != sub_langs_[sub]; ++sub) {}
01294   }
01295   most_recently_used_->RetryWithLanguage(
01296       *word_data, recognizer, &word_data->lang_words[sub], &best_words);
01297   Tesseract* best_lang_tess = most_recently_used_;
01298   if (!WordsAcceptable(best_words)) {
01299     // Try all the other languages to see if they are any better.
01300     if (most_recently_used_ != this &&
01301         this->RetryWithLanguage(*word_data, recognizer,
01302                                 &word_data->lang_words[sub_langs_.size()],
01303                                 &best_words) > 0) {
01304       best_lang_tess = this;
01305     }
01306     for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
01307          ++i) {
01308       if (most_recently_used_ != sub_langs_[i] &&
01309           sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
01310                                            &word_data->lang_words[i],
01311                                            &best_words) > 0) {
01312         best_lang_tess = sub_langs_[i];
01313       }
01314     }
01315   }
01316   most_recently_used_ = best_lang_tess;
01317   if (!best_words.empty()) {
01318     if (best_words.size() == 1 && !best_words[0]->combination) {
01319       // Move the best single result to the main word.
01320       word_data->word->ConsumeWordResults(best_words[0]);
01321     } else {
01322       // Words came from LSTM, and must be moved to the PAGE_RES properly.
01323       word_data->word = best_words.back();
01324       pr_it->ReplaceCurrentWord(&best_words);
01325     }
01326     ASSERT_HOST(word_data->word->box_word != NULL);
01327   } else {
01328     tprintf("no best words!!\n");
01329   }
01330   clock_t ocr_t = clock();
01331   if (tessedit_timing_debug) {
01332     tprintf("%s (ocr took %.2f sec)\n",
01333             word->best_choice->unichar_string().string(),
01334             static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
01335   }
01336 }
01337 
01344 void Tesseract::classify_word_pass1(const WordData& word_data,
01345                                     WERD_RES** in_word,
01346                                     PointerVector<WERD_RES>* out_words) {
01347   ROW* row = word_data.row;
01348   BLOCK* block = word_data.block;
01349   prev_word_best_choice_ = word_data.prev_word != NULL
01350       ? word_data.prev_word->word->best_choice : NULL;
01351 #ifndef NO_CUBE_BUILD
01352   // If we only intend to run cube - run it and return.
01353   if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
01354     cube_word_pass1(block, row, *in_word);
01355     return;
01356   }
01357 #endif
01358   WERD_RES* word = *in_word;
01359   match_word_pass_n(1, word, row, block);
01360   if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
01361     word->tess_would_adapt = AdaptableWord(word);
01362     bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
01363 
01364     if (adapt_ok) {
01365       // Send word to adaptive classifier for training.
01366       word->BestChoiceToCorrectText();
01367       LearnWord(NULL, word);
01368       // Mark misadaptions if running blamer.
01369       if (word->blamer_bundle != NULL) {
01370         word->blamer_bundle->SetMisAdaptionDebug(word->best_choice,
01371                                                  wordrec_debug_blamer);
01372       }
01373     }
01374 
01375     if (tessedit_enable_doc_dict && !word->IsAmbiguous())
01376       tess_add_doc_word(word->best_choice);
01377   }
01378 }
01379 
01380 // Helper to report the result of the xheight fix.
01381 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
01382                                    WERD_RES* word, WERD_RES* new_word) {
01383   tprintf("New XHT Match:%s = %s ",
01384           word->best_choice->unichar_string().string(),
01385           word->best_choice->debug_string().string());
01386   word->reject_map.print(debug_fp);
01387   tprintf(" -> %s = %s ",
01388           new_word->best_choice->unichar_string().string(),
01389           new_word->best_choice->debug_string().string());
01390   new_word->reject_map.print(debug_fp);
01391   tprintf(" %s->%s %s %s\n",
01392           word->guessed_x_ht ? "GUESS" : "CERT",
01393           new_word->guessed_x_ht ? "GUESS" : "CERT",
01394           new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
01395           accept_new_word ? "ACCEPTED" : "");
01396 }
01397 
01398 // Run the x-height fix-up, based on min/max top/bottom information in
01399 // unicharset.
01400 // Returns true if the word was changed.
01401 // See the comment in fixxht.cpp for a description of the overall process.
01402 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
01403   int original_misfits = CountMisfitTops(word);
01404   if (original_misfits == 0)
01405     return false;
01406   float baseline_shift = 0.0f;
01407   float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
01408   if (baseline_shift != 0.0f) {
01409     // Try the shift on its own first.
01410     if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
01411                               word, block, row))
01412       return false;
01413     original_misfits = CountMisfitTops(word);
01414     if (original_misfits > 0) {
01415       float new_baseline_shift;
01416       // Now recompute the new x_height.
01417       new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
01418       if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
01419         // No test of return value here, as we are definitely making a change
01420         // to the word by shifting the baseline.
01421         TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
01422                              word, block, row);
01423       }
01424     }
01425     return true;
01426   } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
01427     return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
01428                                 word, block, row);
01429   } else {
01430     return false;
01431   }
01432 }
01433 
01434 // Runs recognition with the test baseline shift and x-height and returns true
01435 // if there was an improvement in recognition result.
01436 bool Tesseract::TestNewNormalization(int original_misfits,
01437                                      float baseline_shift, float new_x_ht,
01438                                      WERD_RES *word, BLOCK* block, ROW *row) {
01439   bool accept_new_x_ht = false;
01440   WERD_RES new_x_ht_word(word->word);
01441   if (word->blamer_bundle != NULL) {
01442     new_x_ht_word.blamer_bundle = new BlamerBundle();
01443     new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
01444   }
01445   new_x_ht_word.x_height = new_x_ht;
01446   new_x_ht_word.baseline_shift = baseline_shift;
01447   new_x_ht_word.caps_height = 0.0;
01448   new_x_ht_word.SetupForRecognition(
01449         unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
01450         classify_bln_numeric_mode, textord_use_cjk_fp_model,
01451       poly_allow_detailed_fx, row, block);
01452   match_word_pass_n(2, &new_x_ht_word, row, block);
01453   if (!new_x_ht_word.tess_failed) {
01454     int new_misfits = CountMisfitTops(&new_x_ht_word);
01455     if (debug_x_ht_level >= 1) {
01456       tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
01457               original_misfits, word->x_height,
01458               new_misfits, new_x_ht);
01459       tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
01460               word->best_choice->rating(), word->best_choice->certainty(),
01461               new_x_ht_word.best_choice->rating(),
01462               new_x_ht_word.best_choice->certainty());
01463     }
01464     // The misfits must improve and either the rating or certainty.
01465     accept_new_x_ht = new_misfits < original_misfits &&
01466                       (new_x_ht_word.best_choice->certainty() >
01467                           word->best_choice->certainty() ||
01468                        new_x_ht_word.best_choice->rating() <
01469                           word->best_choice->rating());
01470     if (debug_x_ht_level >= 1) {
01471       ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
01472     }
01473   }
01474   if (accept_new_x_ht) {
01475     word->ConsumeWordResults(&new_x_ht_word);
01476     return true;
01477   }
01478   return false;
01479 }
01480 
01487 void Tesseract::classify_word_pass2(const WordData& word_data,
01488                                     WERD_RES** in_word,
01489                                     PointerVector<WERD_RES>* out_words) {
01490   // Return if we do not want to run Tesseract.
01491   if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY &&
01492       tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED &&
01493       word_data.word->best_choice != NULL)
01494     return;
01495   if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
01496     return;
01497   }
01498   ROW* row = word_data.row;
01499   BLOCK* block = word_data.block;
01500   WERD_RES* word = *in_word;
01501   prev_word_best_choice_ = word_data.prev_word != NULL
01502       ? word_data.prev_word->word->best_choice : NULL;
01503 
01504   set_global_subloc_code(SUBLOC_NORM);
01505   check_debug_pt(word, 30);
01506   if (!word->done) {
01507     word->caps_height = 0.0;
01508     if (word->x_height == 0.0f)
01509       word->x_height = row->x_height();
01510     match_word_pass_n(2, word, row, block);
01511     check_debug_pt(word, 40);
01512   }
01513 
01514   SubAndSuperscriptFix(word);
01515 
01516   if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
01517     if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
01518         block->classify_rotation().y() == 0.0f) {
01519       // Use the tops and bottoms since they are available.
01520       TrainedXheightFix(word, block, row);
01521     }
01522 
01523     set_global_subloc_code(SUBLOC_NORM);
01524   }
01525 #ifndef GRAPHICS_DISABLED
01526   if (tessedit_display_outwords) {
01527     if (fx_win == NULL)
01528       create_fx_win();
01529     clear_fx_win();
01530     word->rebuild_word->plot(fx_win);
01531     TBOX wbox = word->rebuild_word->bounding_box();
01532     fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
01533                             wbox.right(), wbox.bottom());
01534     ScrollView::Update();
01535   }
01536 #endif
01537   set_global_subloc_code(SUBLOC_NORM);
01538   check_debug_pt(word, 50);
01539 }
01540 
01541 
01548 void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
01549                                   ROW *row, BLOCK* block) {
01550   if (word->tess_failed) return;
01551   tess_segment_pass_n(pass_n, word);
01552 
01553   if (!word->tess_failed) {
01554     if (!word->word->flag (W_REP_CHAR)) {
01555        word->fix_quotes();
01556       if (tessedit_fix_hyphens)
01557         word->fix_hyphens();
01558       /* Don't trust fix_quotes! - though I think I've fixed the bug */
01559       if (word->best_choice->length() != word->box_word->length()) {
01560         tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
01561                 " #Blobs=%d\n",
01562                 word->best_choice->debug_string().string(),
01563                 word->best_choice->length(),
01564                 word->box_word->length());
01565 
01566       }
01567       word->tess_accepted = tess_acceptable_word(word);
01568 
01569       // Also sets word->done flag
01570       make_reject_map(word, row, pass_n);
01571     }
01572   }
01573   set_word_fonts(word);
01574 
01575   ASSERT_HOST(word->raw_choice != NULL);
01576 }
01577 
01578 // Helper to return the best rated BLOB_CHOICE in the whole word that matches
01579 // the given char_id, or NULL if none can be found.
01580 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
01581                                            WERD_RES* word_res) {
01582   // Find the corresponding best BLOB_CHOICE from any position in the word_res.
01583   BLOB_CHOICE* best_choice = NULL;
01584   for (int i = 0; i < word_res->best_choice->length(); ++i) {
01585     BLOB_CHOICE* choice = FindMatchingChoice(char_id,
01586                                              word_res->GetBlobChoices(i));
01587     if (choice != NULL) {
01588       if (best_choice == NULL || choice->rating() < best_choice->rating())
01589         best_choice = choice;
01590     }
01591   }
01592   return best_choice;
01593 }
01594 
01595 // Helper to insert blob_choice in each location in the leader word if there is
01596 // no matching BLOB_CHOICE there already, and correct any incorrect results
01597 // in the best_choice.
01598 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
01599                                   WERD_RES* word_res) {
01600   WERD_CHOICE* word = word_res->best_choice;
01601   for (int i = 0; i < word_res->best_choice->length(); ++i) {
01602     BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
01603                                              word_res->GetBlobChoices(i));
01604     if (choice == NULL) {
01605       BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
01606       choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
01607     }
01608   }
01609   // Correct any incorrect results in word.
01610   for (int i = 0; i < word->length(); ++i) {
01611     if (word->unichar_id(i) != blob_choice->unichar_id())
01612       word->set_unichar_id(blob_choice->unichar_id(), i);
01613   }
01614 }
01615 
01623 void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
01624   WERD_RES *word_res = page_res_it->word();
01625   const WERD_CHOICE &word = *(word_res->best_choice);
01626 
01627   // Find the frequency of each unique character in the word.
01628   SortHelper<UNICHAR_ID> rep_ch(word.length());
01629   for (int i = 0; i < word.length(); ++i) {
01630     rep_ch.Add(word.unichar_id(i), 1);
01631   }
01632 
01633   // Find the most frequent result.
01634   UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
01635   int max_count = rep_ch.MaxCount(&maxch_id);
01636   // Find the best exemplar of a classifier result for maxch_id.
01637   BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
01638   if (best_choice == NULL) {
01639     tprintf("Failed to find a choice for %s, occurring %d times\n",
01640             word_res->uch_set->debug_str(maxch_id).string(), max_count);
01641     return;
01642   }
01643   word_res->done = TRUE;
01644 
01645   // Measure the mean space.
01646   int gap_count = 0;
01647   WERD* werd = word_res->word;
01648   C_BLOB_IT blob_it(werd->cblob_list());
01649   C_BLOB* prev_blob = blob_it.data();
01650   for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
01651     C_BLOB* blob = blob_it.data();
01652     int gap = blob->bounding_box().left();
01653     gap -= prev_blob->bounding_box().right();
01654     ++gap_count;
01655     prev_blob = blob;
01656   }
01657   // Just correct existing classification.
01658   CorrectRepcharChoices(best_choice, word_res);
01659   word_res->reject_map.initialise(word.length());
01660 }
01661 
01662 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
01663     const UNICHARSET& char_set, const char *s, const char *lengths) {
01664   int i = 0;
01665   int offset = 0;
01666   int leading_punct_count;
01667   int upper_count = 0;
01668   int hyphen_pos = -1;
01669   ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
01670 
01671   if (strlen (lengths) > 20)
01672     return word_type;
01673 
01674   /* Single Leading punctuation char*/
01675 
01676   if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
01677     offset += lengths[i++];
01678   leading_punct_count = i;
01679 
01680   /* Initial cap */
01681   while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
01682     offset += lengths[i++];
01683     upper_count++;
01684   }
01685   if (upper_count > 1) {
01686     word_type = AC_UPPER_CASE;
01687   } else {
01688     /* Lower case word, possibly with an initial cap */
01689     while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
01690       offset += lengths[i++];
01691     }
01692     if (i - leading_punct_count < quality_min_initial_alphas_reqd)
01693       goto not_a_word;
01694     /*
01695     Allow a single hyphen in a lower case word
01696     - don't trust upper case - I've seen several cases of "H" -> "I-I"
01697     */
01698     if (lengths[i] == 1 && s[offset] == '-') {
01699       hyphen_pos = i;
01700       offset += lengths[i++];
01701       if (s[offset] != '\0') {
01702         while ((s[offset] != '\0') &&
01703                char_set.get_islower(s + offset, lengths[i])) {
01704           offset += lengths[i++];
01705         }
01706         if (i < hyphen_pos + 3)
01707           goto not_a_word;
01708       }
01709     } else {
01710       /* Allow "'s" in NON hyphenated lower case words */
01711       if (lengths[i] == 1 && (s[offset] == '\'') &&
01712           lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
01713         offset += lengths[i++];
01714         offset += lengths[i++];
01715       }
01716     }
01717     if (upper_count > 0)
01718       word_type = AC_INITIAL_CAP;
01719     else
01720       word_type = AC_LOWER_CASE;
01721   }
01722 
01723   /* Up to two different, constrained trailing punctuation chars */
01724   if (lengths[i] == 1 && s[offset] != '\0' &&
01725       STRING(chs_trailing_punct1).contains(s[offset]))
01726     offset += lengths[i++];
01727   if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
01728       s[offset - lengths[i - 1]] != s[offset] &&
01729       STRING(chs_trailing_punct2).contains (s[offset]))
01730     offset += lengths[i++];
01731 
01732   if (s[offset] != '\0')
01733     word_type = AC_UNACCEPTABLE;
01734 
01735   not_a_word:
01736 
01737   if (word_type == AC_UNACCEPTABLE) {
01738     /* Look for abbreviation string */
01739     i = 0;
01740     offset = 0;
01741     if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
01742       word_type = AC_UC_ABBREV;
01743       while (s[offset] != '\0' &&
01744              char_set.get_isupper(s + offset, lengths[i]) &&
01745              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
01746         offset += lengths[i++];
01747         offset += lengths[i++];
01748       }
01749     }
01750     else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
01751       word_type = AC_LC_ABBREV;
01752       while (s[offset] != '\0' &&
01753              char_set.get_islower(s + offset, lengths[i]) &&
01754              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
01755         offset += lengths[i++];
01756         offset += lengths[i++];
01757       }
01758     }
01759     if (s[offset] != '\0')
01760       word_type = AC_UNACCEPTABLE;
01761   }
01762 
01763   return word_type;
01764 }
01765 
01766 BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
01767   BOOL8 show_map_detail = FALSE;
01768   inT16 i;
01769 
01770   if (!test_pt)
01771     return FALSE;
01772 
01773   tessedit_rejection_debug.set_value (FALSE);
01774   debug_x_ht_level.set_value(0);
01775 
01776   if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
01777     if (location < 0)
01778       return TRUE;               // For breakpoint use
01779     tessedit_rejection_debug.set_value (TRUE);
01780     debug_x_ht_level.set_value(2);
01781     tprintf ("\n\nTESTWD::");
01782     switch (location) {
01783       case 0:
01784         tprintf ("classify_word_pass1 start\n");
01785         word->word->print();
01786         break;
01787       case 10:
01788         tprintf ("make_reject_map: initial map");
01789         break;
01790       case 20:
01791         tprintf ("make_reject_map: after NN");
01792         break;
01793       case 30:
01794         tprintf ("classify_word_pass2 - START");
01795         break;
01796       case 40:
01797         tprintf ("classify_word_pass2 - Pre Xht");
01798         break;
01799       case 50:
01800         tprintf ("classify_word_pass2 - END");
01801         show_map_detail = TRUE;
01802         break;
01803       case 60:
01804         tprintf ("fixspace");
01805         break;
01806       case 70:
01807         tprintf ("MM pass START");
01808         break;
01809       case 80:
01810         tprintf ("MM pass END");
01811         break;
01812       case 90:
01813         tprintf ("After Poor quality rejection");
01814         break;
01815       case 100:
01816         tprintf ("unrej_good_quality_words - START");
01817         break;
01818       case 110:
01819         tprintf ("unrej_good_quality_words - END");
01820         break;
01821       case 120:
01822         tprintf ("Write results pass");
01823         show_map_detail = TRUE;
01824         break;
01825     }
01826     if (word->best_choice != NULL) {
01827       tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
01828       word->reject_map.print(debug_fp);
01829       tprintf("\n");
01830       if (show_map_detail) {
01831         tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
01832         for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
01833           tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
01834           word->reject_map[i].full_print(debug_fp);
01835         }
01836       }
01837     } else {
01838       tprintf("null best choice\n");
01839     }
01840     tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
01841     tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
01842     return TRUE;
01843   } else {
01844     return FALSE;
01845   }
01846 }
01847 
01853 static void find_modal_font(           //good chars in word
01854                      STATS *fonts,     //font stats
01855                      inT16 *font_out,   //output font
01856                      inT8 *font_count  //output count
01857                     ) {
01858   inT16 font;                     //font index
01859   inT32 count;                   //pile couat
01860 
01861   if (fonts->get_total () > 0) {
01862     font = (inT16) fonts->mode ();
01863     *font_out = font;
01864     count = fonts->pile_count (font);
01865     *font_count = count < MAX_INT8 ? count : MAX_INT8;
01866     fonts->add (font, -*font_count);
01867   }
01868   else {
01869     *font_out = -1;
01870     *font_count = 0;
01871   }
01872 }
01873 
01879 void Tesseract::set_word_fonts(WERD_RES *word) {
01880   // Don't try to set the word fonts for a cube word, as the configs
01881   // will be meaningless.
01882   if (word->chopped_word == NULL) return;
01883   ASSERT_HOST(word->best_choice != NULL);
01884 
01885   int fontinfo_size = get_fontinfo_table().size();
01886   if (fontinfo_size == 0) return;
01887   GenericVector<int> font_total_score;
01888   font_total_score.init_to_size(fontinfo_size, 0);
01889 
01890   word->italic = 0;
01891   word->bold = 0;
01892   // Compute the font scores for the word
01893   if (tessedit_debug_fonts) {
01894     tprintf("Examining fonts in %s\n",
01895             word->best_choice->debug_string().string());
01896   }
01897   for (int b = 0; b < word->best_choice->length(); ++b) {
01898     BLOB_CHOICE* choice = word->GetBlobChoice(b);
01899     if (choice == NULL) continue;
01900     const GenericVector<ScoredFont>& fonts = choice->fonts();
01901     for (int f = 0; f < fonts.size(); ++f) {
01902       int fontinfo_id = fonts[f].fontinfo_id;
01903       if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
01904         font_total_score[fontinfo_id] += fonts[f].score;
01905       }
01906     }
01907   }
01908   // Find the top and 2nd choice for the word.
01909   int score1 = 0, score2 = 0;
01910   inT16 font_id1 = -1, font_id2 = -1;
01911   for (int f = 0; f < fontinfo_size; ++f) {
01912     if (tessedit_debug_fonts && font_total_score[f] > 0) {
01913       tprintf("Font %s, total score = %d\n",
01914               fontinfo_table_.get(f).name, font_total_score[f]);
01915     }
01916     if (font_total_score[f] > score1) {
01917       score2 = score1;
01918       font_id2 = font_id1;
01919       score1 = font_total_score[f];
01920       font_id1 = f;
01921     } else if (font_total_score[f] > score2) {
01922       score2 = font_total_score[f];
01923       font_id2 = f;
01924     }
01925   }
01926   word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
01927   word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
01928   // Each score has a limit of MAX_UINT16, so divide by that to get the number
01929   // of "votes" for that font, ie number of perfect scores.
01930   word->fontinfo_id_count = ClipToRange(score1 / MAX_UINT16, 1, MAX_INT8);
01931   word->fontinfo_id2_count = ClipToRange(score2 / MAX_UINT16, 0, MAX_INT8);
01932   if (score1 > 0) {
01933     FontInfo fi = fontinfo_table_.get(font_id1);
01934     if (tessedit_debug_fonts) {
01935       if (word->fontinfo_id2_count > 0) {
01936         tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
01937                 fi.name, word->fontinfo_id_count,
01938                 fontinfo_table_.get(font_id2).name,
01939                 word->fontinfo_id2_count);
01940       } else {
01941         tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
01942                 fi.name, word->fontinfo_id_count);
01943       }
01944     }
01945     word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count;
01946     word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count;
01947   }
01948 }
01949 
01950 
01957 void Tesseract::font_recognition_pass(PAGE_RES* page_res) {
01958   PAGE_RES_IT page_res_it(page_res);
01959   WERD_RES *word;                // current word
01960   STATS doc_fonts(0, font_table_size_);           // font counters
01961 
01962   // Gather font id statistics.
01963   for (page_res_it.restart_page(); page_res_it.word() != NULL;
01964        page_res_it.forward()) {
01965     word = page_res_it.word();
01966     if (word->fontinfo != NULL) {
01967       doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
01968     }
01969     if (word->fontinfo2 != NULL) {
01970       doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
01971     }
01972   }
01973   inT16 doc_font;                 // modal font
01974   inT8 doc_font_count;           // modal font
01975   find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
01976   if (doc_font_count == 0)
01977     return;
01978   // Get the modal font pointer.
01979   const FontInfo* modal_font = NULL;
01980   for (page_res_it.restart_page(); page_res_it.word() != NULL;
01981        page_res_it.forward()) {
01982     word = page_res_it.word();
01983     if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
01984       modal_font = word->fontinfo;
01985       break;
01986     }
01987     if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
01988       modal_font = word->fontinfo2;
01989       break;
01990     }
01991   }
01992   ASSERT_HOST(modal_font != NULL);
01993 
01994   // Assign modal font to weak words.
01995   for (page_res_it.restart_page(); page_res_it.word() != NULL;
01996        page_res_it.forward()) {
01997     word = page_res_it.word();
01998     int length = word->best_choice->length();
01999 
02000     int count = word->fontinfo_id_count;
02001     if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
02002       word->fontinfo = modal_font;
02003       // Counts only get 1 as it came from the doc.
02004       word->fontinfo_id_count = 1;
02005       word->italic = modal_font->is_italic() ? 1 : -1;
02006       word->bold = modal_font->is_bold() ? 1 : -1;
02007     }
02008   }
02009 }
02010 
02011 // If a word has multiple alternates check if the best choice is in the
02012 // dictionary. If not, replace it with an alternate that exists in the
02013 // dictionary.
02014 void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {
02015   PAGE_RES_IT word_it(page_res);
02016   for (WERD_RES* word = word_it.word(); word != NULL;
02017        word = word_it.forward()) {
02018     if (word->best_choices.singleton())
02019       continue;  // There are no alternates.
02020 
02021     WERD_CHOICE* best = word->best_choice;
02022     if (word->tesseract->getDict().valid_word(*best) != 0)
02023       continue;  // The best choice is in the dictionary.
02024 
02025     WERD_CHOICE_IT choice_it(&word->best_choices);
02026     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
02027          choice_it.forward()) {
02028       WERD_CHOICE* alternate = choice_it.data();
02029       if (word->tesseract->getDict().valid_word(*alternate)) {
02030         // The alternate choice is in the dictionary.
02031         if (tessedit_bigram_debug) {
02032           tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
02033                   best->unichar_string().string(),
02034                   alternate->unichar_string().string());
02035         }
02036         // Replace the 'best' choice with a better choice.
02037         word->ReplaceBestChoice(alternate);
02038         break;
02039       }
02040     }
02041   }
02042 }
02043 
02044 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines