|
tesseract 3.04.01
|
00001 00002 // File: recogtraining.cpp 00003 // Description: Functions for ambiguity and parameter training. 00004 // Author: Daria Antonova 00005 // Created: Mon Aug 13 11:26:43 PDT 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include "tesseractclass.h" 00021 00022 #include "boxread.h" 00023 #include "control.h" 00024 #include "cutil.h" 00025 #include "host.h" 00026 #include "ratngs.h" 00027 #include "reject.h" 00028 #include "stopper.h" 00029 00030 namespace tesseract { 00031 00032 const inT16 kMaxBoxEdgeDiff = 2; 00033 00034 // Sets flags necessary for recognition in the training mode. 00035 // Opens and returns the pointer to the output file. 00036 FILE *Tesseract::init_recog_training(const STRING &fname) { 00037 if (tessedit_ambigs_training) { 00038 tessedit_tess_adaption_mode.set_value(0); // turn off adaption 00039 tessedit_enable_doc_dict.set_value(0); // turn off document dictionary 00040 // Explore all segmentations. 00041 getDict().stopper_no_acceptable_choices.set_value(1); 00042 } 00043 00044 STRING output_fname = fname; 00045 const char *lastdot = strrchr(output_fname.string(), '.'); 00046 if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0'; 00047 output_fname += ".txt"; 00048 FILE *output_file = open_file(output_fname.string(), "a+"); 00049 return output_file; 00050 } 00051 00052 // Copies the bounding box from page_res_it->word() to the given TBOX. 00053 bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) { 00054 while (page_res_it->block() != NULL && page_res_it->word() == NULL) 00055 page_res_it->forward(); 00056 00057 if (page_res_it->word() != NULL) { 00058 *tbox = page_res_it->word()->word->bounding_box(); 00059 00060 // If tbox->left() is negative, the training image has vertical text and 00061 // all the coordinates of bounding boxes of page_res are rotated by 90 00062 // degrees in a counterclockwise direction. We need to rotate the TBOX back 00063 // in order to compare with the TBOXes of box files. 00064 if (tbox->left() < 0) { 00065 tbox->rotate(FCOORD(0.0, -1.0)); 00066 } 00067 00068 return true; 00069 } else { 00070 return false; 00071 } 00072 } 00073 00074 // This function takes tif/box pair of files and runs recognition on the image, 00075 // while making sure that the word bounds that tesseract identified roughly 00076 // match to those specified by the input box file. For each word (ngram in a 00077 // single bounding box from the input box file) it outputs the ocred result, 00078 // the correct label, rating and certainty. 00079 void Tesseract::recog_training_segmented(const STRING &fname, 00080 PAGE_RES *page_res, 00081 volatile ETEXT_DESC *monitor, 00082 FILE *output_file) { 00083 STRING box_fname = fname; 00084 const char *lastdot = strrchr(box_fname.string(), '.'); 00085 if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0'; 00086 box_fname += ".box"; 00087 // ReadNextBox() will close box_file 00088 FILE *box_file = open_file(box_fname.string(), "r"); 00089 00090 PAGE_RES_IT page_res_it; 00091 page_res_it.page_res = page_res; 00092 page_res_it.restart_page(); 00093 STRING label; 00094 00095 // Process all the words on this page. 00096 TBOX tbox; // tesseract-identified box 00097 TBOX bbox; // box from the box file 00098 bool keep_going; 00099 int line_number = 0; 00100 int examined_words = 0; 00101 do { 00102 keep_going = read_t(&page_res_it, &tbox); 00103 keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label, 00104 &bbox); 00105 // Align bottom left points of the TBOXes. 00106 while (keep_going && 00107 !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) { 00108 if (bbox.bottom() < tbox.bottom()) { 00109 page_res_it.forward(); 00110 keep_going = read_t(&page_res_it, &tbox); 00111 } else { 00112 keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label, 00113 &bbox); 00114 } 00115 } 00116 while (keep_going && 00117 !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) { 00118 if (bbox.left() > tbox.left()) { 00119 page_res_it.forward(); 00120 keep_going = read_t(&page_res_it, &tbox); 00121 } else { 00122 keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label, 00123 &bbox); 00124 } 00125 } 00126 // OCR the word if top right points of the TBOXes are similar. 00127 if (keep_going && 00128 NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) && 00129 NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) { 00130 ambigs_classify_and_output(label.string(), &page_res_it, output_file); 00131 examined_words++; 00132 } 00133 page_res_it.forward(); 00134 } while (keep_going); 00135 00136 // Set up scripts on all of the words that did not get sent to 00137 // ambigs_classify_and_output. They all should have, but if all the 00138 // werd_res's don't get uch_sets, tesseract will crash when you try 00139 // to iterate over them. :-( 00140 int total_words = 0; 00141 for (page_res_it.restart_page(); page_res_it.block() != NULL; 00142 page_res_it.forward()) { 00143 if (page_res_it.word()) { 00144 if (page_res_it.word()->uch_set == NULL) 00145 page_res_it.word()->SetupFake(unicharset); 00146 total_words++; 00147 } 00148 } 00149 if (examined_words < 0.85 * total_words) { 00150 tprintf("TODO(antonova): clean up recog_training_segmented; " 00151 " It examined only a small fraction of the ambigs image.\n"); 00152 } 00153 tprintf("recog_training_segmented: examined %d / %d words.\n", 00154 examined_words, total_words); 00155 } 00156 00157 // Helper prints the given set of blob choices. 00158 static void PrintPath(int length, const BLOB_CHOICE** blob_choices, 00159 const UNICHARSET& unicharset, 00160 const char *label, FILE *output_file) { 00161 float rating = 0.0f; 00162 float certainty = 0.0f; 00163 for (int i = 0; i < length; ++i) { 00164 const BLOB_CHOICE* blob_choice = blob_choices[i]; 00165 fprintf(output_file, "%s", 00166 unicharset.id_to_unichar(blob_choice->unichar_id())); 00167 rating += blob_choice->rating(); 00168 if (certainty > blob_choice->certainty()) 00169 certainty = blob_choice->certainty(); 00170 } 00171 fprintf(output_file, "\t%s\t%.4f\t%.4f\n", 00172 label, rating, certainty); 00173 } 00174 00175 // Helper recursively prints all paths through the ratings matrix, starting 00176 // at column col. 00177 static void PrintMatrixPaths(int col, int dim, 00178 const MATRIX& ratings, 00179 int length, const BLOB_CHOICE** blob_choices, 00180 const UNICHARSET& unicharset, 00181 const char *label, FILE *output_file) { 00182 for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) { 00183 if (ratings.get(col, row) != NOT_CLASSIFIED) { 00184 BLOB_CHOICE_IT bc_it(ratings.get(col, row)); 00185 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) { 00186 blob_choices[length] = bc_it.data(); 00187 if (row + 1 < dim) { 00188 PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, 00189 unicharset, label, output_file); 00190 } else { 00191 PrintPath(length + 1, blob_choices, unicharset, label, output_file); 00192 } 00193 } 00194 } 00195 } 00196 } 00197 00198 // Runs classify_word_pass1() on the current word. Outputs Tesseract's 00199 // raw choice as a result of the classification. For words labeled with a 00200 // single unichar also outputs all alternatives from blob_choices of the 00201 // best choice. 00202 void Tesseract::ambigs_classify_and_output(const char *label, 00203 PAGE_RES_IT* pr_it, 00204 FILE *output_file) { 00205 // Classify word. 00206 fflush(stdout); 00207 WordData word_data(*pr_it); 00208 SetupWordPassN(1, &word_data); 00209 classify_word_and_language(1, pr_it, &word_data); 00210 WERD_RES* werd_res = word_data.word; 00211 WERD_CHOICE *best_choice = werd_res->best_choice; 00212 ASSERT_HOST(best_choice != NULL); 00213 00214 // Compute the number of unichars in the label. 00215 GenericVector<UNICHAR_ID> encoding; 00216 if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) { 00217 tprintf("Not outputting illegal unichar %s\n", label); 00218 return; 00219 } 00220 00221 // Dump all paths through the ratings matrix (which is normally small). 00222 int dim = werd_res->ratings->dimension(); 00223 const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim]; 00224 PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, 00225 unicharset, label, output_file); 00226 delete [] blob_choices; 00227 } 00228 00229 } // namespace tesseract