tesseract 3.04.01

ccmain/applybox.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        applybox.cpp  (Formerly applybox.c)
00003  * Description: Re segment rows according to box file data
00004  * Author:      Phil Cheatle
00005  * Created:     Wed Nov 24 09:11:23 GMT 1993
00006  *
00007  * (C) Copyright 1993, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include <ctype.h>
00025 #include <string.h>
00026 #ifdef __UNIX__
00027 #include <assert.h>
00028 #include <errno.h>
00029 #endif
00030 #include "allheaders.h"
00031 #include "boxread.h"
00032 #include "chopper.h"
00033 #include "pageres.h"
00034 #include "unichar.h"
00035 #include "unicharset.h"
00036 #include "tesseractclass.h"
00037 #include "genericvector.h"
00038 
// Upper bound on the number of adjacent blobs that FindSegmentation groups
// together and classifies as one candidate character.
00040 const int kMaxGroupSize = 4;
// Fraction of the median xheight by which a row's xheight may differ before
// PreenXHeights overwrites it with the median.
00043 const double kMaxXHeightDeviationFraction = 0.125;
00044 
00080 namespace tesseract {
00081 
00082 static void clear_any_old_text(BLOCK_LIST *block_list) {
00083   BLOCK_IT block_it(block_list);
00084   for (block_it.mark_cycle_pt();
00085        !block_it.cycled_list(); block_it.forward()) {
00086     ROW_IT row_it(block_it.data()->row_list());
00087     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00088       WERD_IT word_it(row_it.data()->word_list());
00089       for (word_it.mark_cycle_pt();
00090            !word_it.cycled_list(); word_it.forward()) {
00091         word_it.data()->set_text("");
00092       }
00093     }
00094   }
00095 }
00096 
00097 // Applies the box file based on the image name fname, and resegments
00098 // the words in the block_list (page), with:
00099 // blob-mode: one blob per line in the box file, words as input.
00100 // word/line-mode: one blob per space-delimited unit after the #, and one word
00101 // per line in the box file. (See comment above for box file format.)
00102 // If find_segmentation is true, (word/line mode) then the classifier is used
00103 // to re-segment words/lines to match the space-delimited truth string for
00104 // each box. In this case, the input box may be for a word or even a whole
00105 // text line, and the output words will contain multiple blobs corresponding
00106 // to the space-delimited input string.
00107 // With find_segmentation false, no classifier is needed, but the chopper
00108 // can still be used to correctly segment touching characters with the help
00109 // of the input boxes.
00110 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
00111 // from normal classification, ie. with a word, chopped_word, rebuild_word,
00112 // seam_array, denorm, box_word, and best_state, but NO best_choice or
00113 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
00114 // Instead, the correct_text member of WERD_RES is set, and this may be later
00115 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
00116 // is not required before calling ApplyBoxTraining.
00117 PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
00118                                 bool find_segmentation,
00119                                 BLOCK_LIST *block_list) {
00120   GenericVector<TBOX> boxes;
00121   GenericVector<STRING> texts, full_texts;
00122   if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
00123                     NULL)) {
00124     return NULL;  // Can't do it.
00125   }
00126 
00127   int box_count = boxes.size();
00128   int box_failures = 0;
00129   // Add an empty everything to the end.
00130   boxes.push_back(TBOX());
00131   texts.push_back(STRING());
00132   full_texts.push_back(STRING());
00133 
00134   // In word mode, we use the boxes to make a word for each box, but
00135   // in blob mode we use the existing words and maximally chop them first.
00136   PAGE_RES* page_res = find_segmentation ?
00137       NULL : SetupApplyBoxes(boxes, block_list);
00138   clear_any_old_text(block_list);
00139 
00140   for (int i = 0; i < boxes.size() - 1; i++) {
00141     bool foundit = false;
00142     if (page_res != NULL) {
00143       if (i == 0) {
00144         foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
00145                                    full_texts[i].string());
00146       } else {
00147         foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
00148                                    boxes[i + 1], full_texts[i].string());
00149       }
00150     } else {
00151       foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
00152                                  texts[i].string());
00153     }
00154     if (!foundit) {
00155       box_failures++;
00156       ReportFailedBox(i, boxes[i], texts[i].string(),
00157                       "FAILURE! Couldn't find a matching blob");
00158     }
00159   }
00160 
00161   if (page_res == NULL) {
00162     // In word/line mode, we now maximally chop all the words and resegment
00163     // them with the classifier.
00164     page_res = SetupApplyBoxes(boxes, block_list);
00165     ReSegmentByClassification(page_res);
00166   }
00167   if (applybox_debug > 0) {
00168     tprintf("APPLY_BOXES:\n");
00169     tprintf("   Boxes read from boxfile:  %6d\n", box_count);
00170     if (box_failures > 0)
00171       tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
00172   }
00173   TidyUp(page_res);
00174   return page_res;
00175 }
00176 
00177 // Helper computes median xheight in the image.
00178 static double MedianXHeight(BLOCK_LIST *block_list) {
00179   BLOCK_IT block_it(block_list);
00180   STATS xheights(0, block_it.data()->bounding_box().height());
00181   for (block_it.mark_cycle_pt();
00182        !block_it.cycled_list(); block_it.forward()) {
00183     ROW_IT row_it(block_it.data()->row_list());
00184     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00185       xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
00186     }
00187   }
00188   return xheights.median();
00189 }
00190 
00193 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
00194   double median_xheight = MedianXHeight(block_list);
00195   double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
00196   // Strip all fuzzy space markers to simplify the PAGE_RES.
00197   BLOCK_IT b_it(block_list);
00198   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00199     BLOCK* block = b_it.data();
00200     ROW_IT r_it(block->row_list());
00201     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
00202       ROW* row = r_it.data();
00203       float diff = fabs(row->x_height() - median_xheight);
00204       if (diff > max_deviation) {
00205         if (applybox_debug) {
00206           tprintf("row xheight=%g, but median xheight = %g\n",
00207                   row->x_height(), median_xheight);
00208         }
00209         row->set_x_height(static_cast<float>(median_xheight));
00210       }
00211     }
00212   }
00213 }
00214 
00217 PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
00218                                      BLOCK_LIST *block_list) {
00219   PreenXHeights(block_list);
00220   // Strip all fuzzy space markers to simplify the PAGE_RES.
00221   BLOCK_IT b_it(block_list);
00222   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00223     BLOCK* block = b_it.data();
00224     ROW_IT r_it(block->row_list());
00225     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
00226       ROW* row = r_it.data();
00227       WERD_IT w_it(row->word_list());
00228       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
00229         WERD* word = w_it.data();
00230         if (word->cblob_list()->empty()) {
00231           delete w_it.extract();
00232         } else {
00233           word->set_flag(W_FUZZY_SP, false);
00234           word->set_flag(W_FUZZY_NON, false);
00235         }
00236       }
00237     }
00238   }
00239   PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
00240   PAGE_RES_IT pr_it(page_res);
00241   WERD_RES* word_res;
00242   while ((word_res = pr_it.word()) != NULL) {
00243     MaximallyChopWord(boxes, pr_it.block()->block,
00244                       pr_it.row()->row, word_res);
00245     pr_it.forward();
00246   }
00247   return page_res;
00248 }
00249 
00253 void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
00254                                   BLOCK* block, ROW* row,
00255                                   WERD_RES* word_res) {
00256   if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
00257                                      tessedit_ocr_engine_mode, NULL,
00258                                      classify_bln_numeric_mode,
00259                                      textord_use_cjk_fp_model,
00260                                      poly_allow_detailed_fx,
00261                                      row, block)) {
00262     word_res->CloneChoppedToRebuild();
00263     return;
00264   }
00265   if (chop_debug) {
00266     tprintf("Maximally chopping word at:");
00267     word_res->word->bounding_box().print();
00268   }
00269   GenericVector<BLOB_CHOICE*> blob_choices;
00270   ASSERT_HOST(!word_res->chopped_word->blobs.empty());
00271   float rating = static_cast<float>(MAX_INT8);
00272   for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
00273     // The rating and certainty are not quite arbitrary. Since
00274     // select_blob_to_chop uses the worst certainty to choose, they all have
00275     // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
00276     // in here, and then divide by e each time they are chopped, which
00277     // should guarantee a set of unequal values for the whole tree of blobs
00278     // produced, however much chopping is required. The chops are thus only
00279     // limited by the ability of the chopper to find suitable chop points,
00280     // and not by the value of the certainties.
00281     BLOB_CHOICE* choice =
00282         new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
00283     blob_choices.push_back(choice);
00284     rating -= 0.125f;
00285   }
00286   const double e = exp(1.0);  // The base of natural logs.
00287   int blob_number;
00288   int right_chop_index = 0;
00289   if (!assume_fixed_pitch_char_segment) {
00290     // We only chop if the language is not fixed pitch like CJK.
00291     SEAM* seam = NULL;
00292     while ((seam = chop_one_blob(boxes, blob_choices, word_res,
00293                                  &blob_number)) != NULL) {
00294       word_res->InsertSeam(blob_number, seam);
00295       BLOB_CHOICE* left_choice = blob_choices[blob_number];
00296       rating = left_choice->rating() / e;
00297       left_choice->set_rating(rating);
00298       left_choice->set_certainty(-rating);
00299       // combine confidence w/ serial #
00300       BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
00301                                                   rating - 0.125f, -rating, -1,
00302                                                   0.0f, 0.0f, 0.0f, BCC_FAKE);
00303       blob_choices.insert(right_choice, blob_number + 1);
00304     }
00305   }
00306   word_res->CloneChoppedToRebuild();
00307   word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
00308 }
00309 
00321 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
00322   int overlap_area = box1.intersection(box2).area();
00323   double miss_metric = box1.area()- overlap_area;
00324   miss_metric /= box1.area();
00325   miss_metric *= box2.area() - overlap_area;
00326   miss_metric /= box2.area();
00327   return miss_metric;
00328 }
00329 
00340 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
00341                                  const TBOX& box, const TBOX& next_box,
00342                                  const char* correct_text) {
00343   if (applybox_debug > 1) {
00344     tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
00345   }
00346   PAGE_RES_IT page_res_it(page_res);
00347   WERD_RES* word_res;
00348   for (word_res = page_res_it.word(); word_res != NULL;
00349        word_res = page_res_it.forward()) {
00350     if (!word_res->box_word->bounding_box().major_overlap(box))
00351       continue;
00352     if (applybox_debug > 1) {
00353       tprintf("Checking word box:");
00354       word_res->box_word->bounding_box().print();
00355     }
00356     int word_len = word_res->box_word->length();
00357     for (int i = 0; i < word_len; ++i) {
00358       TBOX char_box = TBOX();
00359       int blob_count = 0;
00360       for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
00361         TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
00362         if (!blob_box.major_overlap(box))
00363           break;
00364         if (word_res->correct_text[i + blob_count].length() > 0)
00365           break;  // Blob is claimed already.
00366         double current_box_miss_metric = BoxMissMetric(blob_box, box);
00367         double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
00368         if (applybox_debug > 2) {
00369           tprintf("Checking blob:");
00370           blob_box.print();
00371           tprintf("Current miss metric = %g, next = %g\n",
00372                   current_box_miss_metric, next_box_miss_metric);
00373         }
00374         if (current_box_miss_metric > next_box_miss_metric)
00375           break;  // Blob is a better match for next box.
00376         char_box += blob_box;
00377       }
00378       if (blob_count > 0) {
00379         if (applybox_debug > 1) {
00380           tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
00381         }
00382         if (!char_box.almost_equal(box, 3) &&
00383             (box.x_gap(next_box) < -3 ||
00384              (prev_box != NULL && prev_box->x_gap(box) < -3))) {
00385           return false;
00386         }
00387         // We refine just the box_word, best_state and correct_text here.
00388         // The rebuild_word is made in TidyUp.
00389         // blob_count blobs are put together to match the box. Merge the
00390         // box_word boxes, save the blob_count in the state and the text.
00391         word_res->box_word->MergeBoxes(i, i + blob_count);
00392         word_res->best_state[i] = blob_count;
00393         word_res->correct_text[i] = correct_text;
00394         if (applybox_debug > 2) {
00395           tprintf("%d Blobs match: blob box:", blob_count);
00396           word_res->box_word->BlobBox(i).print();
00397           tprintf("Matches box:");
00398           box.print();
00399           tprintf("With next box:");
00400           next_box.print();
00401         }
00402         // Eliminated best_state and correct_text entries for the consumed
00403         // blobs.
00404         for (int j = 1; j < blob_count; ++j) {
00405           word_res->best_state.remove(i + 1);
00406           word_res->correct_text.remove(i + 1);
00407         }
00408         // Assume that no box spans multiple source words, so we are done with
00409         // this box.
00410         if (applybox_debug > 1) {
00411           tprintf("Best state = ");
00412           for (int j = 0; j < word_res->best_state.size(); ++j) {
00413             tprintf("%d ", word_res->best_state[j]);
00414           }
00415           tprintf("\n");
00416           tprintf("Correct text = [[ ");
00417           for (int j = 0; j < word_res->correct_text.size(); ++j) {
00418             tprintf("%s ", word_res->correct_text[j].string());
00419           }
00420           tprintf("]]\n");
00421         }
00422         return true;
00423       }
00424     }
00425   }
00426   if (applybox_debug > 0) {
00427     tprintf("FAIL!\n");
00428   }
00429   return false;  // Failure.
00430 }
00431 
00438 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
00439                                  const TBOX& box, const TBOX& next_box,
00440                                  const char* correct_text) {
00441   if (applybox_debug > 1) {
00442     tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
00443   }
00444   WERD* new_word = NULL;
00445   BLOCK_IT b_it(block_list);
00446   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00447     BLOCK* block = b_it.data();
00448     if (!box.major_overlap(block->bounding_box()))
00449       continue;
00450     ROW_IT r_it(block->row_list());
00451     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
00452       ROW* row = r_it.data();
00453       if (!box.major_overlap(row->bounding_box()))
00454         continue;
00455       WERD_IT w_it(row->word_list());
00456       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
00457         WERD* word = w_it.data();
00458         if (applybox_debug > 2) {
00459           tprintf("Checking word:");
00460           word->bounding_box().print();
00461         }
00462         if (word->text() != NULL && word->text()[0] != '\0')
00463           continue;  // Ignore words that are already done.
00464         if (!box.major_overlap(word->bounding_box()))
00465           continue;
00466         C_BLOB_IT blob_it(word->cblob_list());
00467         for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
00468              blob_it.forward()) {
00469           C_BLOB* blob = blob_it.data();
00470           TBOX blob_box = blob->bounding_box();
00471           if (!blob_box.major_overlap(box))
00472             continue;
00473           double current_box_miss_metric = BoxMissMetric(blob_box, box);
00474           double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
00475           if (applybox_debug > 2) {
00476             tprintf("Checking blob:");
00477             blob_box.print();
00478             tprintf("Current miss metric = %g, next = %g\n",
00479                     current_box_miss_metric, next_box_miss_metric);
00480           }
00481           if (current_box_miss_metric > next_box_miss_metric)
00482             continue;  // Blob is a better match for next box.
00483           if (applybox_debug > 2) {
00484             tprintf("Blob match: blob:");
00485             blob_box.print();
00486             tprintf("Matches box:");
00487             box.print();
00488             tprintf("With next box:");
00489             next_box.print();
00490           }
00491           if (new_word == NULL) {
00492             // Make a new word with a single blob.
00493             new_word = word->shallow_copy();
00494             new_word->set_text(correct_text);
00495             w_it.add_to_end(new_word);
00496           }
00497           C_BLOB_IT new_blob_it(new_word->cblob_list());
00498           new_blob_it.add_to_end(blob_it.extract());
00499         }
00500       }
00501     }
00502   }
00503   if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
00504   return new_word != NULL;
00505 }
00506 
00509 void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
00510   PAGE_RES_IT pr_it(page_res);
00511   WERD_RES* word_res;
00512   for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
00513     WERD* word = word_res->word;
00514     if (word->text() == NULL || word->text()[0] == '\0')
00515       continue;  // Ignore words that have no text.
00516     // Convert the correct text to a vector of UNICHAR_ID
00517     GenericVector<UNICHAR_ID> target_text;
00518     if (!ConvertStringToUnichars(word->text(), &target_text)) {
00519       tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
00520               word->text());
00521       pr_it.DeleteCurrentWord();
00522       continue;
00523     }
00524     if (!FindSegmentation(target_text, word_res)) {
00525       tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
00526               word->text());
00527       pr_it.DeleteCurrentWord();
00528       continue;
00529     }
00530   }
00531 }
00532 
00535 bool Tesseract::ConvertStringToUnichars(const char* utf8,
00536                                         GenericVector<UNICHAR_ID>* class_ids) {
00537   for (int step = 0; *utf8 != '\0'; utf8 += step) {
00538     const char* next_space = strchr(utf8, ' ');
00539     if (next_space == NULL)
00540       next_space = utf8 + strlen(utf8);
00541     step = next_space - utf8;
00542     UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
00543     if (class_id == INVALID_UNICHAR_ID) {
00544       return false;
00545     }
00546     while (utf8[step] == ' ')
00547       ++step;
00548     class_ids->push_back(class_id);
00549   }
00550   return true;
00551 }
00552 
00559 bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
00560                                  WERD_RES* word_res) {
00561   // Classify all required combinations of blobs and save results in choices.
00562   int word_length = word_res->box_word->length();
00563   GenericVector<BLOB_CHOICE_LIST*>* choices =
00564       new GenericVector<BLOB_CHOICE_LIST*>[word_length];
00565   for (int i = 0; i < word_length; ++i) {
00566     for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
00567       BLOB_CHOICE_LIST* match_result = classify_piece(
00568           word_res->seam_array, i, i + j - 1, "Applybox",
00569           word_res->chopped_word, word_res->blamer_bundle);
00570       if (applybox_debug > 2) {
00571         tprintf("%d+%d:", i, j);
00572         print_ratings_list("Segment:", match_result, unicharset);
00573       }
00574       choices[i].push_back(match_result);
00575     }
00576   }
00577   // Search the segmentation graph for the target text. Must be an exact
00578   // match. Using wildcards makes it difficult to find the correct
00579   // segmentation even when it is there.
00580   word_res->best_state.clear();
00581   GenericVector<int> search_segmentation;
00582   float best_rating = 0.0f;
00583   SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
00584                 &search_segmentation, &best_rating, &word_res->best_state);
00585   for (int i = 0; i < word_length; ++i)
00586     choices[i].delete_data_pointers();
00587   delete [] choices;
00588   if (word_res->best_state.empty()) {
00589     // Build the original segmentation and if it is the same length as the
00590     // truth, assume it will do.
00591     int blob_count = 1;
00592     for (int s = 0; s < word_res->seam_array.size(); ++s) {
00593       SEAM* seam = word_res->seam_array[s];
00594       if (!seam->HasAnySplits()) {
00595         word_res->best_state.push_back(blob_count);
00596         blob_count = 1;
00597       } else {
00598         ++blob_count;
00599       }
00600     }
00601     word_res->best_state.push_back(blob_count);
00602     if (word_res->best_state.size() != target_text.size()) {
00603       word_res->best_state.clear();  // No good. Original segmentation bad size.
00604       return false;
00605     }
00606   }
00607   word_res->correct_text.clear();
00608   for (int i = 0; i < target_text.size(); ++i) {
00609     word_res->correct_text.push_back(
00610         STRING(unicharset.id_to_unichar(target_text[i])));
00611   }
00612   return true;
00613 }
00614 
00629 void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
00630                               int choices_pos, int choices_length,
00631                               const GenericVector<UNICHAR_ID>& target_text,
00632                               int text_index,
00633                               float rating, GenericVector<int>* segmentation,
00634                               float* best_rating,
00635                               GenericVector<int>* best_segmentation) {
00636   const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
00637   for (int length = 1; length <= choices[choices_pos].size(); ++length) {
00638     // Rating of matching choice or worst choice if no match.
00639     float choice_rating = 0.0f;
00640     // Find the corresponding best BLOB_CHOICE.
00641     BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
00642     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
00643          choice_it.forward()) {
00644       BLOB_CHOICE* choice = choice_it.data();
00645       choice_rating = choice->rating();
00646       UNICHAR_ID class_id = choice->unichar_id();
00647       if (class_id == target_text[text_index]) {
00648         break;
00649       }
00650       // Search ambigs table.
00651       if (class_id < table.size() && table[class_id] != NULL) {
00652         AmbigSpec_IT spec_it(table[class_id]);
00653         for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
00654              spec_it.forward()) {
00655           const AmbigSpec *ambig_spec = spec_it.data();
00656           // We'll only do 1-1.
00657           if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
00658               ambig_spec->correct_ngram_id == target_text[text_index])
00659             break;
00660         }
00661         if (!spec_it.cycled_list())
00662           break;  // Found an ambig.
00663       }
00664     }
00665     if (choice_it.cycled_list())
00666       continue;  // No match.
00667     segmentation->push_back(length);
00668     if (choices_pos + length == choices_length &&
00669         text_index + 1 == target_text.size()) {
00670       // This is a complete match. If the rating is good record a new best.
00671       if (applybox_debug > 2) {
00672         tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
00673                 rating + choice_rating, *best_rating, segmentation->size(),
00674                 best_segmentation->size());
00675       }
00676       if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
00677         *best_segmentation = *segmentation;
00678         *best_rating = rating + choice_rating;
00679       }
00680     } else if (choices_pos + length < choices_length &&
00681                text_index + 1 < target_text.size()) {
00682       if (applybox_debug > 3) {
00683         tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
00684                 target_text[text_index],
00685                 unicharset.id_to_unichar(target_text[text_index]),
00686                 choice_it.data()->unichar_id() == target_text[text_index]
00687                      ? "Match" : "Ambig",
00688                 choices_pos, length);
00689       }
00690       SearchForText(choices, choices_pos + length, choices_length, target_text,
00691                     text_index + 1, rating + choice_rating, segmentation,
00692                     best_rating, best_segmentation);
00693       if (applybox_debug > 3) {
00694         tprintf("End recursion for %d=%s\n", target_text[text_index],
00695                 unicharset.id_to_unichar(target_text[text_index]));
00696       }
00697     }
00698     segmentation->truncate(segmentation->size() - 1);
00699   }
00700 }
00701 
00706 void Tesseract::TidyUp(PAGE_RES* page_res) {
00707   int ok_blob_count = 0;
00708   int bad_blob_count = 0;
00709   int ok_word_count = 0;
00710   int unlabelled_words = 0;
00711   PAGE_RES_IT pr_it(page_res);
00712   WERD_RES* word_res;
00713   for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
00714     int ok_in_word = 0;
00715     int blob_count = word_res->correct_text.size();
00716     WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
00717     word_choice->set_permuter(TOP_CHOICE_PERM);
00718     for (int c = 0; c < blob_count; ++c) {
00719       if (word_res->correct_text[c].length() > 0) {
00720         ++ok_in_word;
00721       }
00722       // Since we only need a fake word_res->best_choice, the actual
00723       // unichar_ids do not matter. Which is fortunate, since TidyUp()
00724       // can be called while training Tesseract, at the stage where
00725       // unicharset is not meaningful yet.
00726       word_choice->append_unichar_id_space_allocated(
00727           INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
00728     }
00729     if (ok_in_word > 0) {
00730       ok_blob_count += ok_in_word;
00731       bad_blob_count += word_res->correct_text.size() - ok_in_word;
00732       word_res->LogNewRawChoice(word_choice);
00733       word_res->LogNewCookedChoice(1, false, word_choice);
00734     } else {
00735       ++unlabelled_words;
00736       if (applybox_debug > 0) {
00737         tprintf("APPLY_BOXES: Unlabelled word at :");
00738         word_res->word->bounding_box().print();
00739       }
00740       pr_it.DeleteCurrentWord();
00741       delete word_choice;
00742     }
00743   }
00744   pr_it.restart_page();
00745   for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
00746     // Denormalize back to a BoxWord.
00747     word_res->RebuildBestState();
00748     word_res->SetupBoxWord();
00749     word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
00750     word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
00751   }
00752   if (applybox_debug > 0) {
00753     tprintf("   Found %d good blobs.\n", ok_blob_count);
00754     if (bad_blob_count > 0) {
00755       tprintf("   Leaving %d unlabelled blobs in %d words.\n",
00756               bad_blob_count, ok_word_count);
00757     }
00758     if (unlabelled_words > 0)
00759       tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
00760   }
00761 }
00762 
00764 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
00765                                 const char *box_ch, const char *err_msg) {
00766   tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
00767           boxfile_lineno + 1, box_ch,
00768           box.left(), box.bottom(), box.right(), box.top(), err_msg);
00769 }
00770 
00772 void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
00773   PAGE_RES_IT pr_it(page_res);
00774   for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
00775        word_res = pr_it.forward()) {
00776     WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
00777                                           word_res->correct_text.size());
00778     for (int i = 0; i < word_res->correct_text.size(); ++i) {
00779       // The part before the first space is the real ground truth, and the
00780       // rest is the bounding box location and page number.
00781       GenericVector<STRING> tokens;
00782       word_res->correct_text[i].split(' ', &tokens);
00783       UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
00784       choice->append_unichar_id_space_allocated(char_id,
00785                                                 word_res->best_state[i],
00786                                                 0.0f, 0.0f);
00787     }
00788     word_res->ClearWordChoices();
00789     word_res->LogNewRawChoice(choice);
00790     word_res->LogNewCookedChoice(1, false, choice);
00791   }
00792 }
00793 
00796 void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
00797   PAGE_RES_IT pr_it(page_res);
00798   int word_count = 0;
00799   for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
00800        word_res = pr_it.forward()) {
00801     LearnWord(fontname.string(), word_res);
00802     ++word_count;
00803   }
00804   tprintf("Generated training data for %d words\n", word_count);
00805 }
00806 
00807 
00808 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines