tesseract 3.04.01

wordrec/chopper.cpp

Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        chopper.c  (Formerly chopper.c)
00005  * Description:
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Fri Oct 16 14:37:00 1987
00008  * Modified:     Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Reusable Software Component
00012  *
00013  * (c) Copyright 1987, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  **************************************************************************/
00025 
00026 /*----------------------------------------------------------------------
00027           I n c l u d e s
00028 ----------------------------------------------------------------------*/
00029 
00030 #include <math.h>
00031 
00032 #include "chopper.h"
00033 
00034 #include "assert.h"
00035 #include "associate.h"
00036 #include "blobs.h"
00037 #include "callcpp.h"
00038 #include "const.h"
00039 #include "findseam.h"
00040 #include "freelist.h"
00041 #include "globals.h"
00042 #include "render.h"
00043 #include "pageres.h"
00044 #include "seam.h"
00045 #include "stopper.h"
00046 #include "structures.h"
00047 #include "unicharset.h"
00048 #include "wordrec.h"
00049 
00050 // Include automatically generated configuration file if running autoconf.
00051 #ifdef HAVE_CONFIG_H
00052 #include "config_auto.h"
00053 #endif
00054 
00055 // Even though the limit on the number of chunks may now be removed, keep
00056 // the same limit for repeatable behavior, and it may be a speed advantage.
00057 static const int kMaxNumChunks = 64;
00058 
00059 /*----------------------------------------------------------------------
00060           F u n c t i o n s
00061 ----------------------------------------------------------------------*/
00067 void preserve_outline(EDGEPT *start) {
00068   EDGEPT *srcpt;
00069 
00070   if (start == NULL)
00071     return;
00072   srcpt = start;
00073   do {
00074     srcpt->flags[1] = 1;
00075     srcpt = srcpt->next;
00076   }
00077   while (srcpt != start);
00078   srcpt->flags[1] = 2;
00079 }
00080 
00081 
00082 /**************************************************************************/
00083 void preserve_outline_tree(TESSLINE *srcline) {
00084   TESSLINE *outline;
00085 
00086   for (outline = srcline; outline != NULL; outline = outline->next) {
00087     preserve_outline (outline->loop);
00088   }
00089 }
00090 
00091 
00097 EDGEPT *restore_outline(EDGEPT *start) {
00098   EDGEPT *srcpt;
00099   EDGEPT *real_start;
00100 
00101   if (start == NULL)
00102     return NULL;
00103   srcpt = start;
00104   do {
00105     if (srcpt->flags[1] == 2)
00106       break;
00107     srcpt = srcpt->next;
00108   }
00109   while (srcpt != start);
00110   real_start = srcpt;
00111   do {
00112     srcpt = srcpt->next;
00113     if (srcpt->prev->flags[1] == 0) {
00114       remove_edgept(srcpt->prev);
00115     }
00116   }
00117   while (srcpt != real_start);
00118   return real_start;
00119 }
00120 
00121 
00122 /******************************************************************************/
00123 void restore_outline_tree(TESSLINE *srcline) {
00124   TESSLINE *outline;
00125 
00126   for (outline = srcline; outline != NULL; outline = outline->next) {
00127     outline->loop = restore_outline (outline->loop);
00128     outline->start = outline->loop->pos;
00129   }
00130 }
00131 
00132 // Helper runs all the checks on a seam to make sure it is valid.
00133 // Returns the seam if OK, otherwise deletes the seam and returns NULL.
00134 static SEAM* CheckSeam(int debug_level, inT32 blob_number, TWERD* word,
00135                        TBLOB* blob, TBLOB* other_blob,
00136                        const GenericVector<SEAM*>& seams, SEAM* seam) {
00137   if (seam == NULL || blob->outlines == NULL || other_blob->outlines == NULL ||
00138       total_containment(blob, other_blob) || check_blob(other_blob) ||
00139       !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
00140       any_shared_split_points(seams, seam) ||
00141       !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
00142     word->blobs.remove(blob_number + 1);
00143     if (seam) {
00144       seam->UndoSeam(blob, other_blob);
00145       delete seam;
00146       seam = NULL;
00147 #ifndef GRAPHICS_DISABLED
00148       if (debug_level) {
00149         if (debug_level >2)
00150           display_blob(blob, Red);
00151         tprintf("\n** seam being removed ** \n");
00152       }
00153 #endif
00154     } else {
00155       delete other_blob;
00156     }
00157     return NULL;
00158   }
00159   return seam;
00160 }
00161 
00162 
00169 namespace tesseract {
00170 SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
00171                                  bool italic_blob,
00172                                  const GenericVector<SEAM*>& seams) {
00173   if (repair_unchopped_blobs)
00174     preserve_outline_tree (blob->outlines);
00175   TBLOB *other_blob = TBLOB::ShallowCopy(*blob);       /* Make new blob */
00176   // Insert it into the word.
00177   word->blobs.insert(other_blob, blob_number + 1);
00178 
00179   SEAM *seam = NULL;
00180   if (prioritize_division) {
00181     TPOINT location;
00182     if (divisible_blob(blob, italic_blob, &location)) {
00183       seam = new SEAM(0.0f, location);
00184     }
00185   }
00186   if (seam == NULL)
00187     seam = pick_good_seam(blob);
00188   if (chop_debug) {
00189     if (seam != NULL)
00190       seam->Print("Good seam picked=");
00191     else
00192       tprintf("\n** no seam picked *** \n");
00193   }
00194   if (seam) {
00195     seam->ApplySeam(italic_blob, blob, other_blob);
00196   }
00197 
00198   seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
00199                    seams, seam);
00200   if (seam == NULL) {
00201     if (repair_unchopped_blobs)
00202       restore_outline_tree(blob->outlines);
00203     if (allow_blob_division && !prioritize_division) {
00204       // If the blob can simply be divided into outlines, then do that.
00205       TPOINT location;
00206       if (divisible_blob(blob, italic_blob, &location)) {
00207         other_blob = TBLOB::ShallowCopy(*blob);       /* Make new blob */
00208         word->blobs.insert(other_blob, blob_number + 1);
00209         seam = new SEAM(0.0f, location);
00210         seam->ApplySeam(italic_blob, blob, other_blob);
00211         seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
00212                          seams, seam);
00213       }
00214     }
00215   }
00216   if (seam != NULL) {
00217     // Make sure this seam doesn't get chopped again.
00218     seam->Finalize();
00219   }
00220   return seam;
00221 }
00222 
00223 
00224 SEAM *Wordrec::chop_numbered_blob(TWERD *word, inT32 blob_number,
00225                                   bool italic_blob,
00226                                   const GenericVector<SEAM*>& seams) {
00227   return attempt_blob_chop(word, word->blobs[blob_number], blob_number,
00228                            italic_blob, seams);
00229 }
00230 
00231 
00232 SEAM *Wordrec::chop_overlapping_blob(const GenericVector<TBOX>& boxes,
00233                                      bool italic_blob, WERD_RES *word_res,
00234                                      int *blob_number) {
00235   TWERD *word = word_res->chopped_word;
00236   for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
00237     TBLOB *blob = word->blobs[*blob_number];
00238     TPOINT topleft, botright;
00239     topleft.x = blob->bounding_box().left();
00240     topleft.y = blob->bounding_box().top();
00241     botright.x = blob->bounding_box().right();
00242     botright.y = blob->bounding_box().bottom();
00243 
00244     TPOINT original_topleft, original_botright;
00245     word_res->denorm.DenormTransform(NULL, topleft, &original_topleft);
00246     word_res->denorm.DenormTransform(NULL, botright, &original_botright);
00247 
00248     TBOX original_box = TBOX(original_topleft.x, original_botright.y,
00249                              original_botright.x, original_topleft.y);
00250 
00251     bool almost_equal_box = false;
00252     int num_overlap = 0;
00253     for (int i = 0; i < boxes.size(); i++) {
00254       if (original_box.overlap_fraction(boxes[i]) > 0.125)
00255         num_overlap++;
00256       if (original_box.almost_equal(boxes[i], 3))
00257         almost_equal_box = true;
00258     }
00259 
00260     TPOINT location;
00261     if (divisible_blob(blob, italic_blob, &location) ||
00262         (!almost_equal_box && num_overlap > 1)) {
00263       SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
00264                                      italic_blob, word_res->seam_array);
00265       if (seam != NULL)
00266         return seam;
00267     }
00268   }
00269 
00270   *blob_number = -1;
00271   return NULL;
00272 }
00273 
00274 }  // namespace tesseract
00275 
00276 
00282 int any_shared_split_points(const GenericVector<SEAM*>& seams, SEAM *seam) {
00283   int length;
00284   int index;
00285 
00286   length = seams.size();
00287   for (index = 0; index < length; index++)
00288     if (seam->SharesPosition(*seams[index])) return TRUE;
00289   return FALSE;
00290 }
00291 
00292 
00298 int check_blob(TBLOB *blob) {
00299   TESSLINE *outline;
00300   EDGEPT *edgept;
00301 
00302   for (outline = blob->outlines; outline != NULL; outline = outline->next) {
00303     edgept = outline->loop;
00304     do {
00305       if (edgept == NULL)
00306         break;
00307       edgept = edgept->next;
00308     }
00309     while (edgept != outline->loop);
00310     if (edgept == NULL)
00311       return 1;
00312   }
00313   return 0;
00314 }
00315 
00316 
00317 namespace tesseract {
00330 SEAM* Wordrec::improve_one_blob(const GenericVector<BLOB_CHOICE*>& blob_choices,
00331                                 DANGERR *fixpt,
00332                                 bool split_next_to_fragment,
00333                                 bool italic_blob,
00334                                 WERD_RES* word,
00335                                 int* blob_number) {
00336   float rating_ceiling = MAX_FLOAT32;
00337   SEAM *seam = NULL;
00338   do {
00339     *blob_number = select_blob_to_split_from_fixpt(fixpt);
00340     if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number);
00341     bool split_point_from_dict = (*blob_number != -1);
00342     if (split_point_from_dict) {
00343       fixpt->clear();
00344     } else {
00345       *blob_number = select_blob_to_split(blob_choices, rating_ceiling,
00346                                           split_next_to_fragment);
00347     }
00348     if (chop_debug) tprintf("blob_number = %d\n", *blob_number);
00349     if (*blob_number == -1)
00350       return NULL;
00351 
00352     // TODO(rays) it may eventually help to allow italic_blob to be true,
00353     seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob,
00354                               word->seam_array);
00355     if (seam != NULL)
00356       return seam;  // Success!
00357     if (blob_choices[*blob_number] == NULL)
00358       return NULL;
00359     if (!split_point_from_dict) {
00360       // We chopped the worst rated blob, try something else next time.
00361       rating_ceiling = blob_choices[*blob_number]->rating();
00362     }
00363   } while (true);
00364   return seam;
00365 }
00366 
00374 SEAM* Wordrec::chop_one_blob(const GenericVector<TBOX>& boxes,
00375                              const GenericVector<BLOB_CHOICE*>& blob_choices,
00376                              WERD_RES* word_res,
00377                              int* blob_number) {
00378   if (prioritize_division) {
00379     return chop_overlapping_blob(boxes, true, word_res, blob_number);
00380   } else {
00381     return improve_one_blob(blob_choices, NULL, false, true, word_res,
00382                             blob_number);
00383   }
00384 }
00385 
00394 void Wordrec::chop_word_main(WERD_RES *word) {
00395   int num_blobs = word->chopped_word->NumBlobs();
00396   if (word->ratings == NULL) {
00397     word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
00398   }
00399   if (word->ratings->get(0, 0) == NULL) {
00400     // Run initial classification.
00401     for (int b = 0; b < num_blobs; ++b) {
00402       BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
00403                                                  "Initial:", word->chopped_word,
00404                                                  word->blamer_bundle);
00405       word->ratings->put(b, b, choices);
00406     }
00407   } else {
00408     // Blobs have been pre-classified. Set matrix cell for all blob choices
00409     for (int col = 0; col < word->ratings->dimension(); ++col) {
00410       for (int row = col; row < word->ratings->dimension() &&
00411            row < col + word->ratings->bandwidth(); ++row) {
00412         BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
00413         if (choices != NULL) {
00414           BLOB_CHOICE_IT bc_it(choices);
00415           for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
00416             bc_it.data()->set_matrix_cell(col, row);
00417           }
00418         }
00419       }
00420     }
00421   }
00422 
00423   // Run Segmentation Search.
00424   BestChoiceBundle best_choice_bundle(word->ratings->dimension());
00425   SegSearch(word, &best_choice_bundle, word->blamer_bundle);
00426 
00427   if (word->best_choice == NULL) {
00428     // SegSearch found no valid paths, so just use the leading diagonal.
00429     word->FakeWordFromRatings();
00430   }
00431   word->RebuildBestState();
00432   // If we finished without a hyphen at the end of the word, let the next word
00433   // be found in the dictionary.
00434   if (word->word->flag(W_EOL) &&
00435       !getDict().has_hyphen_end(*word->best_choice)) {
00436     getDict().reset_hyphen_vars(true);
00437   }
00438 
00439   if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
00440     CallFillLattice(*word->ratings, word->best_choices,
00441                     *word->uch_set, word->blamer_bundle);
00442   }
00443   if (wordrec_debug_level > 0) {
00444     tprintf("Final Ratings Matrix:\n");
00445     word->ratings->print(getDict().getUnicharset());
00446   }
00447   word->FilterWordChoices(getDict().stopper_debug_level);
00448 }
00449 
00457 void Wordrec::improve_by_chopping(float rating_cert_scale,
00458                                   WERD_RES* word,
00459                                   BestChoiceBundle* best_choice_bundle,
00460                                   BlamerBundle* blamer_bundle,
00461                                   LMPainPoints* pain_points,
00462                                   GenericVector<SegSearchPending>* pending) {
00463   int blob_number;
00464   do {  // improvement loop.
00465     // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
00466     // one to chop.
00467     GenericVector<BLOB_CHOICE*> blob_choices;
00468     int num_blobs = word->ratings->dimension();
00469     for (int i = 0; i < num_blobs; ++i) {
00470       BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
00471       if (choices == NULL || choices->empty()) {
00472         blob_choices.push_back(NULL);
00473       } else {
00474         BLOB_CHOICE_IT bc_it(choices);
00475         blob_choices.push_back(bc_it.data());
00476       }
00477     }
00478     SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
00479                                   false, false, word, &blob_number);
00480     if (seam == NULL) break;
00481     // A chop has been made. We have to correct all the data structures to
00482     // take into account the extra bottom-level blob.
00483     // Put the seam into the seam_array and correct everything else on the
00484     // word: ratings matrix (including matrix location in the BLOB_CHOICES),
00485     // states in WERD_CHOICEs, and blob widths.
00486     word->InsertSeam(blob_number, seam);
00487     // Insert a new entry in the beam array.
00488     best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
00489     // Fixpts are outdated, but will get recalculated.
00490     best_choice_bundle->fixpt.clear();
00491     // Remap existing pain points.
00492     pain_points->RemapForSplit(blob_number);
00493     // Insert a new pending at the chop point.
00494     pending->insert(SegSearchPending(), blob_number);
00495 
00496     // Classify the two newly created blobs using ProcessSegSearchPainPoint,
00497     // as that updates the pending correctly and adds new pain points.
00498     MATRIX_COORD pain_point(blob_number, blob_number);
00499     ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
00500                               pain_points, blamer_bundle);
00501     pain_point.col = blob_number + 1;
00502     pain_point.row = blob_number + 1;
00503     ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
00504                               pain_points, blamer_bundle);
00505     if (language_model_->language_model_ngram_on) {
00506       // N-gram evaluation depends on the number of blobs in a chunk, so we
00507       // have to re-evaluate everything in the word.
00508       ResetNGramSearch(word, best_choice_bundle, pending);
00509       blob_number = 0;
00510     }
00511     // Run language model incrementally. (Except with the n-gram model on.)
00512     UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
00513                          word, pain_points, best_choice_bundle, blamer_bundle);
00514   } while (!language_model_->AcceptableChoiceFound() &&
00515            word->ratings->dimension() < kMaxNumChunks);
00516 
00517   // If after running only the chopper best_choice is incorrect and no blame
00518   // has been yet set, blame the classifier if best_choice is classifier's
00519   // top choice and is a dictionary word (i.e. language model could not have
00520   // helped). Otherwise blame the tradeoff between the classifier and
00521   // the old language model (permuters).
00522   if (word->blamer_bundle != NULL &&
00523       word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
00524       !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
00525     bool valid_permuter = word->best_choice != NULL &&
00526         Dict::valid_word_permuter(word->best_choice->permuter(), false);
00527     word->blamer_bundle->BlameClassifierOrLangModel(word,
00528                                                     getDict().getUnicharset(),
00529                                                     valid_permuter,
00530                                                     wordrec_debug_blamer);
00531   }
00532 }
00533 
00534 
00535 /**********************************************************************
00536  * select_blob_to_split
00537  *
00538  * These are the results of the last classification.  Find a likely
00539  * place to apply splits.  If none, return -1.
00540  **********************************************************************/
00541 int Wordrec::select_blob_to_split(
00542     const GenericVector<BLOB_CHOICE*>& blob_choices,
00543     float rating_ceiling, bool split_next_to_fragment) {
00544   BLOB_CHOICE *blob_choice;
00545   int x;
00546   float worst = -MAX_FLOAT32;
00547   int worst_index = -1;
00548   float worst_near_fragment = -MAX_FLOAT32;
00549   int worst_index_near_fragment = -1;
00550   const CHAR_FRAGMENT **fragments = NULL;
00551 
00552   if (chop_debug) {
00553     if (rating_ceiling < MAX_FLOAT32)
00554       tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
00555     else
00556       tprintf("rating_ceiling = No Limit\n");
00557   }
00558 
00559   if (split_next_to_fragment && blob_choices.size() > 0) {
00560     fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
00561     if (blob_choices[0] != NULL) {
00562       fragments[0] = getDict().getUnicharset().get_fragment(
00563           blob_choices[0]->unichar_id());
00564     } else {
00565       fragments[0] = NULL;
00566     }
00567   }
00568 
00569   for (x = 0; x < blob_choices.size(); ++x) {
00570     if (blob_choices[x] == NULL) {
00571       if (fragments != NULL) {
00572         delete[] fragments;
00573       }
00574       return x;
00575     } else {
00576       blob_choice = blob_choices[x];
00577       // Populate fragments for the following position.
00578       if (split_next_to_fragment && x+1 < blob_choices.size()) {
00579         if (blob_choices[x + 1] != NULL) {
00580           fragments[x + 1] = getDict().getUnicharset().get_fragment(
00581               blob_choices[x + 1]->unichar_id());
00582         } else {
00583           fragments[x + 1] = NULL;
00584         }
00585       }
00586       if (blob_choice->rating() < rating_ceiling &&
00587           blob_choice->certainty() < tessedit_certainty_threshold) {
00588         // Update worst and worst_index.
00589         if (blob_choice->rating() > worst) {
00590           worst_index = x;
00591           worst = blob_choice->rating();
00592         }
00593         if (split_next_to_fragment) {
00594           // Update worst_near_fragment and worst_index_near_fragment.
00595           bool expand_following_fragment =
00596             (x + 1 < blob_choices.size() &&
00597              fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
00598           bool expand_preceding_fragment =
00599             (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
00600           if ((expand_following_fragment || expand_preceding_fragment) &&
00601               blob_choice->rating() > worst_near_fragment) {
00602             worst_index_near_fragment = x;
00603             worst_near_fragment = blob_choice->rating();
00604             if (chop_debug) {
00605               tprintf("worst_index_near_fragment=%d"
00606                       " expand_following_fragment=%d"
00607                       " expand_preceding_fragment=%d\n",
00608                       worst_index_near_fragment,
00609                       expand_following_fragment,
00610                       expand_preceding_fragment);
00611             }
00612           }
00613         }
00614       }
00615     }
00616   }
00617   if (fragments != NULL) {
00618     delete[] fragments;
00619   }
00620   // TODO(daria): maybe a threshold of badness for
00621   // worst_near_fragment would be useful.
00622   return worst_index_near_fragment != -1 ?
00623     worst_index_near_fragment : worst_index;
00624 }
00625 
00626 /**********************************************************************
00627  * select_blob_to_split_from_fixpt
00628  *
00629  * Given the fix point from a dictionary search, if there is a single
00630  * dangerous blob that maps to multiple characters, return that blob
00631  * index as a place we need to split.  If none, return -1.
00632  **********************************************************************/
00633 int Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) {
00634   if (!fixpt)
00635     return -1;
00636   for (int i = 0; i < fixpt->size(); i++) {
00637     if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end &&
00638         (*fixpt)[i].dangerous &&
00639         (*fixpt)[i].correct_is_ngram) {
00640       return (*fixpt)[i].begin;
00641     }
00642   }
00643   return -1;
00644 }
00645 
00646 
00647 }  // namespace tesseract
00648 
00649 
00650 /**********************************************************************
00651  * total_containment
00652  *
00653  * Check to see if one of these outlines is totally contained within
00654  * the bounding box of the other.
00655  **********************************************************************/
00656 inT16 total_containment(TBLOB *blob1, TBLOB *blob2) {
00657   TBOX box1 = blob1->bounding_box();
00658   TBOX box2 = blob2->bounding_box();
00659   return box1.contains(box2) || box2.contains(box1);
00660 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines