|
tesseract 3.04.01
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: chopper.c (Formerly chopper.c) 00005 * Description: 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 **************************************************************************/ 00025 00026 /*---------------------------------------------------------------------- 00027 I n c l u d e s 00028 ----------------------------------------------------------------------*/ 00029 00030 #include <math.h> 00031 00032 #include "chopper.h" 00033 00034 #include "assert.h" 00035 #include "associate.h" 00036 #include "blobs.h" 00037 #include "callcpp.h" 00038 #include "const.h" 00039 #include "findseam.h" 00040 #include "freelist.h" 00041 #include "globals.h" 00042 #include "render.h" 00043 #include "pageres.h" 00044 #include "seam.h" 00045 #include "stopper.h" 00046 #include "structures.h" 00047 #include "unicharset.h" 00048 #include "wordrec.h" 00049 00050 // Include automatically generated configuration file if running autoconf. 00051 #ifdef HAVE_CONFIG_H 00052 #include "config_auto.h" 00053 #endif 00054 00055 // Even though the limit on the number of chunks may now be removed, keep 00056 // the same limit for repeatable behavior, and it may be a speed advantage. 00057 static const int kMaxNumChunks = 64; 00058 00059 /*---------------------------------------------------------------------- 00060 F u n c t i o n s 00061 ----------------------------------------------------------------------*/ 00067 void preserve_outline(EDGEPT *start) { 00068 EDGEPT *srcpt; 00069 00070 if (start == NULL) 00071 return; 00072 srcpt = start; 00073 do { 00074 srcpt->flags[1] = 1; 00075 srcpt = srcpt->next; 00076 } 00077 while (srcpt != start); 00078 srcpt->flags[1] = 2; 00079 } 00080 00081 00082 /**************************************************************************/ 00083 void preserve_outline_tree(TESSLINE *srcline) { 00084 TESSLINE *outline; 00085 00086 for (outline = srcline; outline != NULL; outline = outline->next) { 00087 preserve_outline (outline->loop); 00088 } 00089 } 00090 00091 00097 EDGEPT *restore_outline(EDGEPT *start) { 00098 EDGEPT *srcpt; 00099 EDGEPT *real_start; 00100 00101 if (start == NULL) 00102 return NULL; 00103 srcpt = start; 00104 do { 00105 if (srcpt->flags[1] == 2) 00106 break; 00107 srcpt = srcpt->next; 00108 } 00109 while (srcpt != start); 00110 real_start = srcpt; 00111 do { 00112 srcpt = srcpt->next; 00113 if (srcpt->prev->flags[1] == 0) { 00114 remove_edgept(srcpt->prev); 00115 } 00116 } 00117 while (srcpt != real_start); 00118 return real_start; 00119 } 00120 00121 00122 /******************************************************************************/ 00123 void restore_outline_tree(TESSLINE *srcline) { 00124 TESSLINE *outline; 00125 00126 for (outline = srcline; outline != NULL; outline = outline->next) { 00127 outline->loop = restore_outline (outline->loop); 00128 outline->start = outline->loop->pos; 00129 } 00130 } 00131 00132 // Helper runs all the checks on a seam to make sure it is valid. 00133 // Returns the seam if OK, otherwise deletes the seam and returns NULL. 00134 static SEAM* CheckSeam(int debug_level, inT32 blob_number, TWERD* word, 00135 TBLOB* blob, TBLOB* other_blob, 00136 const GenericVector<SEAM*>& seams, SEAM* seam) { 00137 if (seam == NULL || blob->outlines == NULL || other_blob->outlines == NULL || 00138 total_containment(blob, other_blob) || check_blob(other_blob) || 00139 !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) || 00140 any_shared_split_points(seams, seam) || 00141 !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) { 00142 word->blobs.remove(blob_number + 1); 00143 if (seam) { 00144 seam->UndoSeam(blob, other_blob); 00145 delete seam; 00146 seam = NULL; 00147 #ifndef GRAPHICS_DISABLED 00148 if (debug_level) { 00149 if (debug_level >2) 00150 display_blob(blob, Red); 00151 tprintf("\n** seam being removed ** \n"); 00152 } 00153 #endif 00154 } else { 00155 delete other_blob; 00156 } 00157 return NULL; 00158 } 00159 return seam; 00160 } 00161 00162 00169 namespace tesseract { 00170 SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, 00171 bool italic_blob, 00172 const GenericVector<SEAM*>& seams) { 00173 if (repair_unchopped_blobs) 00174 preserve_outline_tree (blob->outlines); 00175 TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */ 00176 // Insert it into the word. 00177 word->blobs.insert(other_blob, blob_number + 1); 00178 00179 SEAM *seam = NULL; 00180 if (prioritize_division) { 00181 TPOINT location; 00182 if (divisible_blob(blob, italic_blob, &location)) { 00183 seam = new SEAM(0.0f, location); 00184 } 00185 } 00186 if (seam == NULL) 00187 seam = pick_good_seam(blob); 00188 if (chop_debug) { 00189 if (seam != NULL) 00190 seam->Print("Good seam picked="); 00191 else 00192 tprintf("\n** no seam picked *** \n"); 00193 } 00194 if (seam) { 00195 seam->ApplySeam(italic_blob, blob, other_blob); 00196 } 00197 00198 seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, 00199 seams, seam); 00200 if (seam == NULL) { 00201 if (repair_unchopped_blobs) 00202 restore_outline_tree(blob->outlines); 00203 if (allow_blob_division && !prioritize_division) { 00204 // If the blob can simply be divided into outlines, then do that. 00205 TPOINT location; 00206 if (divisible_blob(blob, italic_blob, &location)) { 00207 other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */ 00208 word->blobs.insert(other_blob, blob_number + 1); 00209 seam = new SEAM(0.0f, location); 00210 seam->ApplySeam(italic_blob, blob, other_blob); 00211 seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, 00212 seams, seam); 00213 } 00214 } 00215 } 00216 if (seam != NULL) { 00217 // Make sure this seam doesn't get chopped again. 00218 seam->Finalize(); 00219 } 00220 return seam; 00221 } 00222 00223 00224 SEAM *Wordrec::chop_numbered_blob(TWERD *word, inT32 blob_number, 00225 bool italic_blob, 00226 const GenericVector<SEAM*>& seams) { 00227 return attempt_blob_chop(word, word->blobs[blob_number], blob_number, 00228 italic_blob, seams); 00229 } 00230 00231 00232 SEAM *Wordrec::chop_overlapping_blob(const GenericVector<TBOX>& boxes, 00233 bool italic_blob, WERD_RES *word_res, 00234 int *blob_number) { 00235 TWERD *word = word_res->chopped_word; 00236 for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) { 00237 TBLOB *blob = word->blobs[*blob_number]; 00238 TPOINT topleft, botright; 00239 topleft.x = blob->bounding_box().left(); 00240 topleft.y = blob->bounding_box().top(); 00241 botright.x = blob->bounding_box().right(); 00242 botright.y = blob->bounding_box().bottom(); 00243 00244 TPOINT original_topleft, original_botright; 00245 word_res->denorm.DenormTransform(NULL, topleft, &original_topleft); 00246 word_res->denorm.DenormTransform(NULL, botright, &original_botright); 00247 00248 TBOX original_box = TBOX(original_topleft.x, original_botright.y, 00249 original_botright.x, original_topleft.y); 00250 00251 bool almost_equal_box = false; 00252 int num_overlap = 0; 00253 for (int i = 0; i < boxes.size(); i++) { 00254 if (original_box.overlap_fraction(boxes[i]) > 0.125) 00255 num_overlap++; 00256 if (original_box.almost_equal(boxes[i], 3)) 00257 almost_equal_box = true; 00258 } 00259 00260 TPOINT location; 00261 if (divisible_blob(blob, italic_blob, &location) || 00262 (!almost_equal_box && num_overlap > 1)) { 00263 SEAM *seam = attempt_blob_chop(word, blob, *blob_number, 00264 italic_blob, word_res->seam_array); 00265 if (seam != NULL) 00266 return seam; 00267 } 00268 } 00269 00270 *blob_number = -1; 00271 return NULL; 00272 } 00273 00274 } // namespace tesseract 00275 00276 00282 int any_shared_split_points(const GenericVector<SEAM*>& seams, SEAM *seam) { 00283 int length; 00284 int index; 00285 00286 length = seams.size(); 00287 for (index = 0; index < length; index++) 00288 if (seam->SharesPosition(*seams[index])) return TRUE; 00289 return FALSE; 00290 } 00291 00292 00298 int check_blob(TBLOB *blob) { 00299 TESSLINE *outline; 00300 EDGEPT *edgept; 00301 00302 for (outline = blob->outlines; outline != NULL; outline = outline->next) { 00303 edgept = outline->loop; 00304 do { 00305 if (edgept == NULL) 00306 break; 00307 edgept = edgept->next; 00308 } 00309 while (edgept != outline->loop); 00310 if (edgept == NULL) 00311 return 1; 00312 } 00313 return 0; 00314 } 00315 00316 00317 namespace tesseract { 00330 SEAM* Wordrec::improve_one_blob(const GenericVector<BLOB_CHOICE*>& blob_choices, 00331 DANGERR *fixpt, 00332 bool split_next_to_fragment, 00333 bool italic_blob, 00334 WERD_RES* word, 00335 int* blob_number) { 00336 float rating_ceiling = MAX_FLOAT32; 00337 SEAM *seam = NULL; 00338 do { 00339 *blob_number = select_blob_to_split_from_fixpt(fixpt); 00340 if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number); 00341 bool split_point_from_dict = (*blob_number != -1); 00342 if (split_point_from_dict) { 00343 fixpt->clear(); 00344 } else { 00345 *blob_number = select_blob_to_split(blob_choices, rating_ceiling, 00346 split_next_to_fragment); 00347 } 00348 if (chop_debug) tprintf("blob_number = %d\n", *blob_number); 00349 if (*blob_number == -1) 00350 return NULL; 00351 00352 // TODO(rays) it may eventually help to allow italic_blob to be true, 00353 seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob, 00354 word->seam_array); 00355 if (seam != NULL) 00356 return seam; // Success! 00357 if (blob_choices[*blob_number] == NULL) 00358 return NULL; 00359 if (!split_point_from_dict) { 00360 // We chopped the worst rated blob, try something else next time. 00361 rating_ceiling = blob_choices[*blob_number]->rating(); 00362 } 00363 } while (true); 00364 return seam; 00365 } 00366 00374 SEAM* Wordrec::chop_one_blob(const GenericVector<TBOX>& boxes, 00375 const GenericVector<BLOB_CHOICE*>& blob_choices, 00376 WERD_RES* word_res, 00377 int* blob_number) { 00378 if (prioritize_division) { 00379 return chop_overlapping_blob(boxes, true, word_res, blob_number); 00380 } else { 00381 return improve_one_blob(blob_choices, NULL, false, true, word_res, 00382 blob_number); 00383 } 00384 } 00385 00394 void Wordrec::chop_word_main(WERD_RES *word) { 00395 int num_blobs = word->chopped_word->NumBlobs(); 00396 if (word->ratings == NULL) { 00397 word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks); 00398 } 00399 if (word->ratings->get(0, 0) == NULL) { 00400 // Run initial classification. 00401 for (int b = 0; b < num_blobs; ++b) { 00402 BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b, 00403 "Initial:", word->chopped_word, 00404 word->blamer_bundle); 00405 word->ratings->put(b, b, choices); 00406 } 00407 } else { 00408 // Blobs have been pre-classified. Set matrix cell for all blob choices 00409 for (int col = 0; col < word->ratings->dimension(); ++col) { 00410 for (int row = col; row < word->ratings->dimension() && 00411 row < col + word->ratings->bandwidth(); ++row) { 00412 BLOB_CHOICE_LIST* choices = word->ratings->get(col, row); 00413 if (choices != NULL) { 00414 BLOB_CHOICE_IT bc_it(choices); 00415 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) { 00416 bc_it.data()->set_matrix_cell(col, row); 00417 } 00418 } 00419 } 00420 } 00421 } 00422 00423 // Run Segmentation Search. 00424 BestChoiceBundle best_choice_bundle(word->ratings->dimension()); 00425 SegSearch(word, &best_choice_bundle, word->blamer_bundle); 00426 00427 if (word->best_choice == NULL) { 00428 // SegSearch found no valid paths, so just use the leading diagonal. 00429 word->FakeWordFromRatings(); 00430 } 00431 word->RebuildBestState(); 00432 // If we finished without a hyphen at the end of the word, let the next word 00433 // be found in the dictionary. 00434 if (word->word->flag(W_EOL) && 00435 !getDict().has_hyphen_end(*word->best_choice)) { 00436 getDict().reset_hyphen_vars(true); 00437 } 00438 00439 if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) { 00440 CallFillLattice(*word->ratings, word->best_choices, 00441 *word->uch_set, word->blamer_bundle); 00442 } 00443 if (wordrec_debug_level > 0) { 00444 tprintf("Final Ratings Matrix:\n"); 00445 word->ratings->print(getDict().getUnicharset()); 00446 } 00447 word->FilterWordChoices(getDict().stopper_debug_level); 00448 } 00449 00457 void Wordrec::improve_by_chopping(float rating_cert_scale, 00458 WERD_RES* word, 00459 BestChoiceBundle* best_choice_bundle, 00460 BlamerBundle* blamer_bundle, 00461 LMPainPoints* pain_points, 00462 GenericVector<SegSearchPending>* pending) { 00463 int blob_number; 00464 do { // improvement loop. 00465 // Make a simple vector of BLOB_CHOICEs to make it easy to pick which 00466 // one to chop. 00467 GenericVector<BLOB_CHOICE*> blob_choices; 00468 int num_blobs = word->ratings->dimension(); 00469 for (int i = 0; i < num_blobs; ++i) { 00470 BLOB_CHOICE_LIST* choices = word->ratings->get(i, i); 00471 if (choices == NULL || choices->empty()) { 00472 blob_choices.push_back(NULL); 00473 } else { 00474 BLOB_CHOICE_IT bc_it(choices); 00475 blob_choices.push_back(bc_it.data()); 00476 } 00477 } 00478 SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt, 00479 false, false, word, &blob_number); 00480 if (seam == NULL) break; 00481 // A chop has been made. We have to correct all the data structures to 00482 // take into account the extra bottom-level blob. 00483 // Put the seam into the seam_array and correct everything else on the 00484 // word: ratings matrix (including matrix location in the BLOB_CHOICES), 00485 // states in WERD_CHOICEs, and blob widths. 00486 word->InsertSeam(blob_number, seam); 00487 // Insert a new entry in the beam array. 00488 best_choice_bundle->beam.insert(new LanguageModelState, blob_number); 00489 // Fixpts are outdated, but will get recalculated. 00490 best_choice_bundle->fixpt.clear(); 00491 // Remap existing pain points. 00492 pain_points->RemapForSplit(blob_number); 00493 // Insert a new pending at the chop point. 00494 pending->insert(SegSearchPending(), blob_number); 00495 00496 // Classify the two newly created blobs using ProcessSegSearchPainPoint, 00497 // as that updates the pending correctly and adds new pain points. 00498 MATRIX_COORD pain_point(blob_number, blob_number); 00499 ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word, 00500 pain_points, blamer_bundle); 00501 pain_point.col = blob_number + 1; 00502 pain_point.row = blob_number + 1; 00503 ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word, 00504 pain_points, blamer_bundle); 00505 if (language_model_->language_model_ngram_on) { 00506 // N-gram evaluation depends on the number of blobs in a chunk, so we 00507 // have to re-evaluate everything in the word. 00508 ResetNGramSearch(word, best_choice_bundle, pending); 00509 blob_number = 0; 00510 } 00511 // Run language model incrementally. (Except with the n-gram model on.) 00512 UpdateSegSearchNodes(rating_cert_scale, blob_number, pending, 00513 word, pain_points, best_choice_bundle, blamer_bundle); 00514 } while (!language_model_->AcceptableChoiceFound() && 00515 word->ratings->dimension() < kMaxNumChunks); 00516 00517 // If after running only the chopper best_choice is incorrect and no blame 00518 // has been yet set, blame the classifier if best_choice is classifier's 00519 // top choice and is a dictionary word (i.e. language model could not have 00520 // helped). Otherwise blame the tradeoff between the classifier and 00521 // the old language model (permuters). 00522 if (word->blamer_bundle != NULL && 00523 word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT && 00524 !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) { 00525 bool valid_permuter = word->best_choice != NULL && 00526 Dict::valid_word_permuter(word->best_choice->permuter(), false); 00527 word->blamer_bundle->BlameClassifierOrLangModel(word, 00528 getDict().getUnicharset(), 00529 valid_permuter, 00530 wordrec_debug_blamer); 00531 } 00532 } 00533 00534 00535 /********************************************************************** 00536 * select_blob_to_split 00537 * 00538 * These are the results of the last classification. Find a likely 00539 * place to apply splits. If none, return -1. 00540 **********************************************************************/ 00541 int Wordrec::select_blob_to_split( 00542 const GenericVector<BLOB_CHOICE*>& blob_choices, 00543 float rating_ceiling, bool split_next_to_fragment) { 00544 BLOB_CHOICE *blob_choice; 00545 int x; 00546 float worst = -MAX_FLOAT32; 00547 int worst_index = -1; 00548 float worst_near_fragment = -MAX_FLOAT32; 00549 int worst_index_near_fragment = -1; 00550 const CHAR_FRAGMENT **fragments = NULL; 00551 00552 if (chop_debug) { 00553 if (rating_ceiling < MAX_FLOAT32) 00554 tprintf("rating_ceiling = %8.4f\n", rating_ceiling); 00555 else 00556 tprintf("rating_ceiling = No Limit\n"); 00557 } 00558 00559 if (split_next_to_fragment && blob_choices.size() > 0) { 00560 fragments = new const CHAR_FRAGMENT *[blob_choices.length()]; 00561 if (blob_choices[0] != NULL) { 00562 fragments[0] = getDict().getUnicharset().get_fragment( 00563 blob_choices[0]->unichar_id()); 00564 } else { 00565 fragments[0] = NULL; 00566 } 00567 } 00568 00569 for (x = 0; x < blob_choices.size(); ++x) { 00570 if (blob_choices[x] == NULL) { 00571 if (fragments != NULL) { 00572 delete[] fragments; 00573 } 00574 return x; 00575 } else { 00576 blob_choice = blob_choices[x]; 00577 // Populate fragments for the following position. 00578 if (split_next_to_fragment && x+1 < blob_choices.size()) { 00579 if (blob_choices[x + 1] != NULL) { 00580 fragments[x + 1] = getDict().getUnicharset().get_fragment( 00581 blob_choices[x + 1]->unichar_id()); 00582 } else { 00583 fragments[x + 1] = NULL; 00584 } 00585 } 00586 if (blob_choice->rating() < rating_ceiling && 00587 blob_choice->certainty() < tessedit_certainty_threshold) { 00588 // Update worst and worst_index. 00589 if (blob_choice->rating() > worst) { 00590 worst_index = x; 00591 worst = blob_choice->rating(); 00592 } 00593 if (split_next_to_fragment) { 00594 // Update worst_near_fragment and worst_index_near_fragment. 00595 bool expand_following_fragment = 00596 (x + 1 < blob_choices.size() && 00597 fragments[x+1] != NULL && !fragments[x+1]->is_beginning()); 00598 bool expand_preceding_fragment = 00599 (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending()); 00600 if ((expand_following_fragment || expand_preceding_fragment) && 00601 blob_choice->rating() > worst_near_fragment) { 00602 worst_index_near_fragment = x; 00603 worst_near_fragment = blob_choice->rating(); 00604 if (chop_debug) { 00605 tprintf("worst_index_near_fragment=%d" 00606 " expand_following_fragment=%d" 00607 " expand_preceding_fragment=%d\n", 00608 worst_index_near_fragment, 00609 expand_following_fragment, 00610 expand_preceding_fragment); 00611 } 00612 } 00613 } 00614 } 00615 } 00616 } 00617 if (fragments != NULL) { 00618 delete[] fragments; 00619 } 00620 // TODO(daria): maybe a threshold of badness for 00621 // worst_near_fragment would be useful. 00622 return worst_index_near_fragment != -1 ? 00623 worst_index_near_fragment : worst_index; 00624 } 00625 00626 /********************************************************************** 00627 * select_blob_to_split_from_fixpt 00628 * 00629 * Given the fix point from a dictionary search, if there is a single 00630 * dangerous blob that maps to multiple characters, return that blob 00631 * index as a place we need to split. If none, return -1. 00632 **********************************************************************/ 00633 int Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) { 00634 if (!fixpt) 00635 return -1; 00636 for (int i = 0; i < fixpt->size(); i++) { 00637 if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end && 00638 (*fixpt)[i].dangerous && 00639 (*fixpt)[i].correct_is_ngram) { 00640 return (*fixpt)[i].begin; 00641 } 00642 } 00643 return -1; 00644 } 00645 00646 00647 } // namespace tesseract 00648 00649 00650 /********************************************************************** 00651 * total_containment 00652 * 00653 * Check to see if one of these outlines is totally contained within 00654 * the bounding box of the other. 00655 **********************************************************************/ 00656 inT16 total_containment(TBLOB *blob1, TBLOB *blob2) { 00657 TBOX box1 = blob1->bounding_box(); 00658 TBOX box2 = blob2->bounding_box(); 00659 return box1.contains(box2) || box2.contains(box1); 00660 }