|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: werd.cpp (Formerly word.c) 00003 * Description: Code for the WERD class. 00004 * Author: Ray Smith 00005 * Created: Tue Oct 08 14:32:12 BST 1991 00006 * 00007 * (C) Copyright 1991, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "blckerr.h" 00021 #include "helpers.h" 00022 #include "linlsq.h" 00023 #include "werd.h" 00024 00025 // Include automatically generated configuration file if running autoconf. 00026 #ifdef HAVE_CONFIG_H 00027 #include "config_auto.h" 00028 #endif 00029 00030 #define FIRST_COLOUR ScrollView::RED //< first rainbow colour 00031 #define LAST_COLOUR ScrollView::AQUAMARINE //< last rainbow colour 00032 #define CHILD_COLOUR ScrollView::BROWN //< colour of children 00033 00034 const ERRCODE CANT_SCALE_EDGESTEPS = 00035 "Attempted to scale an edgestep format word"; 00036 00037 ELIST2IZE(WERD) 00038 00039 00048 WERD::WERD(C_BLOB_LIST *blob_list, uinT8 blank_count, const char *text) 00049 : blanks(blank_count), 00050 flags(0), 00051 script_id_(0), 00052 correct(text) { 00053 C_BLOB_IT start_it = &cblobs; 00054 C_BLOB_IT rej_cblob_it = &rej_cblobs; 00055 C_OUTLINE_IT c_outline_it; 00056 inT16 inverted_vote = 0; 00057 inT16 non_inverted_vote = 0; 00058 00059 // Move blob_list's elements into cblobs. 00060 start_it.add_list_after(blob_list); 00061 00062 /* 00063 Set white on black flag for the WERD, moving any duff blobs onto the 00064 rej_cblobs list. 00065 First, walk the cblobs checking the inverse flag for each outline of each 00066 cblob. If a cblob has inconsistent flag settings for its different 00067 outlines, move the blob to the reject list. Otherwise, increment the 00068 appropriate w-on-b or b-on-w vote for the word. 00069 00070 Now set the inversion flag for the WERD by maximum vote. 00071 00072 Walk the blobs again, moving any blob whose inversion flag does not agree 00073 with the concencus onto the reject list. 00074 */ 00075 start_it.set_to_list(&cblobs); 00076 if (start_it.empty()) 00077 return; 00078 for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { 00079 BOOL8 reject_blob = FALSE; 00080 BOOL8 blob_inverted; 00081 00082 c_outline_it.set_to_list(start_it.data()->out_list()); 00083 blob_inverted = c_outline_it.data()->flag(COUT_INVERSE); 00084 for (c_outline_it.mark_cycle_pt(); 00085 !c_outline_it.cycled_list() && !reject_blob; 00086 c_outline_it.forward()) { 00087 reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted; 00088 } 00089 if (reject_blob) { 00090 rej_cblob_it.add_after_then_move(start_it.extract()); 00091 } else { 00092 if (blob_inverted) 00093 inverted_vote++; 00094 else 00095 non_inverted_vote++; 00096 } 00097 } 00098 00099 flags.set_bit(W_INVERSE, (inverted_vote > non_inverted_vote)); 00100 00101 start_it.set_to_list(&cblobs); 00102 if (start_it.empty()) 00103 return; 00104 for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { 00105 c_outline_it.set_to_list(start_it.data()->out_list()); 00106 if (c_outline_it.data()->flag(COUT_INVERSE) != flags.bit(W_INVERSE)) 00107 rej_cblob_it.add_after_then_move(start_it.extract()); 00108 } 00109 } 00110 00111 00119 WERD::WERD(C_BLOB_LIST * blob_list, //< In word order 00120 WERD * clone) //< Source of flags 00121 : flags(clone->flags), 00122 script_id_(clone->script_id_), 00123 correct(clone->correct) { 00124 C_BLOB_IT start_it = blob_list; // iterator 00125 C_BLOB_IT end_it = blob_list; // another 00126 00127 while (!end_it.at_last ()) 00128 end_it.forward (); //move to last 00129 ((C_BLOB_LIST *) (&cblobs))->assign_to_sublist (&start_it, &end_it); 00130 //move to our list 00131 blanks = clone->blanks; 00132 // fprintf(stderr,"Wrong constructor!!!!\n"); 00133 } 00134 00135 // Construct a WERD from a single_blob and clone the flags from this. 00136 // W_BOL and W_EOL flags are set according to the given values. 00137 WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) { 00138 C_BLOB_LIST temp_blobs; 00139 C_BLOB_IT temp_it(&temp_blobs); 00140 temp_it.add_after_then_move(blob); 00141 WERD* blob_word = new WERD(&temp_blobs, this); 00142 blob_word->set_flag(W_BOL, bol); 00143 blob_word->set_flag(W_EOL, eol); 00144 return blob_word; 00145 } 00146 00160 TBOX WERD::bounding_box() const { return restricted_bounding_box(true, true); } 00161 00162 // Returns the bounding box including the desired combination of upper and 00163 // lower noise/diacritic elements. 00164 TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const { 00165 TBOX box = true_bounding_box(); 00166 int bottom = box.bottom(); 00167 int top = box.top(); 00168 // This is a read-only iteration of the rejected blobs. 00169 C_BLOB_IT it(const_cast<C_BLOB_LIST*>(&rej_cblobs)); 00170 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00171 TBOX dot_box = it.data()->bounding_box(); 00172 if ((upper_dots || dot_box.bottom() <= top) && 00173 (lower_dots || dot_box.top() >= bottom)) { 00174 box += dot_box; 00175 } 00176 } 00177 return box; 00178 } 00179 00180 // Returns the bounding box of only the good blobs. 00181 TBOX WERD::true_bounding_box() const { 00182 TBOX box; // box being built 00183 // This is a read-only iteration of the good blobs. 00184 C_BLOB_IT it(const_cast<C_BLOB_LIST*>(&cblobs)); 00185 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00186 box += it.data()->bounding_box(); 00187 } 00188 return box; 00189 } 00190 00198 void WERD::move(const ICOORD vec) { 00199 C_BLOB_IT cblob_it(&cblobs); // cblob iterator 00200 00201 for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) 00202 cblob_it.data()->move(vec); 00203 } 00204 00211 void WERD::join_on(WERD* other) { 00212 C_BLOB_IT blob_it(&cblobs); 00213 C_BLOB_IT src_it(&other->cblobs); 00214 C_BLOB_IT rej_cblob_it(&rej_cblobs); 00215 C_BLOB_IT src_rej_it(&other->rej_cblobs); 00216 00217 while (!src_it.empty()) { 00218 blob_it.add_to_end(src_it.extract()); 00219 src_it.forward(); 00220 } 00221 while (!src_rej_it.empty()) { 00222 rej_cblob_it.add_to_end(src_rej_it.extract()); 00223 src_rej_it.forward(); 00224 } 00225 } 00226 00227 00234 void WERD::copy_on(WERD* other) { 00235 bool reversed = other->bounding_box().left() < bounding_box().left(); 00236 C_BLOB_IT c_blob_it(&cblobs); 00237 C_BLOB_LIST c_blobs; 00238 00239 c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy); 00240 if (reversed) { 00241 c_blob_it.add_list_before(&c_blobs); 00242 } else { 00243 c_blob_it.move_to_last(); 00244 c_blob_it.add_list_after(&c_blobs); 00245 } 00246 if (!other->rej_cblobs.empty()) { 00247 C_BLOB_IT rej_c_blob_it(&rej_cblobs); 00248 C_BLOB_LIST new_rej_c_blobs; 00249 00250 new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy); 00251 if (reversed) { 00252 rej_c_blob_it.add_list_before(&new_rej_c_blobs); 00253 } else { 00254 rej_c_blob_it.move_to_last(); 00255 rej_c_blob_it.add_list_after(&new_rej_c_blobs); 00256 } 00257 } 00258 } 00259 00266 void WERD::print() { 00267 tprintf("Blanks= %d\n", blanks); 00268 bounding_box().print(); 00269 tprintf("Flags = %d = 0%o\n", flags.val, flags.val); 00270 tprintf(" W_SEGMENTED = %s\n", flags.bit(W_SEGMENTED) ? "TRUE" : "FALSE "); 00271 tprintf(" W_ITALIC = %s\n", flags.bit(W_ITALIC) ? "TRUE" : "FALSE "); 00272 tprintf(" W_BOL = %s\n", flags.bit(W_BOL) ? "TRUE" : "FALSE "); 00273 tprintf(" W_EOL = %s\n", flags.bit(W_EOL) ? "TRUE" : "FALSE "); 00274 tprintf(" W_NORMALIZED = %s\n", 00275 flags.bit(W_NORMALIZED) ? "TRUE" : "FALSE "); 00276 tprintf(" W_SCRIPT_HAS_XHEIGHT = %s\n", 00277 flags.bit(W_SCRIPT_HAS_XHEIGHT) ? "TRUE" : "FALSE "); 00278 tprintf(" W_SCRIPT_IS_LATIN = %s\n", 00279 flags.bit(W_SCRIPT_IS_LATIN) ? "TRUE" : "FALSE "); 00280 tprintf(" W_DONT_CHOP = %s\n", flags.bit(W_DONT_CHOP) ? "TRUE" : "FALSE "); 00281 tprintf(" W_REP_CHAR = %s\n", flags.bit(W_REP_CHAR) ? "TRUE" : "FALSE "); 00282 tprintf(" W_FUZZY_SP = %s\n", flags.bit(W_FUZZY_SP) ? "TRUE" : "FALSE "); 00283 tprintf(" W_FUZZY_NON = %s\n", flags.bit(W_FUZZY_NON) ? "TRUE" : "FALSE "); 00284 tprintf("Correct= %s\n", correct.string()); 00285 tprintf("Rejected cblob count = %d\n", rej_cblobs.length()); 00286 tprintf("Script = %d\n", script_id_); 00287 } 00288 00289 00296 #ifndef GRAPHICS_DISABLED 00297 void WERD::plot(ScrollView *window, ScrollView::Color colour) { 00298 C_BLOB_IT it = &cblobs; 00299 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00300 it.data()->plot(window, colour, colour); 00301 } 00302 plot_rej_blobs(window); 00303 } 00304 00305 // Get the next color in the (looping) rainbow. 00306 ScrollView::Color WERD::NextColor(ScrollView::Color colour) { 00307 ScrollView::Color next = static_cast<ScrollView::Color>(colour + 1); 00308 if (next >= LAST_COLOUR || next < FIRST_COLOUR) 00309 next = FIRST_COLOUR; 00310 return next; 00311 } 00312 00319 void WERD::plot(ScrollView* window) { 00320 ScrollView::Color colour = FIRST_COLOUR; 00321 C_BLOB_IT it = &cblobs; 00322 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00323 it.data()->plot(window, colour, CHILD_COLOUR); 00324 colour = NextColor(colour); 00325 } 00326 plot_rej_blobs(window); 00327 } 00328 00329 00337 void WERD::plot_rej_blobs(ScrollView *window) { 00338 C_BLOB_IT it = &rej_cblobs; 00339 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00340 it.data()->plot(window, ScrollView::GREY, ScrollView::GREY); 00341 } 00342 } 00343 #endif // GRAPHICS_DISABLED 00344 00345 00352 WERD *WERD::shallow_copy() { 00353 WERD *new_word = new WERD; 00354 00355 new_word->blanks = blanks; 00356 new_word->flags = flags; 00357 new_word->dummy = dummy; 00358 new_word->correct = correct; 00359 return new_word; 00360 } 00361 00362 00369 WERD & WERD::operator= (const WERD & source) { 00370 this->ELIST2_LINK::operator= (source); 00371 blanks = source.blanks; 00372 flags = source.flags; 00373 script_id_ = source.script_id_; 00374 dummy = source.dummy; 00375 correct = source.correct; 00376 if (!cblobs.empty()) 00377 cblobs.clear(); 00378 cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy); 00379 00380 if (!rej_cblobs.empty()) 00381 rej_cblobs.clear(); 00382 rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy); 00383 return *this; 00384 } 00385 00386 00394 int word_comparator(const void *word1p, const void *word2p) { 00395 WERD *word1 = *(WERD **)word1p; 00396 WERD *word2 = *(WERD **)word2p; 00397 return word1->bounding_box().left() - word2->bounding_box().left(); 00398 } 00399 00412 WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs, 00413 C_BLOB_LIST* orphan_blobs) { 00414 C_BLOB_LIST current_blob_list; 00415 C_BLOB_IT werd_blobs_it(¤t_blob_list); 00416 // Add the word's c_blobs. 00417 werd_blobs_it.add_list_after(cblob_list()); 00418 00419 // New blob list. These contain the blobs which will form the new word. 00420 C_BLOB_LIST new_werd_blobs; 00421 C_BLOB_IT new_blobs_it(&new_werd_blobs); 00422 00423 // not_found_blobs contains the list of current word's blobs for which a 00424 // corresponding blob wasn't found in the input all_blobs list. 00425 C_BLOB_LIST not_found_blobs; 00426 C_BLOB_IT not_found_it(¬_found_blobs); 00427 not_found_it.move_to_last(); 00428 00429 werd_blobs_it.move_to_first(); 00430 for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list(); 00431 werd_blobs_it.forward()) { 00432 C_BLOB* werd_blob = werd_blobs_it.extract(); 00433 TBOX werd_blob_box = werd_blob->bounding_box(); 00434 bool found = false; 00435 // Now find the corresponding blob for this blob in the all_blobs 00436 // list. For now, follow the inefficient method of pairwise 00437 // comparisons. Ideally, one can pre-bucket the blobs by row. 00438 C_BLOB_IT all_blobs_it(all_blobs); 00439 for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); 00440 all_blobs_it.forward()) { 00441 C_BLOB* a_blob = all_blobs_it.data(); 00442 // Compute the overlap of the two blobs. If major, a_blob should 00443 // be added to the new blobs list. 00444 TBOX a_blob_box = a_blob->bounding_box(); 00445 if (a_blob_box.null_box()) { 00446 tprintf("Bounding box couldn't be ascertained\n"); 00447 } 00448 if (werd_blob_box.contains(a_blob_box) || 00449 werd_blob_box.major_overlap(a_blob_box)) { 00450 // Old blobs are from minimal splits, therefore are expected to be 00451 // bigger. The new small blobs should cover a significant portion. 00452 // This is it. 00453 all_blobs_it.extract(); 00454 new_blobs_it.add_after_then_move(a_blob); 00455 found = true; 00456 } 00457 } 00458 if (!found) { 00459 not_found_it.add_after_then_move(werd_blob); 00460 } else { 00461 delete werd_blob; 00462 } 00463 } 00464 // Iterate over all not found blobs. Some of them may be due to 00465 // under-segmentation (which is OK, since the corresponding blob is already 00466 // in the list in that case. 00467 not_found_it.move_to_first(); 00468 for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); 00469 not_found_it.forward()) { 00470 C_BLOB* not_found = not_found_it.data(); 00471 TBOX not_found_box = not_found->bounding_box(); 00472 C_BLOB_IT existing_blobs_it(new_blobs_it); 00473 for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list(); 00474 existing_blobs_it.forward()) { 00475 C_BLOB* a_blob = existing_blobs_it.data(); 00476 TBOX a_blob_box = a_blob->bounding_box(); 00477 if ((not_found_box.major_overlap(a_blob_box) || 00478 a_blob_box.major_overlap(not_found_box)) && 00479 not_found_box.y_overlap_fraction(a_blob_box) > 0.8) { 00480 // Already taken care of. 00481 delete not_found_it.extract(); 00482 break; 00483 } 00484 } 00485 } 00486 if (orphan_blobs) { 00487 C_BLOB_IT orphan_blobs_it(orphan_blobs); 00488 orphan_blobs_it.move_to_last(); 00489 orphan_blobs_it.add_list_after(¬_found_blobs); 00490 } 00491 00492 // New blobs are ready. Create a new werd object with these. 00493 WERD* new_werd = NULL; 00494 if (!new_werd_blobs.empty()) { 00495 new_werd = new WERD(&new_werd_blobs, this); 00496 } else { 00497 // Add the blobs back to this word so that it can be reused. 00498 C_BLOB_IT this_list_it(cblob_list()); 00499 this_list_it.add_list_after(¬_found_blobs); 00500 } 00501 return new_werd; 00502 } 00503 00504 // Removes noise from the word by moving small outlines to the rej_cblobs 00505 // list, based on the size_threshold. 00506 void WERD::CleanNoise(float size_threshold) { 00507 C_BLOB_IT blob_it(&cblobs); 00508 C_BLOB_IT rej_it(&rej_cblobs); 00509 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00510 C_BLOB* blob = blob_it.data(); 00511 C_OUTLINE_IT ol_it(blob->out_list()); 00512 for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { 00513 C_OUTLINE* outline = ol_it.data(); 00514 TBOX ol_box = outline->bounding_box(); 00515 int ol_size = 00516 ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height(); 00517 if (ol_size < size_threshold) { 00518 // This outline is too small. Move it to a separate blob in the 00519 // reject blobs list. 00520 C_BLOB* rej_blob = new C_BLOB(ol_it.extract()); 00521 rej_it.add_after_then_move(rej_blob); 00522 } 00523 } 00524 if (blob->out_list()->empty()) delete blob_it.extract(); 00525 } 00526 } 00527 00528 // Extracts all the noise outlines and stuffs the pointers into the given 00529 // vector of outlines. Afterwards, the outlines vector owns the pointers. 00530 void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE*>* outlines) { 00531 C_BLOB_IT rej_it(&rej_cblobs); 00532 for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) { 00533 C_BLOB* blob = rej_it.extract(); 00534 C_OUTLINE_IT ol_it(blob->out_list()); 00535 outlines->push_back(ol_it.extract()); 00536 delete blob; 00537 } 00538 } 00539 00540 // Adds the selected outlines to the indcated real blobs, and puts the rest 00541 // back in rej_cblobs where they came from. Where the target_blobs entry is 00542 // NULL, a run of wanted outlines is put into a single new blob. 00543 // Ownership of the outlines is transferred back to the word. (Hence 00544 // GenericVector and not PointerVector.) 00545 // Returns true if any new blob was added to the start of the word, which 00546 // suggests that it might need joining to the word before it, and likewise 00547 // sets make_next_word_fuzzy true if any new blob was added to the end. 00548 bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted, 00549 const GenericVector<C_BLOB*>& target_blobs, 00550 const GenericVector<C_OUTLINE*>& outlines, 00551 bool* make_next_word_fuzzy) { 00552 bool outline_added_to_start = false; 00553 if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = false; 00554 C_BLOB_IT rej_it(&rej_cblobs); 00555 for (int i = 0; i < outlines.size(); ++i) { 00556 C_OUTLINE* outline = outlines[i]; 00557 if (outline == NULL) continue; // Already used it. 00558 if (wanted[i]) { 00559 C_BLOB* target_blob = target_blobs[i]; 00560 TBOX noise_box = outline->bounding_box(); 00561 if (target_blob == NULL) { 00562 target_blob = new C_BLOB(outline); 00563 // Need to find the insertion point. 00564 C_BLOB_IT blob_it(&cblobs); 00565 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); 00566 blob_it.forward()) { 00567 C_BLOB* blob = blob_it.data(); 00568 TBOX blob_box = blob->bounding_box(); 00569 if (blob_box.left() > noise_box.left()) { 00570 if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) { 00571 // We might want to join this word to its predecessor. 00572 outline_added_to_start = true; 00573 } 00574 blob_it.add_before_stay_put(target_blob); 00575 break; 00576 } 00577 } 00578 if (blob_it.cycled_list()) { 00579 blob_it.add_to_end(target_blob); 00580 if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = true; 00581 } 00582 // Add all consecutive wanted, but null-blob outlines to same blob. 00583 C_OUTLINE_IT ol_it(target_blob->out_list()); 00584 while (i + 1 < outlines.size() && wanted[i + 1] && 00585 target_blobs[i + 1] == NULL) { 00586 ++i; 00587 ol_it.add_to_end(outlines[i]); 00588 } 00589 } else { 00590 // Insert outline into this blob. 00591 C_OUTLINE_IT ol_it(target_blob->out_list()); 00592 ol_it.add_to_end(outline); 00593 } 00594 } else { 00595 // Put back on noise list. 00596 rej_it.add_to_end(new C_BLOB(outline)); 00597 } 00598 } 00599 return outline_added_to_start; 00600 }