tesseract 3.04.01

ccstruct/werd.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        werd.cpp  (Formerly word.c)
00003  * Description: Code for the WERD class.
00004  * Author:      Ray Smith
00005  * Created:     Tue Oct 08 14:32:12 BST 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "blckerr.h"
00021 #include "helpers.h"
00022 #include "linlsq.h"
00023 #include "werd.h"
00024 
00025 // Include automatically generated configuration file if running autoconf.
00026 #ifdef HAVE_CONFIG_H
00027 #include "config_auto.h"
00028 #endif
00029 
00030 #define FIRST_COLOUR    ScrollView::RED         //< first rainbow colour; WERD::NextColor wraps back to this one
00031 #define LAST_COLOUR     ScrollView::AQUAMARINE  //< end of the rainbow range; WERD::NextColor wraps on reaching it
00032 #define CHILD_COLOUR    ScrollView::BROWN       //< third argument of the blob plot call in WERD::plot(window)
00033 
00034 const ERRCODE CANT_SCALE_EDGESTEPS =  // Error constant; no use of it is visible in this listing.
00035     "Attempted to scale an edgestep format word";
00036 
00037 ELIST2IZE(WERD)  // NOTE(review): presumably expands to the list support code for WERD - confirm in the list macro header.
00038 
00039 
00048 WERD::WERD(C_BLOB_LIST *blob_list, uinT8 blank_count, const char *text)
00049   : blanks(blank_count),
00050     flags(0),
00051     script_id_(0),
00052     correct(text) {
00053   C_BLOB_IT start_it = &cblobs;
00054   C_BLOB_IT rej_cblob_it = &rej_cblobs;
00055   C_OUTLINE_IT c_outline_it;
00056   inT16 inverted_vote = 0;
00057   inT16 non_inverted_vote = 0;
00058 
00059   // Move blob_list's elements into cblobs.
00060   start_it.add_list_after(blob_list);
00061 
00062   /*
00063     Set white on black flag for the WERD, moving any duff blobs onto the
00064     rej_cblobs list.
00065     First, walk the cblobs checking the inverse flag for each outline of each
00066     cblob. If a cblob has inconsistent flag settings for its different
00067     outlines, move the blob to the reject list. Otherwise, increment the
00068     appropriate w-on-b or b-on-w vote for the word.
00069 
00070     Now set the inversion flag for the WERD by maximum vote.
00071 
00072     Walk the blobs again, moving any blob whose inversion flag does not agree
00073     with the concencus onto the reject list.
00074   */
00075   start_it.set_to_list(&cblobs);
00076   if (start_it.empty())
00077     return;
00078   for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
00079     BOOL8 reject_blob = FALSE;
00080     BOOL8 blob_inverted;
00081 
00082     c_outline_it.set_to_list(start_it.data()->out_list());
00083     blob_inverted = c_outline_it.data()->flag(COUT_INVERSE);
00084     for (c_outline_it.mark_cycle_pt();
00085          !c_outline_it.cycled_list() && !reject_blob;
00086          c_outline_it.forward()) {
00087       reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted;
00088     }
00089     if (reject_blob) {
00090       rej_cblob_it.add_after_then_move(start_it.extract());
00091     } else {
00092       if (blob_inverted)
00093         inverted_vote++;
00094       else
00095         non_inverted_vote++;
00096     }
00097   }
00098 
00099   flags.set_bit(W_INVERSE, (inverted_vote > non_inverted_vote));
00100 
00101   start_it.set_to_list(&cblobs);
00102   if (start_it.empty())
00103     return;
00104   for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
00105     c_outline_it.set_to_list(start_it.data()->out_list());
00106     if (c_outline_it.data()->flag(COUT_INVERSE) != flags.bit(W_INVERSE))
00107       rej_cblob_it.add_after_then_move(start_it.extract());
00108   }
00109 }
00110 
00111 
00119 WERD::WERD(C_BLOB_LIST * blob_list,         //< In word order
00120            WERD * clone)                    //< Source of flags
00121   : flags(clone->flags),
00122     script_id_(clone->script_id_),
00123     correct(clone->correct) {
00124   C_BLOB_IT start_it = blob_list;  // iterator
00125   C_BLOB_IT end_it = blob_list;    // another
00126 
00127   while (!end_it.at_last ())
00128     end_it.forward ();           //move to last
00129   ((C_BLOB_LIST *) (&cblobs))->assign_to_sublist (&start_it, &end_it);
00130   //move to our list
00131   blanks = clone->blanks;
00132   //      fprintf(stderr,"Wrong constructor!!!!\n");
00133 }
00134 
00135 // Construct a WERD from a single_blob and clone the flags from this.
00136 // W_BOL and W_EOL flags are set according to the given values.
00137 WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) {
00138   C_BLOB_LIST temp_blobs;
00139   C_BLOB_IT temp_it(&temp_blobs);
00140   temp_it.add_after_then_move(blob);
00141   WERD* blob_word = new WERD(&temp_blobs, this);
00142   blob_word->set_flag(W_BOL, bol);
00143   blob_word->set_flag(W_EOL, eol);
00144   return blob_word;
00145 }
00146 
00160 TBOX WERD::bounding_box() const { return restricted_bounding_box(true, true); }
00161 
00162 // Returns the bounding box including the desired combination of upper and
00163 // lower noise/diacritic elements.
00164 TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
00165   TBOX box = true_bounding_box();
00166   int bottom = box.bottom();
00167   int top = box.top();
00168   // This is a read-only iteration of the rejected blobs.
00169   C_BLOB_IT it(const_cast<C_BLOB_LIST*>(&rej_cblobs));
00170   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00171     TBOX dot_box = it.data()->bounding_box();
00172     if ((upper_dots || dot_box.bottom() <= top) &&
00173         (lower_dots || dot_box.top() >= bottom)) {
00174       box += dot_box;
00175     }
00176   }
00177   return box;
00178 }
00179 
00180 // Returns the bounding box of only the good blobs.
00181 TBOX WERD::true_bounding_box() const {
00182   TBOX box;  // box being built
00183   // This is a read-only iteration of the good blobs.
00184   C_BLOB_IT it(const_cast<C_BLOB_LIST*>(&cblobs));
00185   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00186     box += it.data()->bounding_box();
00187   }
00188   return box;
00189 }
00190 
00198 void WERD::move(const ICOORD vec) {
00199   C_BLOB_IT cblob_it(&cblobs);  // cblob iterator
00200 
00201   for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward())
00202     cblob_it.data()->move(vec);
00203 }
00204 
00211 void WERD::join_on(WERD* other) {
00212   C_BLOB_IT blob_it(&cblobs);
00213   C_BLOB_IT src_it(&other->cblobs);
00214   C_BLOB_IT rej_cblob_it(&rej_cblobs);
00215   C_BLOB_IT src_rej_it(&other->rej_cblobs);
00216 
00217   while (!src_it.empty()) {
00218     blob_it.add_to_end(src_it.extract());
00219     src_it.forward();
00220   }
00221   while (!src_rej_it.empty()) {
00222     rej_cblob_it.add_to_end(src_rej_it.extract());
00223     src_rej_it.forward();
00224   }
00225 }
00226 
00227 
00234 void WERD::copy_on(WERD* other) {
00235   bool reversed = other->bounding_box().left() < bounding_box().left();
00236   C_BLOB_IT c_blob_it(&cblobs);
00237   C_BLOB_LIST c_blobs;
00238 
00239   c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy);
00240   if (reversed) {
00241     c_blob_it.add_list_before(&c_blobs);
00242   } else {
00243     c_blob_it.move_to_last();
00244     c_blob_it.add_list_after(&c_blobs);
00245   }
00246   if (!other->rej_cblobs.empty()) {
00247     C_BLOB_IT rej_c_blob_it(&rej_cblobs);
00248     C_BLOB_LIST new_rej_c_blobs;
00249 
00250     new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy);
00251     if (reversed) {
00252       rej_c_blob_it.add_list_before(&new_rej_c_blobs);
00253     } else {
00254       rej_c_blob_it.move_to_last();
00255       rej_c_blob_it.add_list_after(&new_rej_c_blobs);
00256     }
00257   }
00258 }
00259 
00266 void WERD::print() {
00267   tprintf("Blanks= %d\n", blanks);
00268   bounding_box().print();
00269   tprintf("Flags = %d = 0%o\n", flags.val, flags.val);
00270   tprintf("   W_SEGMENTED = %s\n", flags.bit(W_SEGMENTED) ? "TRUE" : "FALSE ");
00271   tprintf("   W_ITALIC = %s\n", flags.bit(W_ITALIC) ? "TRUE" : "FALSE ");
00272   tprintf("   W_BOL = %s\n", flags.bit(W_BOL) ? "TRUE" : "FALSE ");
00273   tprintf("   W_EOL = %s\n", flags.bit(W_EOL) ? "TRUE" : "FALSE ");
00274   tprintf("   W_NORMALIZED = %s\n",
00275           flags.bit(W_NORMALIZED) ? "TRUE" : "FALSE ");
00276   tprintf("   W_SCRIPT_HAS_XHEIGHT = %s\n",
00277           flags.bit(W_SCRIPT_HAS_XHEIGHT) ? "TRUE" : "FALSE ");
00278   tprintf("   W_SCRIPT_IS_LATIN = %s\n",
00279           flags.bit(W_SCRIPT_IS_LATIN) ? "TRUE" : "FALSE ");
00280   tprintf("   W_DONT_CHOP = %s\n", flags.bit(W_DONT_CHOP) ? "TRUE" : "FALSE ");
00281   tprintf("   W_REP_CHAR = %s\n", flags.bit(W_REP_CHAR) ? "TRUE" : "FALSE ");
00282   tprintf("   W_FUZZY_SP = %s\n", flags.bit(W_FUZZY_SP) ? "TRUE" : "FALSE ");
00283   tprintf("   W_FUZZY_NON = %s\n", flags.bit(W_FUZZY_NON) ? "TRUE" : "FALSE ");
00284   tprintf("Correct= %s\n", correct.string());
00285   tprintf("Rejected cblob count = %d\n", rej_cblobs.length());
00286   tprintf("Script = %d\n", script_id_);
00287 }
00288 
00289 
00296 #ifndef GRAPHICS_DISABLED
00297 void WERD::plot(ScrollView *window, ScrollView::Color colour) {
00298   C_BLOB_IT it = &cblobs;
00299   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00300     it.data()->plot(window, colour, colour);
00301   }
00302   plot_rej_blobs(window);
00303 }
00304 
00305 // Get the next color in the (looping) rainbow.
00306 ScrollView::Color WERD::NextColor(ScrollView::Color colour) {
00307   ScrollView::Color next = static_cast<ScrollView::Color>(colour + 1);
00308   if (next >= LAST_COLOUR || next < FIRST_COLOUR)
00309     next = FIRST_COLOUR;
00310   return next;
00311 }
00312 
00319 void WERD::plot(ScrollView* window) {
00320   ScrollView::Color colour = FIRST_COLOUR;
00321   C_BLOB_IT it = &cblobs;
00322   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00323     it.data()->plot(window, colour, CHILD_COLOUR);
00324     colour = NextColor(colour);
00325   }
00326   plot_rej_blobs(window);
00327 }
00328 
00329 
00337 void WERD::plot_rej_blobs(ScrollView *window) {
00338   C_BLOB_IT it = &rej_cblobs;
00339   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00340     it.data()->plot(window, ScrollView::GREY, ScrollView::GREY);
00341   }
00342 }
00343 #endif  // GRAPHICS_DISABLED
00344 
00345 
00352 WERD *WERD::shallow_copy() {
00353   WERD *new_word = new WERD;
00354 
00355   new_word->blanks = blanks;
00356   new_word->flags = flags;
00357   new_word->dummy = dummy;
00358   new_word->correct = correct;
00359   return new_word;
00360 }
00361 
00362 
00369 WERD & WERD::operator= (const WERD & source) {
00370   this->ELIST2_LINK::operator= (source);
00371   blanks = source.blanks;
00372   flags = source.flags;
00373   script_id_ = source.script_id_;
00374   dummy = source.dummy;
00375   correct = source.correct;
00376   if (!cblobs.empty())
00377     cblobs.clear();
00378   cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy);
00379 
00380   if (!rej_cblobs.empty())
00381     rej_cblobs.clear();
00382   rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy);
00383   return *this;
00384 }
00385 
00386 
00394 int word_comparator(const void *word1p, const void *word2p) {
00395   WERD *word1 = *(WERD **)word1p;
00396   WERD *word2 = *(WERD **)word2p;
00397   return word1->bounding_box().left() - word2->bounding_box().left();
00398 }
00399 
00412 WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
00413                                       C_BLOB_LIST* orphan_blobs) {
00414   C_BLOB_LIST current_blob_list;
00415   C_BLOB_IT werd_blobs_it(&current_blob_list);
00416   // Add the word's c_blobs.
00417   werd_blobs_it.add_list_after(cblob_list());
00418 
00419   // New blob list. These contain the blobs which will form the new word.
00420   C_BLOB_LIST new_werd_blobs;
00421   C_BLOB_IT new_blobs_it(&new_werd_blobs);
00422 
00423   // not_found_blobs contains the list of current word's blobs for which a
00424   // corresponding blob wasn't found in the input all_blobs list.
00425   C_BLOB_LIST not_found_blobs;
00426   C_BLOB_IT not_found_it(&not_found_blobs);
00427   not_found_it.move_to_last();
00428 
00429   werd_blobs_it.move_to_first();
00430   for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list();
00431        werd_blobs_it.forward()) {
00432     C_BLOB* werd_blob = werd_blobs_it.extract();
00433     TBOX werd_blob_box = werd_blob->bounding_box();
00434     bool found = false;
00435     // Now find the corresponding blob for this blob in the all_blobs
00436     // list. For now, follow the inefficient method of pairwise
00437     // comparisons. Ideally, one can pre-bucket the blobs by row.
00438     C_BLOB_IT all_blobs_it(all_blobs);
00439     for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
00440          all_blobs_it.forward()) {
00441       C_BLOB* a_blob = all_blobs_it.data();
00442       // Compute the overlap of the two blobs. If major, a_blob should
00443       // be added to the new blobs list.
00444       TBOX a_blob_box = a_blob->bounding_box();
00445       if (a_blob_box.null_box()) {
00446         tprintf("Bounding box couldn't be ascertained\n");
00447       }
00448       if (werd_blob_box.contains(a_blob_box) ||
00449           werd_blob_box.major_overlap(a_blob_box)) {
00450         // Old blobs are from minimal splits, therefore are expected to be
00451         // bigger. The new small blobs should cover a significant portion.
00452         // This is it.
00453         all_blobs_it.extract();
00454         new_blobs_it.add_after_then_move(a_blob);
00455         found = true;
00456       }
00457     }
00458     if (!found) {
00459       not_found_it.add_after_then_move(werd_blob);
00460     } else {
00461       delete werd_blob;
00462     }
00463   }
00464   // Iterate over all not found blobs. Some of them may be due to
00465   // under-segmentation (which is OK, since the corresponding blob is already
00466   // in the list in that case.
00467   not_found_it.move_to_first();
00468   for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
00469        not_found_it.forward()) {
00470     C_BLOB* not_found = not_found_it.data();
00471     TBOX not_found_box = not_found->bounding_box();
00472     C_BLOB_IT existing_blobs_it(new_blobs_it);
00473     for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list();
00474          existing_blobs_it.forward()) {
00475       C_BLOB* a_blob = existing_blobs_it.data();
00476       TBOX a_blob_box = a_blob->bounding_box();
00477       if ((not_found_box.major_overlap(a_blob_box) ||
00478            a_blob_box.major_overlap(not_found_box)) &&
00479            not_found_box.y_overlap_fraction(a_blob_box) > 0.8) {
00480         // Already taken care of.
00481         delete not_found_it.extract();
00482         break;
00483       }
00484     }
00485   }
00486   if (orphan_blobs) {
00487     C_BLOB_IT orphan_blobs_it(orphan_blobs);
00488     orphan_blobs_it.move_to_last();
00489     orphan_blobs_it.add_list_after(&not_found_blobs);
00490   }
00491 
00492   // New blobs are ready. Create a new werd object with these.
00493   WERD* new_werd = NULL;
00494   if (!new_werd_blobs.empty()) {
00495     new_werd = new WERD(&new_werd_blobs, this);
00496   } else {
00497     // Add the blobs back to this word so that it can be reused.
00498     C_BLOB_IT this_list_it(cblob_list());
00499     this_list_it.add_list_after(&not_found_blobs);
00500   }
00501   return new_werd;
00502 }
00503 
00504 // Removes noise from the word by moving small outlines to the rej_cblobs
00505 // list, based on the size_threshold.
00506 void WERD::CleanNoise(float size_threshold) {
00507   C_BLOB_IT blob_it(&cblobs);
00508   C_BLOB_IT rej_it(&rej_cblobs);
00509   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00510     C_BLOB* blob = blob_it.data();
00511     C_OUTLINE_IT ol_it(blob->out_list());
00512     for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
00513       C_OUTLINE* outline = ol_it.data();
00514       TBOX ol_box = outline->bounding_box();
00515       int ol_size =
00516           ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height();
00517       if (ol_size < size_threshold) {
00518         // This outline is too small. Move it to a separate blob in the
00519         // reject blobs list.
00520         C_BLOB* rej_blob = new C_BLOB(ol_it.extract());
00521         rej_it.add_after_then_move(rej_blob);
00522       }
00523     }
00524     if (blob->out_list()->empty()) delete blob_it.extract();
00525   }
00526 }
00527 
00528 // Extracts all the noise outlines and stuffs the pointers into the given
00529 // vector of outlines. Afterwards, the outlines vector owns the pointers.
00530 void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE*>* outlines) {
00531   C_BLOB_IT rej_it(&rej_cblobs);
00532   for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) {
00533     C_BLOB* blob = rej_it.extract();
00534     C_OUTLINE_IT ol_it(blob->out_list());
00535     outlines->push_back(ol_it.extract());
00536     delete blob;
00537   }
00538 }
00539 
00540 // Adds the selected outlines to the indcated real blobs, and puts the rest
00541 // back in rej_cblobs where they came from. Where the target_blobs entry is
00542 // NULL, a run of wanted outlines is put into a single new blob.
00543 // Ownership of the outlines is transferred back to the word. (Hence
00544 // GenericVector and not PointerVector.)
00545 // Returns true if any new blob was added to the start of the word, which
00546 // suggests that it might need joining to the word before it, and likewise
00547 // sets make_next_word_fuzzy true if any new blob was added to the end.
00548 bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted,
00549                                const GenericVector<C_BLOB*>& target_blobs,
00550                                const GenericVector<C_OUTLINE*>& outlines,
00551                                bool* make_next_word_fuzzy) {
00552   bool outline_added_to_start = false;
00553   if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = false;
00554   C_BLOB_IT rej_it(&rej_cblobs);
00555   for (int i = 0; i < outlines.size(); ++i) {
00556     C_OUTLINE* outline = outlines[i];
00557     if (outline == NULL) continue;  // Already used it.
00558     if (wanted[i]) {
00559       C_BLOB* target_blob = target_blobs[i];
00560       TBOX noise_box = outline->bounding_box();
00561       if (target_blob == NULL) {
00562         target_blob = new C_BLOB(outline);
00563         // Need to find the insertion point.
00564         C_BLOB_IT blob_it(&cblobs);
00565         for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
00566              blob_it.forward()) {
00567           C_BLOB* blob = blob_it.data();
00568           TBOX blob_box = blob->bounding_box();
00569           if (blob_box.left() > noise_box.left()) {
00570             if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {
00571               // We might want to join this word to its predecessor.
00572               outline_added_to_start = true;
00573             }
00574             blob_it.add_before_stay_put(target_blob);
00575             break;
00576           }
00577         }
00578         if (blob_it.cycled_list()) {
00579           blob_it.add_to_end(target_blob);
00580           if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = true;
00581         }
00582         // Add all consecutive wanted, but null-blob outlines to same blob.
00583         C_OUTLINE_IT ol_it(target_blob->out_list());
00584         while (i + 1 < outlines.size() && wanted[i + 1] &&
00585                target_blobs[i + 1] == NULL) {
00586           ++i;
00587           ol_it.add_to_end(outlines[i]);
00588         }
00589       } else {
00590         // Insert outline into this blob.
00591         C_OUTLINE_IT ol_it(target_blob->out_list());
00592         ol_it.add_to_end(outline);
00593       }
00594     } else {
00595       // Put back on noise list.
00596       rej_it.add_to_end(new C_BLOB(outline));
00597     }
00598   }
00599   return outline_added_to_start;
00600 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines