tesseract 3.04.01

ccstruct/werd.h

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:                        werd.h
00003  * Description: Code for the WERD class.
00004  * Author:              Ray Smith
00005  * Created:             Tue Oct 08 14:32:12 BST 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef           WERD_H
00021 #define           WERD_H
00022 
00023 #include          "params.h"
00024 #include          "bits16.h"
00025 #include          "elst2.h"
00026 #include          "strngs.h"
00027 #include          "blckerr.h"
00028 #include          "stepblob.h"
00029 
00030 enum WERD_FLAGS                  // Bit numbers within a word's flags; the values passed to WERD::flag() / WERD::set_flag().
00031 {
00032   W_SEGMENTED,                   ///< correctly segmented
00033   W_ITALIC,                      ///< italic text
00034   W_BOLD,                        ///< bold text
00035   W_BOL,                         ///< start of line
00036   W_EOL,                         ///< end of line
00037   W_NORMALIZED,                  ///< normalized (original note said only "flags"; exact meaning not visible here)
00038   W_SCRIPT_HAS_XHEIGHT,          ///< x-height concept makes sense.
00039   W_SCRIPT_IS_LATIN,             ///< Special case latin for y. splitting.
00040   W_DONT_CHOP,                   ///< fixed pitch chopped (presumably: must not be chopped again -- TODO confirm)
00041   W_REP_CHAR,                    ///< repeated character
00042   W_FUZZY_SP,                    ///< fuzzy space
00043   W_FUZZY_NON,                   ///< fuzzy nonspace
00044   W_INVERSE                      ///< white on black
00045 };
00046 
00047 enum DISPLAY_FLAGS               // Bit numbers for display flags; presumably the values given to WERD::display_flag() / set_display_flag().
00048 {
00049   /* Display flags bit number allocations */
00050   DF_BOX,                        ///< Bounding box
00051   DF_TEXT,                       ///< Correct ascii
00052   DF_POLYGONAL,                  ///< Polygonal approximation
00053   DF_EDGE_STEP,                  ///< Edge steps
00054   DF_BN_POLYGONAL,               ///< BL-normalised polygonal approximation (BL presumably = baseline)
00055   DF_BLAMER                      ///< Blamer information
00056 };
00057 
00058 class ROW;                       //forward decl
00059 
00060 class WERD : public ELIST2_LINK {  // A word: its good blobs, rejected blobs, flags and reference text.
00061   public:
00062     WERD() : blanks(0), dummy(0), script_id_(0) {}  // zero the scalar members so accessors never read indeterminate values
00063     // WERD constructed with:
00064     //   blob_list - blobs of the word (we take this list's contents)
00065     //   blanks - number of blanks before the word
00066     //   text - correct text (outlives WERD)
00067     WERD(C_BLOB_LIST *blob_list, uinT8 blanks, const char *text);
00068 
00069     // WERD constructed from:
00070     //   blob_list - blobs in the word
00071     //   clone - werd to clone flags, etc from.
00072     WERD(C_BLOB_LIST *blob_list, WERD *clone);
00073 
00074     // Construct a WERD from a single_blob and clone the flags from this.
00075     // W_BOL and W_EOL flags are set according to the given values.
00076     WERD* ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob);
00077 
00078     ~WERD() {
00079     }
00080 
00081     // Assignment from another word (defined out of line).
00082     WERD & operator= (const WERD &source);
00083 
00084     // This method returns a new werd constructed using the blobs in the input
00085     // all_blobs list, which correspond to the blobs in this werd object. The
00086     // blobs used to construct the new word are consumed and removed from the
00087     // input all_blobs list.
00088     // Returns NULL if the word couldn't be constructed.
00089     // Returns original blobs for which no matches were found in the output list
00090     // orphan_blobs (appends).
00091     WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs,
00092                                     C_BLOB_LIST *orphan_blobs);
00093 
00094     // Accessors for reject / DUFF blobs in various formats
00095     C_BLOB_LIST *rej_cblob_list() {  // pointer to the internal reject list (mutable)
00096       return &rej_cblobs;
00097     }
00098 
00099     // Accessors for good blobs in various formats.
00100     C_BLOB_LIST *cblob_list() {  // pointer to the internal blob list (mutable)
00101       return &cblobs;
00102     }
00103 
00104     uinT8 space() const {  // number of blanks stored for this word
00105       return blanks;
00106     }
00107     void set_blanks(uinT8 new_blanks) {
00108       blanks = new_blanks;
00109     }
00110     int script_id() const {
00111       return script_id_;
00112     }
00113     void set_script_id(int id) {
00114       script_id_ = id;  // note: stored in an inT16 member
00115     }
00116 
00117     // Returns the (default) bounding box including all the dots.
00118     TBOX bounding_box() const;  // compute bounding box
00119     // Returns the bounding box including the desired combination of upper and
00120     // lower noise/diacritic elements.
00121     TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
00122     // Returns the bounding box of only the good blobs.
00123     TBOX true_bounding_box() const;
00124 
00125     const char *text() const { return correct.string(); }
00126     void set_text(const char *new_text) { correct = new_text; }
00127 
00128     BOOL8 flag(WERD_FLAGS mask) const { return flags.bit(mask); }  // mask is a bit number, not a bit mask
00129     void set_flag(WERD_FLAGS mask, BOOL8 value) { flags.set_bit(mask, value); }
00130 
00131     BOOL8 display_flag(uinT8 flag) const { return disp_flags.bit(flag); }  // flag is a bit number
00132     void set_display_flag(uinT8 flag, BOOL8 value) {
00133       disp_flags.set_bit(flag, value);
00134     }
00135 
00136     WERD *shallow_copy();  // shallow copy word
00137 
00138     // reposition word by vector
00139     void move(const ICOORD vec);
00140 
00141     // join other's blobs onto this werd, emptying out other.
00142     void join_on(WERD* other);
00143 
00144     // copy other's blobs onto this word, leaving other intact.
00145     void copy_on(WERD* other);
00146 
00147     // tprintf word metadata (but not blob innards)
00148     void print();
00149 
00150     #ifndef GRAPHICS_DISABLED
00151     // plot word on window in a uniform colour
00152     void plot(ScrollView *window, ScrollView::Color colour);
00153 
00154     // Get the next color in the (looping) rainbow.
00155     static ScrollView::Color NextColor(ScrollView::Color colour);
00156 
00157     // plot word on window in a rainbow of colours
00158     void plot(ScrollView *window);
00159 
00160     // plot rejected blobs in a rainbow of colours
00161     void plot_rej_blobs(ScrollView *window);
00162     #endif  // GRAPHICS_DISABLED
00163 
00164     // Removes noise from the word by moving small outlines to the rej_cblobs
00165     // list, based on the size_threshold.
00166     void CleanNoise(float size_threshold);
00167 
00168     // Extracts all the noise outlines and stuffs the pointers into the given
00169     // vector of outlines. Afterwards, the outlines vector owns the pointers.
00170     void GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines);
00171     // Adds the selected outlines to the indicated real blobs, and puts the rest
00172     // back in rej_cblobs where they came from. Where the target_blobs entry is
00173     // NULL, a run of wanted outlines is put into a single new blob.
00174     // Ownership of the outlines is transferred back to the word. (Hence
00175     // GenericVector and not PointerVector.)
00176     // Returns true if any new blob was added to the start of the word, which
00177     // suggests that it might need joining to the word before it, and likewise
00178     // sets make_next_word_fuzzy true if any new blob was added to the end.
00179     bool AddSelectedOutlines(const GenericVector<bool> &wanted,
00180                              const GenericVector<C_BLOB *> &target_blobs,
00181                              const GenericVector<C_OUTLINE *> &outlines,
00182                              bool *make_next_word_fuzzy);
00183 
00184  private:
00185     uinT8 blanks;                // no of blanks
00186     uinT8 dummy;                 // padding
00187     BITS16 flags;                // flags about word (bits indexed by WERD_FLAGS)
00188     BITS16 disp_flags;           // display flags
00189     inT16 script_id_;            // From unicharset.
00190     STRING correct;              // correct text
00191     C_BLOB_LIST cblobs;          // compacted blobs
00192     C_BLOB_LIST rej_cblobs;      // DUFF blobs
00193 };
00194 
00195 ELIST2IZEH (WERD)                // list-support macro applied to WERD (presumably from elst2.h; expansion not visible here)
00196 #include          "ocrrow.h"     // included after WERD is declared; the original note was truncated, reason unrecorded (presumably a circular dependency with ROW -- TODO confirm)
00197 // Compares words by increasing order of left edge; signature is suitable for qsort(3).
00198 int word_comparator(const void *word1p, const void *word2p);
00199 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines