|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: word.c 00003 * Description: Code for the WERD class. 00004 * Author: Ray Smith 00005 * Created: Tue Oct 08 14:32:12 BST 1991 00006 * 00007 * (C) Copyright 1991, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifndef WERD_H 00021 #define WERD_H 00022 00023 #include "params.h" 00024 #include "bits16.h" 00025 #include "elst2.h" 00026 #include "strngs.h" 00027 #include "blckerr.h" 00028 #include "stepblob.h" 00029 00030 enum WERD_FLAGS 00031 { 00032 W_SEGMENTED, //< correctly segmented 00033 W_ITALIC, //< italic text 00034 W_BOLD, //< bold text 00035 W_BOL, //< start of line 00036 W_EOL, //< end of line 00037 W_NORMALIZED, //< flags 00038 W_SCRIPT_HAS_XHEIGHT, //< x-height concept makes sense. 00039 W_SCRIPT_IS_LATIN, //< Special case latin for y. splitting. 00040 W_DONT_CHOP, //< fixed pitch chopped 00041 W_REP_CHAR, //< repeated character 00042 W_FUZZY_SP, //< fuzzy space 00043 W_FUZZY_NON, //< fuzzy nonspace 00044 W_INVERSE //< white on black 00045 }; 00046 00047 enum DISPLAY_FLAGS 00048 { 00049 /* Display flags bit number allocations */ 00050 DF_BOX, //< Bounding box 00051 DF_TEXT, //< Correct ascii 00052 DF_POLYGONAL, //< Polyg approx 00053 DF_EDGE_STEP, //< Edge steps 00054 DF_BN_POLYGONAL, //< BL normalisd polyapx 00055 DF_BLAMER //< Blamer information 00056 }; 00057 00058 class ROW; //forward decl 00059 00060 class WERD : public ELIST2_LINK { 00061 public: 00062 WERD() {} 00063 // WERD constructed with: 00064 // blob_list - blobs of the word (we take this list's contents) 00065 // blanks - number of blanks before the word 00066 // text - correct text (outlives WERD) 00067 WERD(C_BLOB_LIST *blob_list, uinT8 blanks, const char *text); 00068 00069 // WERD constructed from: 00070 // blob_list - blobs in the word 00071 // clone - werd to clone flags, etc from. 00072 WERD(C_BLOB_LIST *blob_list, WERD *clone); 00073 00074 // Construct a WERD from a single_blob and clone the flags from this. 00075 // W_BOL and W_EOL flags are set according to the given values. 00076 WERD* ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob); 00077 00078 ~WERD() { 00079 } 00080 00081 // assignment 00082 WERD & operator= (const WERD &source); 00083 00084 // This method returns a new werd constructed using the blobs in the input 00085 // all_blobs list, which correspond to the blobs in this werd object. The 00086 // blobs used to construct the new word are consumed and removed from the 00087 // input all_blobs list. 00088 // Returns NULL if the word couldn't be constructed. 00089 // Returns original blobs for which no matches were found in the output list 00090 // orphan_blobs (appends). 00091 WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, 00092 C_BLOB_LIST *orphan_blobs); 00093 00094 // Accessors for reject / DUFF blobs in various formats 00095 C_BLOB_LIST *rej_cblob_list() { // compact format 00096 return &rej_cblobs; 00097 } 00098 00099 // Accessors for good blobs in various formats. 00100 C_BLOB_LIST *cblob_list() { // get compact blobs 00101 return &cblobs; 00102 } 00103 00104 uinT8 space() { // access function 00105 return blanks; 00106 } 00107 void set_blanks(uinT8 new_blanks) { 00108 blanks = new_blanks; 00109 } 00110 int script_id() const { 00111 return script_id_; 00112 } 00113 void set_script_id(int id) { 00114 script_id_ = id; 00115 } 00116 00117 // Returns the (default) bounding box including all the dots. 00118 TBOX bounding_box() const; // compute bounding box 00119 // Returns the bounding box including the desired combination of upper and 00120 // lower noise/diacritic elements. 00121 TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; 00122 // Returns the bounding box of only the good blobs. 00123 TBOX true_bounding_box() const; 00124 00125 const char *text() const { return correct.string(); } 00126 void set_text(const char *new_text) { correct = new_text; } 00127 00128 BOOL8 flag(WERD_FLAGS mask) const { return flags.bit(mask); } 00129 void set_flag(WERD_FLAGS mask, BOOL8 value) { flags.set_bit(mask, value); } 00130 00131 BOOL8 display_flag(uinT8 flag) const { return disp_flags.bit(flag); } 00132 void set_display_flag(uinT8 flag, BOOL8 value) { 00133 disp_flags.set_bit(flag, value); 00134 } 00135 00136 WERD *shallow_copy(); // shallow copy word 00137 00138 // reposition word by vector 00139 void move(const ICOORD vec); 00140 00141 // join other's blobs onto this werd, emptying out other. 00142 void join_on(WERD* other); 00143 00144 // copy other's blobs onto this word, leaving other intact. 00145 void copy_on(WERD* other); 00146 00147 // tprintf word metadata (but not blob innards) 00148 void print(); 00149 00150 #ifndef GRAPHICS_DISABLED 00151 // plot word on window in a uniform colour 00152 void plot(ScrollView *window, ScrollView::Color colour); 00153 00154 // Get the next color in the (looping) rainbow. 00155 static ScrollView::Color NextColor(ScrollView::Color colour); 00156 00157 // plot word on window in a rainbow of colours 00158 void plot(ScrollView *window); 00159 00160 // plot rejected blobs in a rainbow of colours 00161 void plot_rej_blobs(ScrollView *window); 00162 #endif // GRAPHICS_DISABLED 00163 00164 // Removes noise from the word by moving small outlines to the rej_cblobs 00165 // list, based on the size_threshold. 00166 void CleanNoise(float size_threshold); 00167 00168 // Extracts all the noise outlines and stuffs the pointers into the given 00169 // vector of outlines. Afterwards, the outlines vector owns the pointers. 00170 void GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines); 00171 // Adds the selected outlines to the indcated real blobs, and puts the rest 00172 // back in rej_cblobs where they came from. Where the target_blobs entry is 00173 // NULL, a run of wanted outlines is put into a single new blob. 00174 // Ownership of the outlines is transferred back to the word. (Hence 00175 // GenericVector and not PointerVector.) 00176 // Returns true if any new blob was added to the start of the word, which 00177 // suggests that it might need joining to the word before it, and likewise 00178 // sets make_next_word_fuzzy true if any new blob was added to the end. 00179 bool AddSelectedOutlines(const GenericVector<bool> &wanted, 00180 const GenericVector<C_BLOB *> &target_blobs, 00181 const GenericVector<C_OUTLINE *> &outlines, 00182 bool *make_next_word_fuzzy); 00183 00184 private: 00185 uinT8 blanks; // no of blanks 00186 uinT8 dummy; // padding 00187 BITS16 flags; // flags about word 00188 BITS16 disp_flags; // display flags 00189 inT16 script_id_; // From unicharset. 00190 STRING correct; // correct text 00191 C_BLOB_LIST cblobs; // compacted blobs 00192 C_BLOB_LIST rej_cblobs; // DUFF blobs 00193 }; 00194 00195 ELIST2IZEH (WERD) 00196 #include "ocrrow.h" // placed here due to 00197 // compare words by increasing order of left edge, suitable for qsort(3) 00198 int word_comparator(const void *word1p, const void *word2p); 00199 #endif