// tesseract 3.04.01
00001 00002 // File: imagedata.h 00003 // Description: Class to hold information about a single image and its 00004 // corresponding boxes or text file. 00005 // Author: Ray Smith 00006 // Created: Mon Jul 22 14:17:06 PDT 2013 00007 // 00008 // (C) Copyright 2013, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00019 00020 #ifndef TESSERACT_IMAGE_IMAGEDATA_H_ 00021 #define TESSERACT_IMAGE_IMAGEDATA_H_ 00022 00023 00024 #include "genericvector.h" 00025 #include "normalis.h" 00026 #include "rect.h" 00027 #include "strngs.h" 00028 00029 struct Pix; 00030 00031 namespace tesseract { 00032 00033 // Amount of padding to apply in output pixels in feature mode. 00034 const int kFeaturePadding = 2; 00035 // Number of pixels to pad around text boxes. 00036 const int kImagePadding = 4; 00037 // Number of training images to combine into a mini-batch for training. 00038 const int kNumPagesPerMiniBatch = 100; 00039 00040 class WordFeature { 00041 public: 00042 WordFeature(); 00043 WordFeature(const FCOORD& fcoord, uinT8 dir); 00044 00045 // Computes the maximum x and y value in the features. 00046 static void ComputeSize(const GenericVector<WordFeature>& features, 00047 int* max_x, int* max_y); 00048 // Draws the features in the given window. 00049 static void Draw(const GenericVector<WordFeature>& features, 00050 ScrollView* window); 00051 00052 // Accessors. 
00053 int x() const { return x_; } 00054 int y() const { return y_; } 00055 int dir() const { return dir_; } 00056 00057 // Writes to the given file. Returns false in case of error. 00058 bool Serialize(FILE* fp) const; 00059 // Reads from the given file. Returns false in case of error. 00060 // If swap is true, assumes a big/little-endian swap is needed. 00061 bool DeSerialize(bool swap, FILE* fp); 00062 00063 private: 00064 inT16 x_; 00065 uinT8 y_; 00066 uinT8 dir_; 00067 }; 00068 00069 // A floating-point version of WordFeature, used as an intermediate during 00070 // scaling. 00071 struct FloatWordFeature { 00072 static void FromWordFeatures(const GenericVector<WordFeature>& word_features, 00073 GenericVector<FloatWordFeature>* float_features); 00074 // Sort function to sort first by x-bucket, then by y. 00075 static int SortByXBucket(const void*, const void*); 00076 00077 float x; 00078 float y; 00079 float dir; 00080 int x_bucket; 00081 }; 00082 00083 // Class to hold information on a single image: 00084 // Filename, cached image as a Pix*, character boxes, text transcription. 00085 // The text transcription is the ground truth UTF-8 text for the image. 00086 // Character boxes are optional and indicate the desired segmentation of 00087 // the text into recognition units. 00088 class ImageData { 00089 public: 00090 ImageData(); 00091 // Takes ownership of the pix. 00092 ImageData(bool vertical, Pix* pix); 00093 ~ImageData(); 00094 00095 // Builds and returns an ImageData from the basic data. Note that imagedata, 00096 // truth_text, and box_text are all the actual file data, NOT filenames. 00097 static ImageData* Build(const char* name, int page_number, const char* lang, 00098 const char* imagedata, int imagedatasize, 00099 const char* truth_text, const char* box_text); 00100 00101 // Writes to the given file. Returns false in case of error. 00102 bool Serialize(TFile* fp) const; 00103 // Reads from the given file. Returns false in case of error. 
00104 // If swap is true, assumes a big/little-endian swap is needed. 00105 bool DeSerialize(bool swap, TFile* fp); 00106 00107 // Other accessors. 00108 const STRING& imagefilename() const { 00109 return imagefilename_; 00110 } 00111 void set_imagefilename(const STRING& name) { 00112 imagefilename_ = name; 00113 } 00114 int page_number() const { 00115 return page_number_; 00116 } 00117 void set_page_number(int num) { 00118 page_number_ = num; 00119 } 00120 const GenericVector<char>& image_data() const { 00121 return image_data_; 00122 } 00123 const STRING& language() const { 00124 return language_; 00125 } 00126 void set_language(const STRING& lang) { 00127 language_ = lang; 00128 } 00129 const STRING& transcription() const { 00130 return transcription_; 00131 } 00132 const GenericVector<TBOX>& boxes() const { 00133 return boxes_; 00134 } 00135 const GenericVector<STRING>& box_texts() const { 00136 return box_texts_; 00137 } 00138 const STRING& box_text(int index) const { 00139 return box_texts_[index]; 00140 } 00141 // Saves the given Pix as a PNG-encoded string and destroys it. 00142 void SetPix(Pix* pix); 00143 // Returns the Pix image for *this. Must be pixDestroyed after use. 00144 Pix* GetPix() const; 00145 // Gets anything and everything with a non-NULL pointer, prescaled to a 00146 // given target_height (if 0, then the original image height), and aligned. 00147 // Also returns (if not NULL) the width and height of the scaled image. 00148 // The return value is the scale factor that was applied to the image to 00149 // achieve the target_height. 00150 float PreScale(int target_height, Pix** pix, 00151 int* scaled_width, int* scaled_height, 00152 GenericVector<TBOX>* boxes) const; 00153 00154 int MemoryUsed() const; 00155 00156 // Draws the data in a new window. 00157 void Display() const; 00158 00159 // Adds the supplied boxes and transcriptions that correspond to the correct 00160 // page number. 
00161 void AddBoxes(const GenericVector<TBOX>& boxes, 00162 const GenericVector<STRING>& texts, 00163 const GenericVector<int>& box_pages); 00164 00165 private: 00166 // Saves the given Pix as a PNG-encoded string and destroys it. 00167 static void SetPixInternal(Pix* pix, GenericVector<char>* image_data); 00168 // Returns the Pix image for the image_data. Must be pixDestroyed after use. 00169 static Pix* GetPixInternal(const GenericVector<char>& image_data); 00170 // Parses the text string as a box file and adds any discovered boxes that 00171 // match the page number. Returns false on error. 00172 bool AddBoxes(const char* box_text); 00173 00174 private: 00175 STRING imagefilename_; // File to read image from. 00176 inT32 page_number_; // Page number if multi-page tif or -1. 00177 GenericVector<char> image_data_; // PNG file data. 00178 STRING language_; // Language code for image. 00179 STRING transcription_; // UTF-8 ground truth of image. 00180 GenericVector<TBOX> boxes_; // If non-empty boxes of the image. 00181 GenericVector<STRING> box_texts_; // String for text in each box. 00182 bool vertical_text_; // Image has been rotated from vertical. 00183 }; 00184 00185 // A collection of ImageData that knows roughly how much memory it is using. 00186 class DocumentData { 00187 public: 00188 explicit DocumentData(const STRING& name); 00189 ~DocumentData(); 00190 00191 // Reads all the pages in the given lstmf filename to the cache. The reader 00192 // is used to read the file. 00193 bool LoadDocument(const char* filename, const char* lang, int start_page, 00194 inT64 max_memory, FileReader reader); 00195 // Writes all the pages to the given filename. Returns false on error. 00196 bool SaveDocument(const char* filename, FileWriter writer); 00197 bool SaveToBuffer(GenericVector<char>* buffer); 00198 00199 // Adds the given page data to this document, counting up memory. 
00200 void AddPageToDocument(ImageData* page); 00201 00202 const STRING& document_name() const { 00203 return document_name_; 00204 } 00205 int NumPages() const { 00206 return total_pages_; 00207 } 00208 inT64 memory_used() const { 00209 return memory_used_; 00210 } 00211 // Returns a pointer to the page with the given index, modulo the total 00212 // number of pages, recaching if needed. 00213 const ImageData* GetPage(int index); 00214 // Takes ownership of the given page index. The page is made NULL in *this. 00215 ImageData* TakePage(int index) { 00216 ImageData* page = pages_[index]; 00217 pages_[index] = NULL; 00218 return page; 00219 } 00220 00221 private: 00222 // Loads as many pages can fit in max_memory_ starting at index pages_offset_. 00223 bool ReCachePages(); 00224 00225 private: 00226 // A name for this document. 00227 STRING document_name_; 00228 // The language of this document. 00229 STRING lang_; 00230 // A group of pages that corresponds in some loose way to a document. 00231 PointerVector<ImageData> pages_; 00232 // Page number of the first index in pages_. 00233 int pages_offset_; 00234 // Total number of pages in document (may exceed size of pages_.) 00235 int total_pages_; 00236 // Total of all pix sizes in the document. 00237 inT64 memory_used_; 00238 // Max memory to use at any time. 00239 inT64 max_memory_; 00240 // Saved reader from LoadDocument to allow re-caching. 00241 FileReader reader_; 00242 }; 00243 00244 // A collection of DocumentData that knows roughly how much memory it is using. 00245 class DocumentCache { 00246 public: 00247 explicit DocumentCache(inT64 max_memory); 00248 ~DocumentCache(); 00249 00250 // Adds all the documents in the list of filenames, counting memory. 00251 // The reader is used to read the files. 00252 bool LoadDocuments(const GenericVector<STRING>& filenames, const char* lang, 00253 FileReader reader); 00254 00255 // Adds document to the cache, throwing out other documents if needed. 
00256 bool AddToCache(DocumentData* data); 00257 00258 // Finds and returns a document by name. 00259 DocumentData* FindDocument(const STRING& document_name) const; 00260 00261 // Returns a page by serial number, selecting them in a round-robin fashion 00262 // from all the documents. 00263 const ImageData* GetPageBySerial(int serial); 00264 00265 const PointerVector<DocumentData>& documents() const { 00266 return documents_; 00267 } 00268 int total_pages() const { 00269 return total_pages_; 00270 } 00271 00272 private: 00273 // A group of pages that corresponds in some loose way to a document. 00274 PointerVector<DocumentData> documents_; 00275 // Total of all pages. 00276 int total_pages_; 00277 // Total of all memory used by the cache. 00278 inT64 memory_used_; 00279 // Max memory allowed in this cache. 00280 inT64 max_memory_; 00281 }; 00282 00283 } // namespace tesseract 00284 00285 00286 #endif // TESSERACT_IMAGE_IMAGEDATA_H_