tesseract 3.04.01

ccstruct/imagedata.h

Go to the documentation of this file.
00001 
00002 // File:        imagedata.h
00003 // Description: Class to hold information about a single image and its
00004 //              corresponding boxes or text file.
00005 // Author:      Ray Smith
00006 // Created:     Mon Jul 22 14:17:06 PDT 2013
00007 //
00008 // (C) Copyright 2013, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00019 
00020 #ifndef TESSERACT_IMAGE_IMAGEDATA_H_
00021 #define TESSERACT_IMAGE_IMAGEDATA_H_
00022 
00023 
00024 #include "genericvector.h"
00025 #include "normalis.h"
00026 #include "rect.h"
00027 #include "strngs.h"
00028 
00029 struct Pix;
00030 
00031 namespace tesseract {
00032 
00033 // Padding, measured in output pixels, that is applied in feature mode.
00034 const int kFeaturePadding = 2;
00035 // Number of pixels of padding to add around text boxes.
00036 const int kImagePadding = 4;
00037 // Number of training images (pages) combined into one training mini-batch.
00038 const int kNumPagesPerMiniBatch = 100;
00039 
00040 class WordFeature {  // Feature with x, y and dir fields held in narrow integer types.
00041  public:
00042   WordFeature();
00043   WordFeature(const FCOORD& fcoord, uinT8 dir);
00044 
00045   // Computes the maximum x and y values over features into *max_x and *max_y.
00046   static void ComputeSize(const GenericVector<WordFeature>& features,
00047                           int* max_x, int* max_y);
00048   // Draws all of the given features in the given window.
00049   static void Draw(const GenericVector<WordFeature>& features,
00050                    ScrollView* window);
00051 
00052   // Accessors. Each stored value is widened to int on return.
00053   int x() const { return x_; }
00054   int y() const { return y_; }
00055   int dir() const { return dir_; }
00056 
00057   // Writes this feature to the given file. Returns false in case of error.
00058   bool Serialize(FILE* fp) const;
00059   // Reads this feature from the given file. Returns false in case of error.
00060   // If swap is true, assumes a big/little-endian swap is needed.
00061   bool DeSerialize(bool swap, FILE* fp);
00062 
00063  private:
00064   inT16 x_;    // Value returned by x().
00065   uinT8 y_;    // Value returned by y().
00066   uinT8 dir_;  // Value returned by dir().
00067 };
00068 
00069 // A floating-point version of WordFeature, used as an intermediate during
00070 // scaling. Holds float x, y and dir plus an integer x_bucket.
00071 struct FloatWordFeature {
00072   static void FromWordFeatures(const GenericVector<WordFeature>& word_features,
00073                                GenericVector<FloatWordFeature>* float_features);
00074   // Comparator that sorts first by x-bucket, then by y (qsort-style signature).
00075   static int SortByXBucket(const void*, const void*);
00076 
00077   float x;
00078   float y;
00079   float dir;
00080   int x_bucket;  // Primary sort key used by SortByXBucket.
00081 };
00082 
00083 // Class to hold information on a single image:
00084 // Filename, cached image as a Pix*, character boxes, text transcription.
00085 // The text transcription is the ground truth UTF-8 text for the image.
00086 // Character boxes are optional and indicate the desired segmentation of
00087 // the text into recognition units.
00088 class ImageData {
00089  public:
00090   ImageData();
00091   // Takes ownership of the pix.
00092   ImageData(bool vertical, Pix* pix);
00093   ~ImageData();
00094 
00095   // Builds and returns an ImageData from the basic data. Note that imagedata,
00096   // truth_text, and box_text are all the actual file data, NOT filenames.
00097   static ImageData* Build(const char* name, int page_number, const char* lang,
00098                           const char* imagedata, int imagedatasize,
00099                           const char* truth_text, const char* box_text);
00100 
00101   // Writes to the given file. Returns false in case of error.
00102   bool Serialize(TFile* fp) const;
00103   // Reads from the given file. Returns false in case of error.
00104   // If swap is true, assumes a big/little-endian swap is needed.
00105   bool DeSerialize(bool swap, TFile* fp);
00106 
00107   // Simple inline accessors and setters for the data members.
00108   const STRING& imagefilename() const {
00109     return imagefilename_;
00110   }
00111   void set_imagefilename(const STRING& name) {
00112     imagefilename_ = name;
00113   }
00114   int page_number() const {
00115     return page_number_;
00116   }
00117   void set_page_number(int num) {
00118     page_number_ = num;
00119   }
00120   const GenericVector<char>& image_data() const {
00121     return image_data_;
00122   }
00123   const STRING& language() const {
00124     return language_;
00125   }
00126   void set_language(const STRING& lang) {
00127     language_ = lang;
00128   }
00129   const STRING& transcription() const {
00130     return transcription_;
00131   }
00132   const GenericVector<TBOX>& boxes() const {
00133     return boxes_;
00134   }
00135   const GenericVector<STRING>& box_texts() const {
00136     return box_texts_;
00137   }
00138   const STRING& box_text(int index) const {
00139     return box_texts_[index];  // index is not range-checked in this function.
00140   }
00141   // Saves the given Pix as a PNG-encoded string and destroys it.
00142   void SetPix(Pix* pix);
00143   // Returns the Pix image for *this. The caller must pixDestroy it after use.
00144   Pix* GetPix() const;
00145   // Fills in every output argument that has a non-NULL pointer, prescaled to
00146   // the given target_height (if 0, then the original image height), and aligned.
00147   // Also returns (if not NULL) the width and height of the scaled image.
00148   // The return value is the scale factor that was applied to the image to
00149   // achieve the target_height.
00150   float PreScale(int target_height, Pix** pix,
00151                  int* scaled_width, int* scaled_height,
00152                  GenericVector<TBOX>* boxes) const;
00153 
00154   int MemoryUsed() const;  // NOTE(review): units not visible here; presumably bytes.
00155 
00156   // Draws the data in a new window.
00157   void Display() const;
00158 
00159   // Adds the supplied boxes and transcriptions that correspond to the correct
00160   // page number.
00161   void AddBoxes(const GenericVector<TBOX>& boxes,
00162                 const GenericVector<STRING>& texts,
00163                 const GenericVector<int>& box_pages);
00164 
00165  private:
00166   // Saves the given Pix as a PNG-encoded string in image_data and destroys it.
00167   static void SetPixInternal(Pix* pix, GenericVector<char>* image_data);
00168   // Returns the Pix image for the image_data. Must be pixDestroyed after use.
00169   static Pix* GetPixInternal(const GenericVector<char>& image_data);
00170   // Parses the text string as a box file and adds any discovered boxes that
00171   // match the page number. Returns false on error.
00172   bool AddBoxes(const char* box_text);
00173 
00174  private:  // Data members.
00175   STRING imagefilename_;             // File to read image from.
00176   inT32 page_number_;                // Page number if multi-page tif or -1.
00177   GenericVector<char> image_data_;   // PNG file data.
00178   STRING language_;                  // Language code for image.
00179   STRING transcription_;             // UTF-8 ground truth of image.
00180   GenericVector<TBOX> boxes_;        // Boxes of the image; may be empty.
00181   GenericVector<STRING> box_texts_;  // String for text in each box.
00182   bool vertical_text_;               // Image has been rotated from vertical.
00183 };
00184 
00185 // A collection of ImageData that knows roughly how much memory it is using.
00186 class DocumentData {
00187  public:
00188   explicit DocumentData(const STRING& name);
00189   ~DocumentData();
00190 
00191   // Reads all the pages in the given lstmf filename to the cache. The reader
00192   // is used to read the file.
00193   bool LoadDocument(const char* filename, const char* lang, int start_page,
00194                     inT64 max_memory, FileReader reader);
00195   // Writes all the pages to the given filename. Returns false on error.
00196   bool SaveDocument(const char* filename, FileWriter writer);
00197   bool SaveToBuffer(GenericVector<char>* buffer);  // NOTE(review): presumably SaveDocument into a buffer; confirm.
00198 
00199   // Adds the given page data to this document, counting up memory.
00200   void AddPageToDocument(ImageData* page);
00201 
00202   const STRING& document_name() const {
00203     return document_name_;
00204   }
00205   int NumPages() const {
00206     return total_pages_;
00207   }
00208   inT64 memory_used() const {
00209     return memory_used_;
00210   }
00211   // Returns a pointer to the page with the given index, modulo the total
00212   // number of pages, recaching if needed.
00213   const ImageData* GetPage(int index);
00214   // Hands the page at the given index to the caller; its slot becomes NULL.
00215   ImageData* TakePage(int index) {
00216     ImageData* page = pages_[index];  // index is not range-checked in this function.
00217     pages_[index] = NULL;  // memory_used_ is left unchanged here.
00218     return page;
00219   }
00220 
00221  private:
00222   // Loads as many pages as can fit in max_memory_ starting at index pages_offset_.
00223   bool ReCachePages();
00224 
00225  private:
00226   // A name for this document.
00227   STRING document_name_;
00228   // The language of this document.
00229   STRING lang_;
00230   // A group of pages that corresponds in some loose way to a document.
00231   PointerVector<ImageData> pages_;
00232   // Page number of the first index in pages_.
00233   int pages_offset_;
00234   // Total number of pages in document (may exceed size of pages_.)
00235   int total_pages_;
00236   // Total of all pix sizes in the document.
00237   inT64 memory_used_;
00238   // Max memory to use at any time.
00239   inT64 max_memory_;
00240   // Saved reader from LoadDocument to allow re-caching.
00241   FileReader reader_;
00242 };
00243 
00244 // A collection of DocumentData that knows roughly how much memory it is using.
00245 class DocumentCache {
00246  public:
00247   explicit DocumentCache(inT64 max_memory);
00248   ~DocumentCache();
00249 
00250   // Adds all the documents in the list of filenames, counting memory.
00251   // The reader is used to read the files.
00252   bool LoadDocuments(const GenericVector<STRING>& filenames, const char* lang,
00253                      FileReader reader);
00254 
00255   // Adds document to the cache, throwing out other documents if needed.
00256   bool AddToCache(DocumentData* data);
00257 
00258   // Finds and returns a document by name.
00259   DocumentData* FindDocument(const STRING& document_name) const;
00260 
00261   // Returns a page by serial number, selecting them in a round-robin fashion
00262   // from all the documents.
00263   const ImageData* GetPageBySerial(int serial);
00264 
00265   const PointerVector<DocumentData>& documents() const {
00266     return documents_;
00267   }
00268   int total_pages() const {
00269     return total_pages_;
00270   }
00271 
00272  private:
00273   // The documents held by this cache.
00274   PointerVector<DocumentData> documents_;
00275   // Total of all pages.
00276   int total_pages_;
00277   // Total of all memory used by the cache.
00278   inT64 memory_used_;
00279   // Max memory allowed in this cache.
00280   inT64 max_memory_;
00281 };
00282 
00283 }  // namespace tesseract
00284 
00285 
00286 #endif  // TESSERACT_IMAGE_IMAGEDATA_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines