tesseract 3.04.01

ccstruct/imagedata.cpp

Go to the documentation of this file.
00001 
00002 // File:        imagedata.cpp
00003 // Description: Class to hold information about a single multi-page tiff
00004 //              training file and its corresponding boxes or text file.
00005 // Author:      Ray Smith
00006 // Created:     Tue May 28 08:56:06 PST 2013
00007 //
00008 // (C) Copyright 2013, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00019 
00020 // Include automatically generated configuration file if running autoconf.
00021 #ifdef HAVE_CONFIG_H
00022 #include "config_auto.h"
00023 #endif
00024 
00025 #include "imagedata.h"
00026 
00027 #include "allheaders.h"
00028 #include "boxread.h"
00029 #include "callcpp.h"
00030 #include "helpers.h"
00031 #include "tprintf.h"
00032 
00033 namespace tesseract {
00034 
00035 WordFeature::WordFeature() : x_(0), y_(0), dir_(0) {
00036 }
00037 
00038 WordFeature::WordFeature(const FCOORD& fcoord, uinT8 dir)
00039   : x_(IntCastRounded(fcoord.x())),
00040     y_(ClipToRange(IntCastRounded(fcoord.y()), 0, MAX_UINT8)),
00041     dir_(dir) {
00042 }
00043 
00044 // Computes the maximum x and y value in the features.
00045 void WordFeature::ComputeSize(const GenericVector<WordFeature>& features,
00046                               int* max_x, int* max_y) {
00047   *max_x = 0;
00048   *max_y = 0;
00049   for (int f = 0; f < features.size(); ++f) {
00050     if (features[f].x_ > *max_x) *max_x = features[f].x_;
00051     if (features[f].y_ > *max_y) *max_y = features[f].y_;
00052   }
00053 }
00054 
00055 // Draws the features in the given window.
00056 void WordFeature::Draw(const GenericVector<WordFeature>& features,
00057                        ScrollView* window) {
00058 #ifndef GRAPHICS_DISABLED
00059   for (int f = 0; f < features.size(); ++f) {
00060     FCOORD pos(features[f].x_, features[f].y_);
00061     FCOORD dir;
00062     dir.from_direction(features[f].dir_);
00063     dir *= 8.0f;
00064     window->SetCursor(IntCastRounded(pos.x() - dir.x()),
00065                       IntCastRounded(pos.y() - dir.y()));
00066     window->DrawTo(IntCastRounded(pos.x() + dir.x()),
00067                       IntCastRounded(pos.y() + dir.y()));
00068   }
00069 #endif
00070 }
00071 
00072 // Writes to the given file. Returns false in case of error.
00073 bool WordFeature::Serialize(FILE* fp) const {
00074   if (fwrite(&x_, sizeof(x_), 1, fp) != 1) return false;
00075   if (fwrite(&y_, sizeof(y_), 1, fp) != 1) return false;
00076   if (fwrite(&dir_, sizeof(dir_), 1, fp) != 1) return false;
00077   return true;
00078 }
00079 // Reads from the given file. Returns false in case of error.
00080 // If swap is true, assumes a big/little-endian swap is needed.
00081 bool WordFeature::DeSerialize(bool swap, FILE* fp) {
00082   if (fread(&x_, sizeof(x_), 1, fp) != 1) return false;
00083   if (swap) ReverseN(&x_, sizeof(x_));
00084   if (fread(&y_, sizeof(y_), 1, fp) != 1) return false;
00085   if (fread(&dir_, sizeof(dir_), 1, fp) != 1) return false;
00086   return true;
00087 }
00088 
00089 void FloatWordFeature::FromWordFeatures(
00090     const GenericVector<WordFeature>& word_features,
00091     GenericVector<FloatWordFeature>* float_features) {
00092   for (int i = 0; i < word_features.size(); ++i) {
00093     FloatWordFeature f;
00094     f.x = word_features[i].x();
00095     f.y = word_features[i].y();
00096     f.dir = word_features[i].dir();
00097     f.x_bucket = 0;  // Will set it later.
00098     float_features->push_back(f);
00099   }
00100 }
00101 
00102 // Sort function to sort first by x-bucket, then by y.
00103 /* static */
00104 int FloatWordFeature::SortByXBucket(const void* v1, const void* v2) {
00105   const FloatWordFeature* f1 = reinterpret_cast<const FloatWordFeature*>(v1);
00106   const FloatWordFeature* f2 = reinterpret_cast<const FloatWordFeature*>(v2);
00107   int x_diff = f1->x_bucket - f2->x_bucket;
00108   if (x_diff == 0) return f1->y - f2->y;
00109   return x_diff;
00110 }
00111 
00112 ImageData::ImageData() : page_number_(-1), vertical_text_(false) {
00113 }
00114 // Takes ownership of the pix and destroys it.
00115 ImageData::ImageData(bool vertical, Pix* pix)
00116   : page_number_(0), vertical_text_(vertical) {
00117   SetPix(pix);
00118 }
00119 ImageData::~ImageData() {
00120 }
00121 
00122 // Builds and returns an ImageData from the basic data. Note that imagedata,
00123 // truth_text, and box_text are all the actual file data, NOT filenames.
00124 ImageData* ImageData::Build(const char* name, int page_number, const char* lang,
00125                             const char* imagedata, int imagedatasize,
00126                             const char* truth_text, const char* box_text) {
00127   ImageData* image_data = new ImageData();
00128   image_data->imagefilename_ = name;
00129   image_data->page_number_ = page_number;
00130   image_data->language_ = lang;
00131   // Save the imagedata.
00132   image_data->image_data_.init_to_size(imagedatasize, 0);
00133   memcpy(&image_data->image_data_[0], imagedata, imagedatasize);
00134   if (!image_data->AddBoxes(box_text)) {
00135     if (truth_text == NULL || truth_text[0] == '\0') {
00136       tprintf("Error: No text corresponding to page %d from image %s!\n",
00137               page_number, name);
00138       delete image_data;
00139       return NULL;
00140     }
00141     image_data->transcription_ = truth_text;
00142     // If we have no boxes, the transcription is in the 0th box_texts_.
00143     image_data->box_texts_.push_back(truth_text);
00144     // We will create a box for the whole image on PreScale, to save unpacking
00145     // the image now.
00146   } else if (truth_text != NULL && truth_text[0] != '\0' &&
00147              image_data->transcription_ != truth_text) {
00148     // Save the truth text as it is present and disagrees with the box text.
00149     image_data->transcription_ = truth_text;
00150   }
00151   return image_data;
00152 }
00153 
00154 // Writes to the given file. Returns false in case of error.
00155 bool ImageData::Serialize(TFile* fp) const {
00156   if (!imagefilename_.Serialize(fp)) return false;
00157   if (fp->FWrite(&page_number_, sizeof(page_number_), 1) != 1) return false;
00158   if (!image_data_.Serialize(fp)) return false;
00159   if (!transcription_.Serialize(fp)) return false;
00160   // WARNING: Will not work across different endian machines.
00161   if (!boxes_.Serialize(fp)) return false;
00162   if (!box_texts_.SerializeClasses(fp)) return false;
00163   inT8 vertical = vertical_text_;
00164   if (fp->FWrite(&vertical, sizeof(vertical), 1) != 1) return false;
00165   return true;
00166 }
00167 
00168 // Reads from the given file. Returns false in case of error.
00169 // If swap is true, assumes a big/little-endian swap is needed.
00170 bool ImageData::DeSerialize(bool swap, TFile* fp) {
00171   if (!imagefilename_.DeSerialize(swap, fp)) return false;
00172   if (fp->FRead(&page_number_, sizeof(page_number_), 1) != 1) return false;
00173   if (swap) ReverseN(&page_number_, sizeof(page_number_));
00174   if (!image_data_.DeSerialize(swap, fp)) return false;
00175   if (!transcription_.DeSerialize(swap, fp)) return false;
00176   // WARNING: Will not work across different endian machines.
00177   if (!boxes_.DeSerialize(swap, fp)) return false;
00178   if (!box_texts_.DeSerializeClasses(swap, fp)) return false;
00179   inT8 vertical = 0;
00180   if (fp->FRead(&vertical, sizeof(vertical), 1) != 1) return false;
00181   vertical_text_ = vertical != 0;
00182   return true;
00183 }
00184 
00185 // Saves the given Pix as a PNG-encoded string and destroys it.
00186 void ImageData::SetPix(Pix* pix) {
00187   SetPixInternal(pix, &image_data_);
00188 }
00189 
00190 // Returns the Pix image for *this. Must be pixDestroyed after use.
00191 Pix* ImageData::GetPix() const {
00192   return GetPixInternal(image_data_);
00193 }
00194 
00195 // Gets anything and everything with a non-NULL pointer, prescaled to a
00196 // given target_height (if 0, then the original image height), and aligned.
00197 // Also returns (if not NULL) the width and height of the scaled image.
00198 // The return value is the scale factor that was applied to the image to
00199 // achieve the target_height.
00200 float ImageData::PreScale(int target_height, Pix** pix,
00201                           int* scaled_width, int* scaled_height,
00202                           GenericVector<TBOX>* boxes) const {
00203   int input_width = 0;
00204   int input_height = 0;
00205   Pix* src_pix = GetPix();
00206   ASSERT_HOST(src_pix != NULL);
00207   input_width = pixGetWidth(src_pix);
00208   input_height = pixGetHeight(src_pix);
00209   if (target_height == 0)
00210     target_height = input_height;
00211   float im_factor = static_cast<float>(target_height) / input_height;
00212   if (scaled_width != NULL)
00213     *scaled_width = IntCastRounded(im_factor * input_width);
00214   if (scaled_height != NULL)
00215     *scaled_height = target_height;
00216   if (pix != NULL) {
00217     // Get the scaled image.
00218     pixDestroy(pix);
00219     *pix = pixScale(src_pix, im_factor, im_factor);
00220     if (*pix == NULL) {
00221       tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
00222               input_width, input_height, im_factor);
00223     }
00224     if (scaled_width != NULL)
00225       *scaled_width = pixGetWidth(*pix);
00226     if (scaled_height != NULL)
00227       *scaled_height = pixGetHeight(*pix);
00228   }
00229   pixDestroy(&src_pix);
00230   if (boxes != NULL) {
00231     // Get the boxes.
00232     boxes->truncate(0);
00233     for (int b = 0; b < boxes_.size(); ++b) {
00234       TBOX box = boxes_[b];
00235       box.scale(im_factor);
00236       boxes->push_back(box);
00237     }
00238     if (boxes->empty()) {
00239       // Make a single box for the whole image.
00240       TBOX box(0, 0, im_factor * input_width, target_height);
00241       boxes->push_back(box);
00242     }
00243   }
00244   return im_factor;
00245 }
00246 
00247 int ImageData::MemoryUsed() const {
00248   return image_data_.size();
00249 }
00250 
00251 // Draws the data in a new window.
00252 void ImageData::Display() const {
00253 #ifndef GRAPHICS_DISABLED
00254   const int kTextSize = 64;
00255   // Draw the image.
00256   Pix* pix = GetPix();
00257   if (pix == NULL) return;
00258   int width = pixGetWidth(pix);
00259   int height = pixGetHeight(pix);
00260   ScrollView* win = new ScrollView("Imagedata", 100, 100,
00261                                    2 * (width + 2 * kTextSize),
00262                                    2 * (height + 4 * kTextSize),
00263                                    width + 10, height + 3 * kTextSize, true);
00264   win->Image(pix, 0, height - 1);
00265   pixDestroy(&pix);
00266   // Draw the boxes.
00267   win->Pen(ScrollView::RED);
00268   win->Brush(ScrollView::NONE);
00269   win->TextAttributes("Arial", kTextSize, false, false, false);
00270   for (int b = 0; b < boxes_.size(); ++b) {
00271     boxes_[b].plot(win);
00272     win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string());
00273     TBOX scaled(boxes_[b]);
00274     scaled.scale(256.0 / height);
00275     scaled.plot(win);
00276   }
00277   // The full transcription.
00278   win->Pen(ScrollView::CYAN);
00279   win->Text(0, height + kTextSize * 2, transcription_.string());
00280   // Add the features.
00281   win->Pen(ScrollView::GREEN);
00282   win->Update();
00283   window_wait(win);
00284 #endif
00285 }
00286 
00287 // Adds the supplied boxes and transcriptions that correspond to the correct
00288 // page number.
00289 void ImageData::AddBoxes(const GenericVector<TBOX>& boxes,
00290                          const GenericVector<STRING>& texts,
00291                          const GenericVector<int>& box_pages) {
00292   // Copy the boxes and make the transcription.
00293   for (int i = 0; i < box_pages.size(); ++i) {
00294     if (page_number_ >= 0 && box_pages[i] != page_number_) continue;
00295     transcription_ += texts[i];
00296     boxes_.push_back(boxes[i]);
00297     box_texts_.push_back(texts[i]);
00298   }
00299 }
00300 
00301 // Saves the given Pix as a PNG-encoded string and destroys it.
00302 void ImageData::SetPixInternal(Pix* pix, GenericVector<char>* image_data) {
00303   l_uint8* data;
00304   size_t size;
00305   pixWriteMem(&data, &size, pix, IFF_PNG);
00306   pixDestroy(&pix);
00307   image_data->init_to_size(size, 0);
00308   memcpy(&(*image_data)[0], data, size);
00309   free(data);
00310 }
00311 
00312 // Returns the Pix image for the image_data. Must be pixDestroyed after use.
00313 Pix* ImageData::GetPixInternal(const GenericVector<char>& image_data) {
00314   Pix* pix = NULL;
00315   if (!image_data.empty()) {
00316     // Convert the array to an image.
00317     const unsigned char* u_data =
00318         reinterpret_cast<const unsigned char*>(&image_data[0]);
00319     pix = pixReadMem(u_data, image_data.size());
00320   }
00321   return pix;
00322 }
00323 
00324 // Parses the text string as a box file and adds any discovered boxes that
00325 // match the page number. Returns false on error.
00326 bool ImageData::AddBoxes(const char* box_text) {
00327   if (box_text != NULL && box_text[0] != '\0') {
00328     GenericVector<TBOX> boxes;
00329     GenericVector<STRING> texts;
00330     GenericVector<int> box_pages;
00331     if (ReadMemBoxes(page_number_, false, box_text, &boxes,
00332                      &texts, NULL, &box_pages)) {
00333       AddBoxes(boxes, texts, box_pages);
00334       return true;
00335     } else {
00336       tprintf("Error: No boxes for page %d from image %s!\n",
00337               page_number_, imagefilename_.string());
00338     }
00339   }
00340   return false;
00341 }
00342 
00343 DocumentData::DocumentData(const STRING& name)
00344   : document_name_(name), pages_offset_(0), total_pages_(0),
00345     memory_used_(0), max_memory_(0), reader_(NULL) {}
00346 
00347 DocumentData::~DocumentData() {}
00348 
00349 // Reads all the pages in the given lstmf filename to the cache. The reader
00350 // is used to read the file.
00351 bool DocumentData::LoadDocument(const char* filename, const char* lang,
00352                                 int start_page, inT64 max_memory,
00353                                 FileReader reader) {
00354   document_name_ = filename;
00355   lang_ = lang;
00356   pages_offset_ = start_page;
00357   max_memory_ = max_memory;
00358   reader_ = reader;
00359   return ReCachePages();
00360 }
00361 
00362 // Writes all the pages to the given filename. Returns false on error.
00363 bool DocumentData::SaveDocument(const char* filename, FileWriter writer) {
00364   TFile fp;
00365   fp.OpenWrite(NULL);
00366   if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) {
00367     tprintf("Serialize failed: %s\n", filename);
00368     return false;
00369   }
00370   return true;
00371 }
00372 bool DocumentData::SaveToBuffer(GenericVector<char>* buffer) {
00373   TFile fp;
00374   fp.OpenWrite(buffer);
00375   return pages_.Serialize(&fp);
00376 }
00377 
00378 // Returns a pointer to the page with the given index, modulo the total
00379 // number of pages, recaching if needed.
00380 const ImageData* DocumentData::GetPage(int index) {
00381   index = Modulo(index, total_pages_);
00382   if (index < pages_offset_ || index >= pages_offset_ + pages_.size()) {
00383     pages_offset_ = index;
00384     if (!ReCachePages()) return NULL;
00385   }
00386   return pages_[index - pages_offset_];
00387 }
00388 
00389 // Loads as many pages can fit in max_memory_ starting at index pages_offset_.
00390 bool DocumentData::ReCachePages() {
00391   // Read the file.
00392   TFile fp;
00393   if (!fp.Open(document_name_, reader_)) return false;
00394   memory_used_ = 0;
00395   if (!pages_.DeSerialize(false, &fp)) {
00396     tprintf("Deserialize failed: %s\n", document_name_.string());
00397     pages_.truncate(0);
00398     return false;
00399   }
00400   total_pages_ = pages_.size();
00401   pages_offset_ %= total_pages_;
00402   // Delete pages before the first one we want, and relocate the rest.
00403   int page;
00404   for (page = 0; page < pages_.size(); ++page) {
00405     if (page < pages_offset_) {
00406       delete pages_[page];
00407       pages_[page] = NULL;
00408     } else {
00409       ImageData* image_data = pages_[page];
00410       if (max_memory_ > 0 && page > pages_offset_ &&
00411           memory_used_ + image_data->MemoryUsed() > max_memory_)
00412         break;  // Don't go over memory quota unless the first image.
00413       if (image_data->imagefilename().length() == 0) {
00414         image_data->set_imagefilename(document_name_);
00415         image_data->set_page_number(page);
00416       }
00417       image_data->set_language(lang_);
00418       memory_used_ += image_data->MemoryUsed();
00419       if (pages_offset_ != 0) {
00420         pages_[page - pages_offset_] = image_data;
00421         pages_[page] = NULL;
00422       }
00423     }
00424   }
00425   pages_.truncate(page - pages_offset_);
00426   tprintf("Loaded %d/%d pages (%d-%d) of document %s\n",
00427           pages_.size(), total_pages_, pages_offset_,
00428           pages_offset_ + pages_.size(), document_name_.string());
00429   return !pages_.empty();
00430 }
00431 
00432 // Adds the given page data to this document, counting up memory.
00433 void DocumentData::AddPageToDocument(ImageData* page) {
00434   pages_.push_back(page);
00435   memory_used_ += page->MemoryUsed();
00436 }
00437 
00438 // A collection of DocumentData that knows roughly how much memory it is using.
00439 DocumentCache::DocumentCache(inT64 max_memory)
00440   : total_pages_(0), memory_used_(0), max_memory_(max_memory) {}
00441 DocumentCache::~DocumentCache() {}
00442 
00443 // Adds all the documents in the list of filenames, counting memory.
00444 // The reader is used to read the files.
00445 bool DocumentCache::LoadDocuments(const GenericVector<STRING>& filenames,
00446                                   const char* lang, FileReader reader) {
00447   inT64 fair_share_memory = max_memory_ / filenames.size();
00448   for (int arg = 0; arg < filenames.size(); ++arg) {
00449     STRING filename = filenames[arg];
00450     DocumentData* document = new DocumentData(filename);
00451     if (document->LoadDocument(filename.string(), lang, 0,
00452                                fair_share_memory, reader)) {
00453       AddToCache(document);
00454     } else {
00455       tprintf("Failed to load image %s!\n", filename.string());
00456       delete document;
00457     }
00458   }
00459   tprintf("Loaded %d pages, total %gMB\n",
00460           total_pages_, memory_used_ / 1048576.0);
00461   return total_pages_ > 0;
00462 }
00463 
00464 // Adds document to the cache, throwing out other documents if needed.
00465 bool DocumentCache::AddToCache(DocumentData* data) {
00466   inT64 new_memory = data->memory_used();
00467   memory_used_ += new_memory;
00468   documents_.push_back(data);
00469   total_pages_ += data->NumPages();
00470   // Delete the first item in the array, and other pages of the same name
00471   // while memory is full.
00472   while (memory_used_ >= max_memory_ && max_memory_ > 0) {
00473     tprintf("Memory used=%lld vs max=%lld, discarding doc of size %lld\n",
00474             memory_used_ , max_memory_, documents_[0]->memory_used());
00475     memory_used_ -= documents_[0]->memory_used();
00476     total_pages_ -= documents_[0]->NumPages();
00477     documents_.remove(0);
00478   }
00479   return true;
00480 }
00481 
00482 // Finds and returns a document by name.
00483 DocumentData* DocumentCache::FindDocument(const STRING& document_name) const {
00484   for (int i = 0; i < documents_.size(); ++i) {
00485     if (documents_[i]->document_name() == document_name)
00486       return documents_[i];
00487   }
00488   return NULL;
00489 }
00490 
00491 // Returns a page by serial number, selecting them in a round-robin fashion
00492 // from all the documents.
00493 const ImageData* DocumentCache::GetPageBySerial(int serial) {
00494   int document_index = serial % documents_.size();
00495   return documents_[document_index]->GetPage(serial / documents_.size());
00496 }
00497 
00498 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines