|
tesseract 3.04.01
|
00001 00002 // File: imagedata.h 00003 // Description: Class to hold information about a single multi-page tiff 00004 // training file and its corresponding boxes or text file. 00005 // Author: Ray Smith 00006 // Created: Tue May 28 08:56:06 PST 2013 00007 // 00008 // (C) Copyright 2013, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00019 00020 // Include automatically generated configuration file if running autoconf. 00021 #ifdef HAVE_CONFIG_H 00022 #include "config_auto.h" 00023 #endif 00024 00025 #include "imagedata.h" 00026 00027 #include "allheaders.h" 00028 #include "boxread.h" 00029 #include "callcpp.h" 00030 #include "helpers.h" 00031 #include "tprintf.h" 00032 00033 namespace tesseract { 00034 00035 WordFeature::WordFeature() : x_(0), y_(0), dir_(0) { 00036 } 00037 00038 WordFeature::WordFeature(const FCOORD& fcoord, uinT8 dir) 00039 : x_(IntCastRounded(fcoord.x())), 00040 y_(ClipToRange(IntCastRounded(fcoord.y()), 0, MAX_UINT8)), 00041 dir_(dir) { 00042 } 00043 00044 // Computes the maximum x and y value in the features. 00045 void WordFeature::ComputeSize(const GenericVector<WordFeature>& features, 00046 int* max_x, int* max_y) { 00047 *max_x = 0; 00048 *max_y = 0; 00049 for (int f = 0; f < features.size(); ++f) { 00050 if (features[f].x_ > *max_x) *max_x = features[f].x_; 00051 if (features[f].y_ > *max_y) *max_y = features[f].y_; 00052 } 00053 } 00054 00055 // Draws the features in the given window. 00056 void WordFeature::Draw(const GenericVector<WordFeature>& features, 00057 ScrollView* window) { 00058 #ifndef GRAPHICS_DISABLED 00059 for (int f = 0; f < features.size(); ++f) { 00060 FCOORD pos(features[f].x_, features[f].y_); 00061 FCOORD dir; 00062 dir.from_direction(features[f].dir_); 00063 dir *= 8.0f; 00064 window->SetCursor(IntCastRounded(pos.x() - dir.x()), 00065 IntCastRounded(pos.y() - dir.y())); 00066 window->DrawTo(IntCastRounded(pos.x() + dir.x()), 00067 IntCastRounded(pos.y() + dir.y())); 00068 } 00069 #endif 00070 } 00071 00072 // Writes to the given file. Returns false in case of error. 00073 bool WordFeature::Serialize(FILE* fp) const { 00074 if (fwrite(&x_, sizeof(x_), 1, fp) != 1) return false; 00075 if (fwrite(&y_, sizeof(y_), 1, fp) != 1) return false; 00076 if (fwrite(&dir_, sizeof(dir_), 1, fp) != 1) return false; 00077 return true; 00078 } 00079 // Reads from the given file. Returns false in case of error. 00080 // If swap is true, assumes a big/little-endian swap is needed. 00081 bool WordFeature::DeSerialize(bool swap, FILE* fp) { 00082 if (fread(&x_, sizeof(x_), 1, fp) != 1) return false; 00083 if (swap) ReverseN(&x_, sizeof(x_)); 00084 if (fread(&y_, sizeof(y_), 1, fp) != 1) return false; 00085 if (fread(&dir_, sizeof(dir_), 1, fp) != 1) return false; 00086 return true; 00087 } 00088 00089 void FloatWordFeature::FromWordFeatures( 00090 const GenericVector<WordFeature>& word_features, 00091 GenericVector<FloatWordFeature>* float_features) { 00092 for (int i = 0; i < word_features.size(); ++i) { 00093 FloatWordFeature f; 00094 f.x = word_features[i].x(); 00095 f.y = word_features[i].y(); 00096 f.dir = word_features[i].dir(); 00097 f.x_bucket = 0; // Will set it later. 00098 float_features->push_back(f); 00099 } 00100 } 00101 00102 // Sort function to sort first by x-bucket, then by y. 00103 /* static */ 00104 int FloatWordFeature::SortByXBucket(const void* v1, const void* v2) { 00105 const FloatWordFeature* f1 = reinterpret_cast<const FloatWordFeature*>(v1); 00106 const FloatWordFeature* f2 = reinterpret_cast<const FloatWordFeature*>(v2); 00107 int x_diff = f1->x_bucket - f2->x_bucket; 00108 if (x_diff == 0) return f1->y - f2->y; 00109 return x_diff; 00110 } 00111 00112 ImageData::ImageData() : page_number_(-1), vertical_text_(false) { 00113 } 00114 // Takes ownership of the pix and destroys it. 00115 ImageData::ImageData(bool vertical, Pix* pix) 00116 : page_number_(0), vertical_text_(vertical) { 00117 SetPix(pix); 00118 } 00119 ImageData::~ImageData() { 00120 } 00121 00122 // Builds and returns an ImageData from the basic data. Note that imagedata, 00123 // truth_text, and box_text are all the actual file data, NOT filenames. 00124 ImageData* ImageData::Build(const char* name, int page_number, const char* lang, 00125 const char* imagedata, int imagedatasize, 00126 const char* truth_text, const char* box_text) { 00127 ImageData* image_data = new ImageData(); 00128 image_data->imagefilename_ = name; 00129 image_data->page_number_ = page_number; 00130 image_data->language_ = lang; 00131 // Save the imagedata. 00132 image_data->image_data_.init_to_size(imagedatasize, 0); 00133 memcpy(&image_data->image_data_[0], imagedata, imagedatasize); 00134 if (!image_data->AddBoxes(box_text)) { 00135 if (truth_text == NULL || truth_text[0] == '\0') { 00136 tprintf("Error: No text corresponding to page %d from image %s!\n", 00137 page_number, name); 00138 delete image_data; 00139 return NULL; 00140 } 00141 image_data->transcription_ = truth_text; 00142 // If we have no boxes, the transcription is in the 0th box_texts_. 00143 image_data->box_texts_.push_back(truth_text); 00144 // We will create a box for the whole image on PreScale, to save unpacking 00145 // the image now. 00146 } else if (truth_text != NULL && truth_text[0] != '\0' && 00147 image_data->transcription_ != truth_text) { 00148 // Save the truth text as it is present and disagrees with the box text. 00149 image_data->transcription_ = truth_text; 00150 } 00151 return image_data; 00152 } 00153 00154 // Writes to the given file. Returns false in case of error. 00155 bool ImageData::Serialize(TFile* fp) const { 00156 if (!imagefilename_.Serialize(fp)) return false; 00157 if (fp->FWrite(&page_number_, sizeof(page_number_), 1) != 1) return false; 00158 if (!image_data_.Serialize(fp)) return false; 00159 if (!transcription_.Serialize(fp)) return false; 00160 // WARNING: Will not work across different endian machines. 00161 if (!boxes_.Serialize(fp)) return false; 00162 if (!box_texts_.SerializeClasses(fp)) return false; 00163 inT8 vertical = vertical_text_; 00164 if (fp->FWrite(&vertical, sizeof(vertical), 1) != 1) return false; 00165 return true; 00166 } 00167 00168 // Reads from the given file. Returns false in case of error. 00169 // If swap is true, assumes a big/little-endian swap is needed. 00170 bool ImageData::DeSerialize(bool swap, TFile* fp) { 00171 if (!imagefilename_.DeSerialize(swap, fp)) return false; 00172 if (fp->FRead(&page_number_, sizeof(page_number_), 1) != 1) return false; 00173 if (swap) ReverseN(&page_number_, sizeof(page_number_)); 00174 if (!image_data_.DeSerialize(swap, fp)) return false; 00175 if (!transcription_.DeSerialize(swap, fp)) return false; 00176 // WARNING: Will not work across different endian machines. 00177 if (!boxes_.DeSerialize(swap, fp)) return false; 00178 if (!box_texts_.DeSerializeClasses(swap, fp)) return false; 00179 inT8 vertical = 0; 00180 if (fp->FRead(&vertical, sizeof(vertical), 1) != 1) return false; 00181 vertical_text_ = vertical != 0; 00182 return true; 00183 } 00184 00185 // Saves the given Pix as a PNG-encoded string and destroys it. 00186 void ImageData::SetPix(Pix* pix) { 00187 SetPixInternal(pix, &image_data_); 00188 } 00189 00190 // Returns the Pix image for *this. Must be pixDestroyed after use. 00191 Pix* ImageData::GetPix() const { 00192 return GetPixInternal(image_data_); 00193 } 00194 00195 // Gets anything and everything with a non-NULL pointer, prescaled to a 00196 // given target_height (if 0, then the original image height), and aligned. 00197 // Also returns (if not NULL) the width and height of the scaled image. 00198 // The return value is the scale factor that was applied to the image to 00199 // achieve the target_height. 00200 float ImageData::PreScale(int target_height, Pix** pix, 00201 int* scaled_width, int* scaled_height, 00202 GenericVector<TBOX>* boxes) const { 00203 int input_width = 0; 00204 int input_height = 0; 00205 Pix* src_pix = GetPix(); 00206 ASSERT_HOST(src_pix != NULL); 00207 input_width = pixGetWidth(src_pix); 00208 input_height = pixGetHeight(src_pix); 00209 if (target_height == 0) 00210 target_height = input_height; 00211 float im_factor = static_cast<float>(target_height) / input_height; 00212 if (scaled_width != NULL) 00213 *scaled_width = IntCastRounded(im_factor * input_width); 00214 if (scaled_height != NULL) 00215 *scaled_height = target_height; 00216 if (pix != NULL) { 00217 // Get the scaled image. 00218 pixDestroy(pix); 00219 *pix = pixScale(src_pix, im_factor, im_factor); 00220 if (*pix == NULL) { 00221 tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n", 00222 input_width, input_height, im_factor); 00223 } 00224 if (scaled_width != NULL) 00225 *scaled_width = pixGetWidth(*pix); 00226 if (scaled_height != NULL) 00227 *scaled_height = pixGetHeight(*pix); 00228 } 00229 pixDestroy(&src_pix); 00230 if (boxes != NULL) { 00231 // Get the boxes. 00232 boxes->truncate(0); 00233 for (int b = 0; b < boxes_.size(); ++b) { 00234 TBOX box = boxes_[b]; 00235 box.scale(im_factor); 00236 boxes->push_back(box); 00237 } 00238 if (boxes->empty()) { 00239 // Make a single box for the whole image. 00240 TBOX box(0, 0, im_factor * input_width, target_height); 00241 boxes->push_back(box); 00242 } 00243 } 00244 return im_factor; 00245 } 00246 00247 int ImageData::MemoryUsed() const { 00248 return image_data_.size(); 00249 } 00250 00251 // Draws the data in a new window. 00252 void ImageData::Display() const { 00253 #ifndef GRAPHICS_DISABLED 00254 const int kTextSize = 64; 00255 // Draw the image. 00256 Pix* pix = GetPix(); 00257 if (pix == NULL) return; 00258 int width = pixGetWidth(pix); 00259 int height = pixGetHeight(pix); 00260 ScrollView* win = new ScrollView("Imagedata", 100, 100, 00261 2 * (width + 2 * kTextSize), 00262 2 * (height + 4 * kTextSize), 00263 width + 10, height + 3 * kTextSize, true); 00264 win->Image(pix, 0, height - 1); 00265 pixDestroy(&pix); 00266 // Draw the boxes. 00267 win->Pen(ScrollView::RED); 00268 win->Brush(ScrollView::NONE); 00269 win->TextAttributes("Arial", kTextSize, false, false, false); 00270 for (int b = 0; b < boxes_.size(); ++b) { 00271 boxes_[b].plot(win); 00272 win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string()); 00273 TBOX scaled(boxes_[b]); 00274 scaled.scale(256.0 / height); 00275 scaled.plot(win); 00276 } 00277 // The full transcription. 00278 win->Pen(ScrollView::CYAN); 00279 win->Text(0, height + kTextSize * 2, transcription_.string()); 00280 // Add the features. 00281 win->Pen(ScrollView::GREEN); 00282 win->Update(); 00283 window_wait(win); 00284 #endif 00285 } 00286 00287 // Adds the supplied boxes and transcriptions that correspond to the correct 00288 // page number. 00289 void ImageData::AddBoxes(const GenericVector<TBOX>& boxes, 00290 const GenericVector<STRING>& texts, 00291 const GenericVector<int>& box_pages) { 00292 // Copy the boxes and make the transcription. 00293 for (int i = 0; i < box_pages.size(); ++i) { 00294 if (page_number_ >= 0 && box_pages[i] != page_number_) continue; 00295 transcription_ += texts[i]; 00296 boxes_.push_back(boxes[i]); 00297 box_texts_.push_back(texts[i]); 00298 } 00299 } 00300 00301 // Saves the given Pix as a PNG-encoded string and destroys it. 00302 void ImageData::SetPixInternal(Pix* pix, GenericVector<char>* image_data) { 00303 l_uint8* data; 00304 size_t size; 00305 pixWriteMem(&data, &size, pix, IFF_PNG); 00306 pixDestroy(&pix); 00307 image_data->init_to_size(size, 0); 00308 memcpy(&(*image_data)[0], data, size); 00309 free(data); 00310 } 00311 00312 // Returns the Pix image for the image_data. Must be pixDestroyed after use. 00313 Pix* ImageData::GetPixInternal(const GenericVector<char>& image_data) { 00314 Pix* pix = NULL; 00315 if (!image_data.empty()) { 00316 // Convert the array to an image. 00317 const unsigned char* u_data = 00318 reinterpret_cast<const unsigned char*>(&image_data[0]); 00319 pix = pixReadMem(u_data, image_data.size()); 00320 } 00321 return pix; 00322 } 00323 00324 // Parses the text string as a box file and adds any discovered boxes that 00325 // match the page number. Returns false on error. 00326 bool ImageData::AddBoxes(const char* box_text) { 00327 if (box_text != NULL && box_text[0] != '\0') { 00328 GenericVector<TBOX> boxes; 00329 GenericVector<STRING> texts; 00330 GenericVector<int> box_pages; 00331 if (ReadMemBoxes(page_number_, false, box_text, &boxes, 00332 &texts, NULL, &box_pages)) { 00333 AddBoxes(boxes, texts, box_pages); 00334 return true; 00335 } else { 00336 tprintf("Error: No boxes for page %d from image %s!\n", 00337 page_number_, imagefilename_.string()); 00338 } 00339 } 00340 return false; 00341 } 00342 00343 DocumentData::DocumentData(const STRING& name) 00344 : document_name_(name), pages_offset_(0), total_pages_(0), 00345 memory_used_(0), max_memory_(0), reader_(NULL) {} 00346 00347 DocumentData::~DocumentData() {} 00348 00349 // Reads all the pages in the given lstmf filename to the cache. The reader 00350 // is used to read the file. 00351 bool DocumentData::LoadDocument(const char* filename, const char* lang, 00352 int start_page, inT64 max_memory, 00353 FileReader reader) { 00354 document_name_ = filename; 00355 lang_ = lang; 00356 pages_offset_ = start_page; 00357 max_memory_ = max_memory; 00358 reader_ = reader; 00359 return ReCachePages(); 00360 } 00361 00362 // Writes all the pages to the given filename. Returns false on error. 00363 bool DocumentData::SaveDocument(const char* filename, FileWriter writer) { 00364 TFile fp; 00365 fp.OpenWrite(NULL); 00366 if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) { 00367 tprintf("Serialize failed: %s\n", filename); 00368 return false; 00369 } 00370 return true; 00371 } 00372 bool DocumentData::SaveToBuffer(GenericVector<char>* buffer) { 00373 TFile fp; 00374 fp.OpenWrite(buffer); 00375 return pages_.Serialize(&fp); 00376 } 00377 00378 // Returns a pointer to the page with the given index, modulo the total 00379 // number of pages, recaching if needed. 00380 const ImageData* DocumentData::GetPage(int index) { 00381 index = Modulo(index, total_pages_); 00382 if (index < pages_offset_ || index >= pages_offset_ + pages_.size()) { 00383 pages_offset_ = index; 00384 if (!ReCachePages()) return NULL; 00385 } 00386 return pages_[index - pages_offset_]; 00387 } 00388 00389 // Loads as many pages can fit in max_memory_ starting at index pages_offset_. 00390 bool DocumentData::ReCachePages() { 00391 // Read the file. 00392 TFile fp; 00393 if (!fp.Open(document_name_, reader_)) return false; 00394 memory_used_ = 0; 00395 if (!pages_.DeSerialize(false, &fp)) { 00396 tprintf("Deserialize failed: %s\n", document_name_.string()); 00397 pages_.truncate(0); 00398 return false; 00399 } 00400 total_pages_ = pages_.size(); 00401 pages_offset_ %= total_pages_; 00402 // Delete pages before the first one we want, and relocate the rest. 00403 int page; 00404 for (page = 0; page < pages_.size(); ++page) { 00405 if (page < pages_offset_) { 00406 delete pages_[page]; 00407 pages_[page] = NULL; 00408 } else { 00409 ImageData* image_data = pages_[page]; 00410 if (max_memory_ > 0 && page > pages_offset_ && 00411 memory_used_ + image_data->MemoryUsed() > max_memory_) 00412 break; // Don't go over memory quota unless the first image. 00413 if (image_data->imagefilename().length() == 0) { 00414 image_data->set_imagefilename(document_name_); 00415 image_data->set_page_number(page); 00416 } 00417 image_data->set_language(lang_); 00418 memory_used_ += image_data->MemoryUsed(); 00419 if (pages_offset_ != 0) { 00420 pages_[page - pages_offset_] = image_data; 00421 pages_[page] = NULL; 00422 } 00423 } 00424 } 00425 pages_.truncate(page - pages_offset_); 00426 tprintf("Loaded %d/%d pages (%d-%d) of document %s\n", 00427 pages_.size(), total_pages_, pages_offset_, 00428 pages_offset_ + pages_.size(), document_name_.string()); 00429 return !pages_.empty(); 00430 } 00431 00432 // Adds the given page data to this document, counting up memory. 00433 void DocumentData::AddPageToDocument(ImageData* page) { 00434 pages_.push_back(page); 00435 memory_used_ += page->MemoryUsed(); 00436 } 00437 00438 // A collection of DocumentData that knows roughly how much memory it is using. 00439 DocumentCache::DocumentCache(inT64 max_memory) 00440 : total_pages_(0), memory_used_(0), max_memory_(max_memory) {} 00441 DocumentCache::~DocumentCache() {} 00442 00443 // Adds all the documents in the list of filenames, counting memory. 00444 // The reader is used to read the files. 00445 bool DocumentCache::LoadDocuments(const GenericVector<STRING>& filenames, 00446 const char* lang, FileReader reader) { 00447 inT64 fair_share_memory = max_memory_ / filenames.size(); 00448 for (int arg = 0; arg < filenames.size(); ++arg) { 00449 STRING filename = filenames[arg]; 00450 DocumentData* document = new DocumentData(filename); 00451 if (document->LoadDocument(filename.string(), lang, 0, 00452 fair_share_memory, reader)) { 00453 AddToCache(document); 00454 } else { 00455 tprintf("Failed to load image %s!\n", filename.string()); 00456 delete document; 00457 } 00458 } 00459 tprintf("Loaded %d pages, total %gMB\n", 00460 total_pages_, memory_used_ / 1048576.0); 00461 return total_pages_ > 0; 00462 } 00463 00464 // Adds document to the cache, throwing out other documents if needed. 00465 bool DocumentCache::AddToCache(DocumentData* data) { 00466 inT64 new_memory = data->memory_used(); 00467 memory_used_ += new_memory; 00468 documents_.push_back(data); 00469 total_pages_ += data->NumPages(); 00470 // Delete the first item in the array, and other pages of the same name 00471 // while memory is full. 00472 while (memory_used_ >= max_memory_ && max_memory_ > 0) { 00473 tprintf("Memory used=%lld vs max=%lld, discarding doc of size %lld\n", 00474 memory_used_ , max_memory_, documents_[0]->memory_used()); 00475 memory_used_ -= documents_[0]->memory_used(); 00476 total_pages_ -= documents_[0]->NumPages(); 00477 documents_.remove(0); 00478 } 00479 return true; 00480 } 00481 00482 // Finds and returns a document by name. 00483 DocumentData* DocumentCache::FindDocument(const STRING& document_name) const { 00484 for (int i = 0; i < documents_.size(); ++i) { 00485 if (documents_[i]->document_name() == document_name) 00486 return documents_[i]; 00487 } 00488 return NULL; 00489 } 00490 00491 // Returns a page by serial number, selecting them in a round-robin fashion 00492 // from all the documents. 00493 const ImageData* DocumentCache::GetPageBySerial(int serial) { 00494 int document_index = serial % documents_.size(); 00495 return documents_[document_index]->GetPage(serial / documents_.size()); 00496 } 00497 00498 } // namespace tesseract.