tesseract 3.04.01

training/boxchar.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        boxchar.cpp
00003  * Description: Simple class to associate a Tesseract classification unit with
00004  *              its bounding box so that the boxes can be rotated as the image
00005  *              is rotated for degradation.  Also includes routines to output
00006  *              the character-tagged boxes to a boxfile.
00007  * Author:      Ray Smith
00008  * Created:     Mon Nov 18 2013
00009  *
00010  * (C) Copyright 2013, Google Inc.
00011  * Licensed under the Apache License, Version 2.0 (the "License");
00012  * you may not use this file except in compliance with the License.
00013  * You may obtain a copy of the License at
00014  * http://www.apache.org/licenses/LICENSE-2.0
00015  * Unless required by applicable law or agreed to in writing, software
00016  * distributed under the License is distributed on an "AS IS" BASIS,
00017  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00018  * See the License for the specific language governing permissions and
00019  * limitations under the License.
00020  *
00021  **********************************************************************/
00022 
00023 #include "boxchar.h"
00024 
00025 #include <stddef.h>
00026 #include <algorithm>
00027 
00028 #include "fileio.h"
00029 #include "genericvector.h"
00030 #include "ndminx.h"
00031 #include "normstrngs.h"
00032 #include "tprintf.h"
00033 #include "unicharset.h"
00034 #include "unicode/uchar.h"  // from libicu
00035 
// Dominance ratio used by MostlyVertical: the step between two consecutive
// boxes only counts towards the horizontal/vertical decision when |dx| exceeds
// |dy| (or |dy| exceeds |dx|) by more than this factor.
const int kMinNewlineRatio = 5;
00038 
00039 namespace tesseract {
00040 
00041 BoxChar::BoxChar(const char* utf8_str, int len) : ch_(utf8_str, len) {
00042   box_ = NULL;
00043 }
00044 
00045 BoxChar::~BoxChar() { boxDestroy(&box_); }
00046 
00047 void BoxChar::AddBox(int x, int y, int width, int height) {
00048   box_ = boxCreate(x, y, width, height);
00049 }
00050 
00051 /* static */
00052 void BoxChar::TranslateBoxes(int xshift, int yshift, vector<BoxChar*>* boxes) {
00053   for (int i = 0; i < boxes->size(); ++i) {
00054     BOX* box = (*boxes)[i]->box_;
00055     if (box != NULL) {
00056       box->x += xshift;
00057       box->y += yshift;
00058     }
00059   }
00060 }
00061 
00062 // Prepares for writing the boxes to a file by inserting newlines, spaces,
00063 // and re-ordering so the boxes are strictly left-to-right.
00064 /* static */
00065 void BoxChar::PrepareToWrite(vector<BoxChar*>* boxes) {
00066   bool rtl_rules = ContainsMostlyRTL(*boxes);
00067   bool vertical_rules = MostlyVertical(*boxes);
00068   InsertNewlines(rtl_rules, vertical_rules, boxes);
00069   InsertSpaces(rtl_rules, vertical_rules, boxes);
00070   for (int i = 0; i < boxes->size(); ++i) {
00071     if ((*boxes)[i]->box_ == NULL) tprintf("Null box at index %d\n", i);
00072   }
00073   if (rtl_rules) {
00074     ReorderRTLText(boxes);
00075   }
00076   tprintf("Rtl = %d ,vertical=%d\n", rtl_rules, vertical_rules);
00077 }
00078 
00079 // Inserts newline (tab) characters into the vector at newline positions.
00080 /* static */
00081 void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules,
00082                              vector<BoxChar*>* boxes) {
00083   int prev_i = -1;
00084   int max_shift = 0;
00085   for (int i = 0; i < boxes->size(); ++i) {
00086     Box* box = (*boxes)[i]->box_;
00087     if (box == NULL) {
00088       if (prev_i < 0 || prev_i < i - 1 || i + 1 == boxes->size()) {
00089         // Erase null boxes at the start of a line and after another null box.
00090         do {
00091           delete (*boxes)[i];
00092           boxes->erase(boxes->begin() + i);
00093           --i;
00094         } while (i >= 0 && i + 1 == boxes->size() && (*boxes)[i]->box_ == NULL);
00095       }
00096       continue;
00097     }
00098     if (prev_i >= 0) {
00099       Box* prev_box = (*boxes)[prev_i]->box_;
00100       int shift = box->x - prev_box->x;
00101       if (vertical_rules) {
00102         shift = box->y - prev_box->y;
00103       } else if (rtl_rules) {
00104         shift = -shift;
00105       }
00106       if (-shift > max_shift) {
00107         // This is a newline.
00108         int width = prev_box->w;
00109         int height = prev_box->h;
00110         int x = prev_box->x + width;
00111         int y = prev_box->y;
00112         if (vertical_rules) {
00113           x = prev_box->x;
00114           y = prev_box->y + height;
00115         } else if (rtl_rules) {
00116           x = prev_box->x - width;
00117           if (x < 0) {
00118             tprintf("prev x = %d, width=%d\n", prev_box->x, width);
00119             x = 0;
00120           }
00121         }
00122         if (prev_i == i - 1) {
00123           // New character needed.
00124           BoxChar* new_box = new BoxChar("\t", 1);
00125           new_box->AddBox(x, y, width, height);
00126           new_box->page_ = (*boxes)[i]->page_;
00127           boxes->insert(boxes->begin() + i, new_box);
00128           ++i;
00129         } else {
00130           (*boxes)[i - 1]->AddBox(x, y, width, height);
00131           (*boxes)[i - 1]->ch_ = "\t";
00132         }
00133         max_shift = 0;
00134       } else if (shift > max_shift) {
00135         max_shift = shift;
00136       }
00137     }
00138     prev_i = i;
00139   }
00140 }
00141 
00142 // Converts NULL boxes to space characters, with appropriate bounding boxes.
00143 /* static */
00144 void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules,
00145                            vector<BoxChar*>* boxes) {
00146   // After InsertNewlines, any remaining null boxes are not newlines, and are
00147   // singletons, so add a box to each remaining null box.
00148   for (int i = 1; i + 1 < boxes->size(); ++i) {
00149     Box* box = (*boxes)[i]->box_;
00150     if (box == NULL) {
00151       Box* prev = (*boxes)[i - 1]->box_;
00152       Box* next = (*boxes)[i + 1]->box_;
00153       ASSERT_HOST(prev != NULL && next != NULL);
00154       int top = MIN(prev->y, next->y);
00155       int bottom = MAX(prev->y + prev->h, next->y + next->h);
00156       int left = prev->x + prev->w;
00157       int right = next->x;
00158       if (vertical_rules) {
00159         top = prev->y + prev->h;
00160         bottom = next->y;
00161         left = MIN(prev->x, next->x);
00162         right = MAX(prev->x + prev->w, next->x + next->w);
00163       } else if (rtl_rules) {
00164         // With RTL we have to account for BiDi.
00165         // Right becomes the min left of all prior boxes back to the first
00166         // space or newline.
00167         right = prev->x;
00168         left = next->x + next->w;
00169         for (int j = i - 2;
00170              j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t";
00171              --j) {
00172           prev = (*boxes)[j]->box_;
00173           ASSERT_HOST(prev != NULL);
00174           if (prev->x < right) {
00175             right = prev->x;
00176           }
00177         }
00178         // Left becomes the max right of all next boxes forward to the first
00179         // space or newline.
00180         for (int j = i + 2; j < boxes->size() && (*boxes)[j]->box_ != NULL &&
00181                                 (*boxes)[j]->ch_ != "\t";
00182              ++j) {
00183           next = (*boxes)[j]->box_;
00184           if (next->x + next->w > left) {
00185             left = next->x + next->w;
00186           }
00187         }
00188       }
00189       // Italic and stylized characters can produce negative spaces, which
00190       // Leptonica doesn't like, so clip to a positive size.
00191       if (right <= left) right = left + 1;
00192       if (bottom <= top) bottom = top + 1;
00193       (*boxes)[i]->AddBox(left, top, right - left, bottom - top);
00194       (*boxes)[i]->ch_ = " ";
00195     }
00196   }
00197 }
00198 
00199 // Reorders text in a right-to-left script in left-to-right order.
00200 /* static */
00201 void BoxChar::ReorderRTLText(vector<BoxChar*>* boxes) {
00202   // After adding newlines and spaces, this task is simply a matter of sorting
00203   // by left each group of boxes between newlines.
00204   BoxCharPtrSort sorter;
00205   int end = 0;
00206   for (int start = 0; start < boxes->size(); start = end + 1) {
00207     end = start + 1;
00208     while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end;
00209     sort(boxes->begin() + start, boxes->begin() + end, sorter);
00210   }
00211 }
00212 
00213 // Returns true if the vector contains mostly RTL characters.
00214 /* static */
00215 bool BoxChar::ContainsMostlyRTL(const vector<BoxChar*>& boxes) {
00216   int num_rtl = 0, num_ltr = 0;
00217   for (int i = 0; i < boxes.size(); ++i) {
00218     // Convert the unichar to UTF32 representation
00219     GenericVector<char32> uni_vector;
00220     if (!UNICHAR::UTF8ToUnicode(boxes[i]->ch_.c_str(), &uni_vector)) {
00221       tprintf("Illegal utf8 in boxchar %d string:%s = ", i,
00222               boxes[i]->ch_.c_str());
00223       for (int c = 0; c < boxes[i]->ch_.size(); ++c) {
00224         tprintf(" 0x%x", boxes[i]->ch_[c]);
00225       }
00226       tprintf("\n");
00227       continue;
00228     }
00229     for (int j = 0; j < uni_vector.size(); ++j) {
00230       UCharDirection dir = u_charDirection(uni_vector[j]);
00231       if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC ||
00232           dir == U_ARABIC_NUMBER) {
00233         ++num_rtl;
00234       } else {
00235         ++num_ltr;
00236       }
00237     }
00238   }
00239   return num_rtl > num_ltr;
00240 }
00241 
00242 // Returns true if the text is mostly laid out vertically.
00243 /* static */
00244 bool BoxChar::MostlyVertical(const vector<BoxChar*>& boxes) {
00245   inT64 total_dx = 0, total_dy = 0;
00246   for (int i = 1; i < boxes.size(); ++i) {
00247     if (boxes[i - 1]->box_ != NULL && boxes[i]->box_ != NULL &&
00248         boxes[i - 1]->page_ == boxes[i]->page_) {
00249       int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
00250       int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
00251       if (abs(dx) > abs(dy) * kMinNewlineRatio ||
00252           abs(dy) > abs(dx) * kMinNewlineRatio) {
00253         total_dx += dx * dx;
00254         total_dy += dy * dy;
00255       }
00256     }
00257   }
00258   return total_dy > total_dx;
00259 }
00260 
00261 // Returns the total length of all the strings in the boxes.
00262 /* static */
00263 int BoxChar::TotalByteLength(const vector<BoxChar*>& boxes) {
00264   int total_length = 0;
00265   for (int i = 0; i < boxes.size(); ++i) total_length += boxes[i]->ch_.size();
00266   return total_length;
00267 }
00268 
00269 // Rotate the boxes in [start_box, end_box) by the given rotation.
00270 // The rotation is in radians clockwise about the given center.
00271 /* static */
00272 void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter,
00273                           int start_box, int end_box, vector<BoxChar*>* boxes) {
00274   Boxa* orig = boxaCreate(0);
00275   for (int i = start_box; i < end_box; ++i) {
00276     BOX* box = (*boxes)[i]->box_;
00277     if (box) boxaAddBox(orig, box, L_CLONE);
00278   }
00279   Boxa* rotated = boxaRotate(orig, xcenter, ycenter, rotation);
00280   boxaDestroy(&orig);
00281   for (int i = start_box, box_ind = 0; i < end_box; ++i) {
00282     if ((*boxes)[i]->box_) {
00283       boxDestroy(&((*boxes)[i]->box_));
00284       (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
00285     }
00286   }
00287   boxaDestroy(&rotated);
00288 }
00289 
00290 const int kMaxLineLength = 1024;
00291 /* static */
00292 void BoxChar::WriteTesseractBoxFile(const string& filename, int height,
00293                                     const vector<BoxChar*>& boxes) {
00294   string output;
00295   char buffer[kMaxLineLength];
00296   for (int i = 0; i < boxes.size(); ++i) {
00297     const Box* box = boxes[i]->box_;
00298     if (box == NULL) {
00299       tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
00300       return;
00301     }
00302     int nbytes =
00303         snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n",
00304                  boxes[i]->ch_.c_str(), box->x, height - box->y - box->h,
00305                  box->x + box->w, height - box->y, boxes[i]->page_);
00306     output.append(buffer, nbytes);
00307   }
00308   File::WriteStringToFileOrDie(output, filename);
00309 }
00310 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines