|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: boxchar.cpp 00003 * Description: Simple class to associate a Tesseract classification unit with 00004 * its bounding box so that the boxes can be rotated as the image 00005 * is rotated for degradation. Also includes routines to output 00006 * the character-tagged boxes to a boxfile. 00007 * Author: Ray Smith 00008 * Created: Mon Nov 18 2013 00009 * 00010 * (C) Copyright 2013, Google Inc. 00011 * Licensed under the Apache License, Version 2.0 (the "License"); 00012 * you may not use this file except in compliance with the License. 00013 * You may obtain a copy of the License at 00014 * http://www.apache.org/licenses/LICENSE-2.0 00015 * Unless required by applicable law or agreed to in writing, software 00016 * distributed under the License is distributed on an "AS IS" BASIS, 00017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00018 * See the License for the specific language governing permissions and 00019 * limitations under the License. 00020 * 00021 **********************************************************************/ 00022 00023 #include "boxchar.h" 00024 00025 #include <stddef.h> 00026 #include <algorithm> 00027 00028 #include "fileio.h" 00029 #include "genericvector.h" 00030 #include "ndminx.h" 00031 #include "normstrngs.h" 00032 #include "tprintf.h" 00033 #include "unicharset.h" 00034 #include "unicode/uchar.h" // from libicu 00035 00036 // Absolute Ratio of dx:dy or dy:dx to be a newline. 00037 const int kMinNewlineRatio = 5; 00038 00039 namespace tesseract { 00040 00041 BoxChar::BoxChar(const char* utf8_str, int len) : ch_(utf8_str, len) { 00042 box_ = NULL; 00043 } 00044 00045 BoxChar::~BoxChar() { boxDestroy(&box_); } 00046 00047 void BoxChar::AddBox(int x, int y, int width, int height) { 00048 box_ = boxCreate(x, y, width, height); 00049 } 00050 00051 /* static */ 00052 void BoxChar::TranslateBoxes(int xshift, int yshift, vector<BoxChar*>* boxes) { 00053 for (int i = 0; i < boxes->size(); ++i) { 00054 BOX* box = (*boxes)[i]->box_; 00055 if (box != NULL) { 00056 box->x += xshift; 00057 box->y += yshift; 00058 } 00059 } 00060 } 00061 00062 // Prepares for writing the boxes to a file by inserting newlines, spaces, 00063 // and re-ordering so the boxes are strictly left-to-right. 00064 /* static */ 00065 void BoxChar::PrepareToWrite(vector<BoxChar*>* boxes) { 00066 bool rtl_rules = ContainsMostlyRTL(*boxes); 00067 bool vertical_rules = MostlyVertical(*boxes); 00068 InsertNewlines(rtl_rules, vertical_rules, boxes); 00069 InsertSpaces(rtl_rules, vertical_rules, boxes); 00070 for (int i = 0; i < boxes->size(); ++i) { 00071 if ((*boxes)[i]->box_ == NULL) tprintf("Null box at index %d\n", i); 00072 } 00073 if (rtl_rules) { 00074 ReorderRTLText(boxes); 00075 } 00076 tprintf("Rtl = %d ,vertical=%d\n", rtl_rules, vertical_rules); 00077 } 00078 00079 // Inserts newline (tab) characters into the vector at newline positions. 00080 /* static */ 00081 void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules, 00082 vector<BoxChar*>* boxes) { 00083 int prev_i = -1; 00084 int max_shift = 0; 00085 for (int i = 0; i < boxes->size(); ++i) { 00086 Box* box = (*boxes)[i]->box_; 00087 if (box == NULL) { 00088 if (prev_i < 0 || prev_i < i - 1 || i + 1 == boxes->size()) { 00089 // Erase null boxes at the start of a line and after another null box. 00090 do { 00091 delete (*boxes)[i]; 00092 boxes->erase(boxes->begin() + i); 00093 --i; 00094 } while (i >= 0 && i + 1 == boxes->size() && (*boxes)[i]->box_ == NULL); 00095 } 00096 continue; 00097 } 00098 if (prev_i >= 0) { 00099 Box* prev_box = (*boxes)[prev_i]->box_; 00100 int shift = box->x - prev_box->x; 00101 if (vertical_rules) { 00102 shift = box->y - prev_box->y; 00103 } else if (rtl_rules) { 00104 shift = -shift; 00105 } 00106 if (-shift > max_shift) { 00107 // This is a newline. 00108 int width = prev_box->w; 00109 int height = prev_box->h; 00110 int x = prev_box->x + width; 00111 int y = prev_box->y; 00112 if (vertical_rules) { 00113 x = prev_box->x; 00114 y = prev_box->y + height; 00115 } else if (rtl_rules) { 00116 x = prev_box->x - width; 00117 if (x < 0) { 00118 tprintf("prev x = %d, width=%d\n", prev_box->x, width); 00119 x = 0; 00120 } 00121 } 00122 if (prev_i == i - 1) { 00123 // New character needed. 00124 BoxChar* new_box = new BoxChar("\t", 1); 00125 new_box->AddBox(x, y, width, height); 00126 new_box->page_ = (*boxes)[i]->page_; 00127 boxes->insert(boxes->begin() + i, new_box); 00128 ++i; 00129 } else { 00130 (*boxes)[i - 1]->AddBox(x, y, width, height); 00131 (*boxes)[i - 1]->ch_ = "\t"; 00132 } 00133 max_shift = 0; 00134 } else if (shift > max_shift) { 00135 max_shift = shift; 00136 } 00137 } 00138 prev_i = i; 00139 } 00140 } 00141 00142 // Converts NULL boxes to space characters, with appropriate bounding boxes. 00143 /* static */ 00144 void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, 00145 vector<BoxChar*>* boxes) { 00146 // After InsertNewlines, any remaining null boxes are not newlines, and are 00147 // singletons, so add a box to each remaining null box. 00148 for (int i = 1; i + 1 < boxes->size(); ++i) { 00149 Box* box = (*boxes)[i]->box_; 00150 if (box == NULL) { 00151 Box* prev = (*boxes)[i - 1]->box_; 00152 Box* next = (*boxes)[i + 1]->box_; 00153 ASSERT_HOST(prev != NULL && next != NULL); 00154 int top = MIN(prev->y, next->y); 00155 int bottom = MAX(prev->y + prev->h, next->y + next->h); 00156 int left = prev->x + prev->w; 00157 int right = next->x; 00158 if (vertical_rules) { 00159 top = prev->y + prev->h; 00160 bottom = next->y; 00161 left = MIN(prev->x, next->x); 00162 right = MAX(prev->x + prev->w, next->x + next->w); 00163 } else if (rtl_rules) { 00164 // With RTL we have to account for BiDi. 00165 // Right becomes the min left of all prior boxes back to the first 00166 // space or newline. 00167 right = prev->x; 00168 left = next->x + next->w; 00169 for (int j = i - 2; 00170 j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t"; 00171 --j) { 00172 prev = (*boxes)[j]->box_; 00173 ASSERT_HOST(prev != NULL); 00174 if (prev->x < right) { 00175 right = prev->x; 00176 } 00177 } 00178 // Left becomes the max right of all next boxes forward to the first 00179 // space or newline. 00180 for (int j = i + 2; j < boxes->size() && (*boxes)[j]->box_ != NULL && 00181 (*boxes)[j]->ch_ != "\t"; 00182 ++j) { 00183 next = (*boxes)[j]->box_; 00184 if (next->x + next->w > left) { 00185 left = next->x + next->w; 00186 } 00187 } 00188 } 00189 // Italic and stylized characters can produce negative spaces, which 00190 // Leptonica doesn't like, so clip to a positive size. 00191 if (right <= left) right = left + 1; 00192 if (bottom <= top) bottom = top + 1; 00193 (*boxes)[i]->AddBox(left, top, right - left, bottom - top); 00194 (*boxes)[i]->ch_ = " "; 00195 } 00196 } 00197 } 00198 00199 // Reorders text in a right-to-left script in left-to-right order. 00200 /* static */ 00201 void BoxChar::ReorderRTLText(vector<BoxChar*>* boxes) { 00202 // After adding newlines and spaces, this task is simply a matter of sorting 00203 // by left each group of boxes between newlines. 00204 BoxCharPtrSort sorter; 00205 int end = 0; 00206 for (int start = 0; start < boxes->size(); start = end + 1) { 00207 end = start + 1; 00208 while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end; 00209 sort(boxes->begin() + start, boxes->begin() + end, sorter); 00210 } 00211 } 00212 00213 // Returns true if the vector contains mostly RTL characters. 00214 /* static */ 00215 bool BoxChar::ContainsMostlyRTL(const vector<BoxChar*>& boxes) { 00216 int num_rtl = 0, num_ltr = 0; 00217 for (int i = 0; i < boxes.size(); ++i) { 00218 // Convert the unichar to UTF32 representation 00219 GenericVector<char32> uni_vector; 00220 if (!UNICHAR::UTF8ToUnicode(boxes[i]->ch_.c_str(), &uni_vector)) { 00221 tprintf("Illegal utf8 in boxchar %d string:%s = ", i, 00222 boxes[i]->ch_.c_str()); 00223 for (int c = 0; c < boxes[i]->ch_.size(); ++c) { 00224 tprintf(" 0x%x", boxes[i]->ch_[c]); 00225 } 00226 tprintf("\n"); 00227 continue; 00228 } 00229 for (int j = 0; j < uni_vector.size(); ++j) { 00230 UCharDirection dir = u_charDirection(uni_vector[j]); 00231 if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC || 00232 dir == U_ARABIC_NUMBER) { 00233 ++num_rtl; 00234 } else { 00235 ++num_ltr; 00236 } 00237 } 00238 } 00239 return num_rtl > num_ltr; 00240 } 00241 00242 // Returns true if the text is mostly laid out vertically. 00243 /* static */ 00244 bool BoxChar::MostlyVertical(const vector<BoxChar*>& boxes) { 00245 inT64 total_dx = 0, total_dy = 0; 00246 for (int i = 1; i < boxes.size(); ++i) { 00247 if (boxes[i - 1]->box_ != NULL && boxes[i]->box_ != NULL && 00248 boxes[i - 1]->page_ == boxes[i]->page_) { 00249 int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x; 00250 int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y; 00251 if (abs(dx) > abs(dy) * kMinNewlineRatio || 00252 abs(dy) > abs(dx) * kMinNewlineRatio) { 00253 total_dx += dx * dx; 00254 total_dy += dy * dy; 00255 } 00256 } 00257 } 00258 return total_dy > total_dx; 00259 } 00260 00261 // Returns the total length of all the strings in the boxes. 00262 /* static */ 00263 int BoxChar::TotalByteLength(const vector<BoxChar*>& boxes) { 00264 int total_length = 0; 00265 for (int i = 0; i < boxes.size(); ++i) total_length += boxes[i]->ch_.size(); 00266 return total_length; 00267 } 00268 00269 // Rotate the boxes in [start_box, end_box) by the given rotation. 00270 // The rotation is in radians clockwise about the given center. 00271 /* static */ 00272 void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter, 00273 int start_box, int end_box, vector<BoxChar*>* boxes) { 00274 Boxa* orig = boxaCreate(0); 00275 for (int i = start_box; i < end_box; ++i) { 00276 BOX* box = (*boxes)[i]->box_; 00277 if (box) boxaAddBox(orig, box, L_CLONE); 00278 } 00279 Boxa* rotated = boxaRotate(orig, xcenter, ycenter, rotation); 00280 boxaDestroy(&orig); 00281 for (int i = start_box, box_ind = 0; i < end_box; ++i) { 00282 if ((*boxes)[i]->box_) { 00283 boxDestroy(&((*boxes)[i]->box_)); 00284 (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE); 00285 } 00286 } 00287 boxaDestroy(&rotated); 00288 } 00289 00290 const int kMaxLineLength = 1024; 00291 /* static */ 00292 void BoxChar::WriteTesseractBoxFile(const string& filename, int height, 00293 const vector<BoxChar*>& boxes) { 00294 string output; 00295 char buffer[kMaxLineLength]; 00296 for (int i = 0; i < boxes.size(); ++i) { 00297 const Box* box = boxes[i]->box_; 00298 if (box == NULL) { 00299 tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n"); 00300 return; 00301 } 00302 int nbytes = 00303 snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n", 00304 boxes[i]->ch_.c_str(), box->x, height - box->y - box->h, 00305 box->x + box->w, height - box->y, boxes[i]->page_); 00306 output.append(buffer, nbytes); 00307 } 00308 File::WriteStringToFileOrDie(output, filename); 00309 } 00310 } // namespace tesseract