// tesseract 3.04.01
00001 00002 // File: resultiterator.cpp 00003 // Description: Iterator for tesseract results that is capable of 00004 // iterating in proper reading order over Bi Directional 00005 // (e.g. mixed Hebrew and English) text. 00006 // Author: David Eger 00007 // Created: Fri May 27 13:58:06 PST 2011 00008 // 00009 // (C) Copyright 2011, Google Inc. 00010 // Licensed under the Apache License, Version 2.0 (the "License"); 00011 // you may not use this file except in compliance with the License. 00012 // You may obtain a copy of the License at 00013 // http://www.apache.org/licenses/LICENSE-2.0 00014 // Unless required by applicable law or agreed to in writing, software 00015 // distributed under the License is distributed on an "AS IS" BASIS, 00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00017 // See the License for the specific language governing permissions and 00018 // limitations under the License. 00019 // 00021 00022 #include "resultiterator.h" 00023 00024 #include "allheaders.h" 00025 #include "pageres.h" 00026 #include "strngs.h" 00027 #include "tesseractclass.h" 00028 #include "unicharset.h" 00029 #include "unicodes.h" 00030 00031 namespace tesseract { 00032 00033 ResultIterator::ResultIterator(const LTRResultIterator &resit) 00034 : LTRResultIterator(resit) { 00035 in_minor_direction_ = false; 00036 at_beginning_of_minor_run_ = false; 00037 preserve_interword_spaces_ = false; 00038 00039 BoolParam *p = ParamUtils::FindParam<BoolParam>( 00040 "preserve_interword_spaces", GlobalParams()->bool_params, 00041 tesseract_->params()->bool_params); 00042 if (p != NULL) preserve_interword_spaces_ = (bool)(*p); 00043 00044 current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); 00045 MoveToLogicalStartOfTextline(); 00046 } 00047 00048 ResultIterator *ResultIterator::StartOfParagraph( 00049 const LTRResultIterator &resit) { 00050 return new ResultIterator(resit); 00051 } 00052 00053 bool ResultIterator::ParagraphIsLtr() const { 00054 return 
current_paragraph_is_ltr_; 00055 } 00056 00057 bool ResultIterator::CurrentParagraphIsLtr() const { 00058 if (!it_->word()) 00059 return true; // doesn't matter. 00060 LTRResultIterator it(*this); 00061 it.RestartParagraph(); 00062 // Try to figure out the ltr-ness of the paragraph. The rules below 00063 // make more sense in the context of a difficult paragraph example. 00064 // Here we denote {ltr characters, RTL CHARACTERS}: 00065 // 00066 // "don't go in there!" DAIS EH 00067 // EHT OTNI DEPMUJ FELSMIH NEHT DNA 00068 // .GNIDLIUB GNINRUB 00069 // 00070 // On the first line, the left-most word is LTR and the rightmost word 00071 // is RTL. Thus, we are better off taking the majority direction for 00072 // the whole paragraph contents. So instead of "the leftmost word is LTR" 00073 // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs 00074 // would not do: Typically an RTL paragraph would *not* start with an LTR 00075 // word. So our heuristics are as follows: 00076 // 00077 // (1) If the first text line has an RTL word in the left-most position 00078 // it is RTL. 00079 // (2) If the first text line has an LTR word in the right-most position 00080 // it is LTR. 00081 // (3) If neither of the above is true, take the majority count for the 00082 // paragraph -- if there are more rtl words, it is RTL. If there 00083 // are more LTR words, it's LTR. 00084 bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT; 00085 bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT; 00086 int num_ltr, num_rtl; 00087 num_rtl = leftmost_rtl ? 1 : 0; 00088 num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0; 00089 for (it.Next(RIL_WORD); 00090 !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE); 00091 it.Next(RIL_WORD)) { 00092 StrongScriptDirection dir = it.WordDirection(); 00093 rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT); 00094 num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0; 00095 num_ltr += rightmost_ltr ? 
1 : 0; 00096 } 00097 if (leftmost_rtl) 00098 return false; 00099 if (rightmost_ltr) 00100 return true; 00101 // First line is ambiguous. Take statistics on the whole paragraph. 00102 if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do { 00103 StrongScriptDirection dir = it.WordDirection(); 00104 num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0; 00105 num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0; 00106 } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)); 00107 return num_ltr >= num_rtl; 00108 } 00109 00110 const int ResultIterator::kMinorRunStart = -1; 00111 const int ResultIterator::kMinorRunEnd = -2; 00112 const int ResultIterator::kComplexWord = -3; 00113 00114 void ResultIterator::CalculateBlobOrder( 00115 GenericVector<int> *blob_indices) const { 00116 bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_; 00117 blob_indices->clear(); 00118 if (Empty(RIL_WORD)) return; 00119 if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) { 00120 // Easy! just return the blobs in order; 00121 for (int i = 0; i < word_length_; i++) 00122 blob_indices->push_back(i); 00123 return; 00124 } 00125 00126 // The blobs are in left-to-right order, but the current reading context 00127 // is right-to-left. 
00128 const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT; 00129 const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT; 00130 const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER; 00131 const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR; 00132 const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR; 00133 const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR; 00134 const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL; 00135 00136 // Step 1: Scan for and mark European Number sequences 00137 // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]* 00138 GenericVector<int> letter_types; 00139 for (int i = 0; i < word_length_; i++) { 00140 letter_types.push_back(it_->word()->SymbolDirection(i)); 00141 } 00142 // Convert a single separtor sandwiched between two EN's into an EN. 00143 for (int i = 0; i + 2 < word_length_; i++) { 00144 if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM && 00145 (letter_types[i + 1] == U_EURO_NUM_SEP || 00146 letter_types[i + 1] == U_COMMON_NUM_SEP)) { 00147 letter_types[i + 1] = U_EURO_NUM; 00148 } 00149 } 00150 // Scan for sequences of European Number Terminators around ENs and convert 00151 // them to ENs. 00152 for (int i = 0; i < word_length_; i++) { 00153 if (letter_types[i] == U_EURO_NUM_TERM) { 00154 int j = i + 1; 00155 while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; } 00156 if (j < word_length_ && letter_types[j] == U_EURO_NUM) { 00157 // The sequence [i..j] should be converted to all European Numbers. 00158 for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM; 00159 } 00160 j = i - 1; 00161 while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; } 00162 if (j > -1 && letter_types[j] == U_EURO_NUM) { 00163 // The sequence [j..i] should be converted to all European Numbers. 00164 for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM; 00165 } 00166 } 00167 } 00168 // Step 2: Convert all remaining types to either L or R. 
00169 // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L. 00170 // All other are R. 00171 for (int i = 0; i < word_length_;) { 00172 int ti = letter_types[i]; 00173 if (ti == U_LTR || ti == U_EURO_NUM) { 00174 // Left to right sequence; scan to the end of it. 00175 int last_good = i; 00176 for (int j = i + 1; j < word_length_; j++) { 00177 int tj = letter_types[j]; 00178 if (tj == U_LTR || tj == U_EURO_NUM) { 00179 last_good = j; 00180 } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) { 00181 // do nothing. 00182 } else { 00183 break; 00184 } 00185 } 00186 // [i..last_good] is the L sequence 00187 for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR; 00188 i = last_good + 1; 00189 } else { 00190 letter_types[i] = U_RTL; 00191 i++; 00192 } 00193 } 00194 00195 // At this point, letter_types is entirely U_LTR or U_RTL. 00196 for (int i = word_length_ - 1; i >= 0;) { 00197 if (letter_types[i] == U_RTL) { 00198 blob_indices->push_back(i); 00199 i--; 00200 } else { 00201 // left to right sequence. scan to the beginning. 00202 int j = i - 1; 00203 for (; j >= 0 && letter_types[j] != U_RTL; j--) { } // pass 00204 // Now (j, i] is LTR 00205 for (int k = j + 1; k <= i; k++) blob_indices->push_back(k); 00206 i = j; 00207 } 00208 } 00209 ASSERT_HOST(blob_indices->size() == word_length_); 00210 } 00211 00212 static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) { 00213 for (int i = 0; i < dirs.size(); i++) { 00214 switch (dirs[i]) { 00215 case DIR_NEUTRAL: tprintf ("N "); break; 00216 case DIR_LEFT_TO_RIGHT: tprintf("L "); break; 00217 case DIR_RIGHT_TO_LEFT: tprintf("R "); break; 00218 case DIR_MIX: tprintf("Z "); break; 00219 default: tprintf("? 
"); break; 00220 } 00221 } 00222 tprintf("\n"); 00223 } 00224 00225 void ResultIterator::CalculateTextlineOrder( 00226 bool paragraph_is_ltr, 00227 const LTRResultIterator &resit, 00228 GenericVectorEqEq<int> *word_indices) const { 00229 GenericVector<StrongScriptDirection> directions; 00230 CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices); 00231 } 00232 00233 void ResultIterator::CalculateTextlineOrder( 00234 bool paragraph_is_ltr, 00235 const LTRResultIterator &resit, 00236 GenericVector<StrongScriptDirection> *dirs_arg, 00237 GenericVectorEqEq<int> *word_indices) const { 00238 GenericVector<StrongScriptDirection> dirs; 00239 GenericVector<StrongScriptDirection> *directions; 00240 directions = (dirs_arg != NULL) ? dirs_arg : &dirs; 00241 directions->truncate(0); 00242 00243 // A LTRResultIterator goes strictly left-to-right word order. 00244 LTRResultIterator ltr_it(resit); 00245 ltr_it.RestartRow(); 00246 if (ltr_it.Empty(RIL_WORD)) return; 00247 do { 00248 directions->push_back(ltr_it.WordDirection()); 00249 } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE)); 00250 00251 word_indices->truncate(0); 00252 CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices); 00253 } 00254 00255 void ResultIterator::CalculateTextlineOrder( 00256 bool paragraph_is_ltr, 00257 const GenericVector<StrongScriptDirection> &word_dirs, 00258 GenericVectorEqEq<int> *reading_order) { 00259 reading_order->truncate(0); 00260 if (word_dirs.size() == 0) return; 00261 00262 // Take all of the runs of minor direction words and insert them 00263 // in reverse order. 
00264 int minor_direction, major_direction, major_step, start, end; 00265 if (paragraph_is_ltr) { 00266 start = 0; 00267 end = word_dirs.size(); 00268 major_step = 1; 00269 major_direction = DIR_LEFT_TO_RIGHT; 00270 minor_direction = DIR_RIGHT_TO_LEFT; 00271 } else { 00272 start = word_dirs.size() - 1; 00273 end = -1; 00274 major_step = -1; 00275 major_direction = DIR_RIGHT_TO_LEFT; 00276 minor_direction = DIR_LEFT_TO_RIGHT; 00277 // Special rule: if there are neutral words at the right most side 00278 // of a line adjacent to a left-to-right word in the middle of the 00279 // line, we interpret the end of the line as a single LTR sequence. 00280 if (word_dirs[start] == DIR_NEUTRAL) { 00281 int neutral_end = start; 00282 while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) { 00283 neutral_end--; 00284 } 00285 if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) { 00286 // LTR followed by neutrals. 00287 // Scan for the beginning of the minor left-to-right run. 00288 int left = neutral_end; 00289 for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) { 00290 if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i; 00291 } 00292 reading_order->push_back(kMinorRunStart); 00293 for (int i = left; i < word_dirs.size(); i++) { 00294 reading_order->push_back(i); 00295 if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord); 00296 } 00297 reading_order->push_back(kMinorRunEnd); 00298 start = left - 1; 00299 } 00300 } 00301 } 00302 for (int i = start; i != end;) { 00303 if (word_dirs[i] == minor_direction) { 00304 int j = i; 00305 while (j != end && word_dirs[j] != major_direction) 00306 j += major_step; 00307 if (j == end) j -= major_step; 00308 while (j != i && word_dirs[j] != minor_direction) 00309 j -= major_step; 00310 // [j..i] is a minor direction run. 
00311 reading_order->push_back(kMinorRunStart); 00312 for (int k = j; k != i; k -= major_step) { 00313 reading_order->push_back(k); 00314 } 00315 reading_order->push_back(i); 00316 reading_order->push_back(kMinorRunEnd); 00317 i = j + major_step; 00318 } else { 00319 reading_order->push_back(i); 00320 if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord); 00321 i += major_step; 00322 } 00323 } 00324 } 00325 00326 int ResultIterator::LTRWordIndex() const { 00327 int this_word_index = 0; 00328 LTRResultIterator textline(*this); 00329 textline.RestartRow(); 00330 while (!textline.PositionedAtSameWord(it_)) { 00331 this_word_index++; 00332 textline.Next(RIL_WORD); 00333 } 00334 return this_word_index; 00335 } 00336 00337 void ResultIterator::MoveToLogicalStartOfWord() { 00338 if (word_length_ == 0) { 00339 BeginWord(0); 00340 return; 00341 } 00342 GenericVector<int> blob_order; 00343 CalculateBlobOrder(&blob_order); 00344 if (blob_order.size() == 0 || blob_order[0] == 0) return; 00345 BeginWord(blob_order[0]); 00346 } 00347 00348 bool ResultIterator::IsAtFinalSymbolOfWord() const { 00349 if (!it_->word()) return true; 00350 GenericVector<int> blob_order; 00351 CalculateBlobOrder(&blob_order); 00352 return blob_order.size() == 0 || blob_order.back() == blob_index_; 00353 } 00354 00355 bool ResultIterator::IsAtFirstSymbolOfWord() const { 00356 if (!it_->word()) return true; 00357 GenericVector<int> blob_order; 00358 CalculateBlobOrder(&blob_order); 00359 return blob_order.size() == 0 || blob_order[0] == blob_index_; 00360 } 00361 00362 void ResultIterator::AppendSuffixMarks(STRING *text) const { 00363 if (!it_->word()) return; 00364 bool reading_direction_is_ltr = 00365 current_paragraph_is_ltr_ ^ in_minor_direction_; 00366 // scan forward to see what meta-information the word ordering algorithm 00367 // left us. 
00368 // If this word is at the *end* of a minor run, insert the other 00369 // direction's mark; else if this was a complex word, insert the 00370 // current reading order's mark. 00371 GenericVectorEqEq<int> textline_order; 00372 CalculateTextlineOrder(current_paragraph_is_ltr_, 00373 *this, &textline_order); 00374 int this_word_index = LTRWordIndex(); 00375 int i = textline_order.get_index(this_word_index); 00376 if (i < 0) return; 00377 00378 int last_non_word_mark = 0; 00379 for (i++; i < textline_order.size() && textline_order[i] < 0; i++) { 00380 last_non_word_mark = textline_order[i]; 00381 } 00382 if (last_non_word_mark == kComplexWord) { 00383 *text += reading_direction_is_ltr ? kLRM : kRLM; 00384 } else if (last_non_word_mark == kMinorRunEnd) { 00385 if (current_paragraph_is_ltr_) { 00386 *text += kLRM; 00387 } else { 00388 *text += kRLM; 00389 } 00390 } 00391 } 00392 00393 void ResultIterator::MoveToLogicalStartOfTextline() { 00394 GenericVectorEqEq<int> word_indices; 00395 RestartRow(); 00396 CalculateTextlineOrder(current_paragraph_is_ltr_, 00397 dynamic_cast<const LTRResultIterator&>(*this), 00398 &word_indices); 00399 int i = 0; 00400 for (; i < word_indices.size() && word_indices[i] < 0; i++) { 00401 if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true; 00402 else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false; 00403 } 00404 if (in_minor_direction_) at_beginning_of_minor_run_ = true; 00405 if (i >= word_indices.size()) return; 00406 int first_word_index = word_indices[i]; 00407 for (int j = 0; j < first_word_index; j++) { 00408 PageIterator::Next(RIL_WORD); 00409 } 00410 MoveToLogicalStartOfWord(); 00411 } 00412 00413 void ResultIterator::Begin() { 00414 LTRResultIterator::Begin(); 00415 current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); 00416 in_minor_direction_ = false; 00417 at_beginning_of_minor_run_ = false; 00418 MoveToLogicalStartOfTextline(); 00419 } 00420 00421 bool ResultIterator::Next(PageIteratorLevel 
level) { 00422 if (it_->block() == NULL) return false; // already at end! 00423 switch (level) { 00424 case RIL_BLOCK: // explicit fall-through 00425 case RIL_PARA: // explicit fall-through 00426 case RIL_TEXTLINE: 00427 if (!PageIterator::Next(level)) return false; 00428 if (IsWithinFirstTextlineOfParagraph()) { 00429 // if we've advanced to a new paragraph, 00430 // recalculate current_paragraph_is_ltr_ 00431 current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); 00432 } 00433 in_minor_direction_ = false; 00434 MoveToLogicalStartOfTextline(); 00435 return it_->block() != NULL; 00436 case RIL_SYMBOL: 00437 { 00438 GenericVector<int> blob_order; 00439 CalculateBlobOrder(&blob_order); 00440 int next_blob = 0; 00441 while (next_blob < blob_order.size() && 00442 blob_index_ != blob_order[next_blob]) 00443 next_blob++; 00444 next_blob++; 00445 if (next_blob < blob_order.size()) { 00446 // we're in the same word; simply advance one blob. 00447 BeginWord(blob_order[next_blob]); 00448 at_beginning_of_minor_run_ = false; 00449 return true; 00450 } 00451 level = RIL_WORD; // we've fallen through to the next word. 00452 } 00453 case RIL_WORD: // explicit fall-through. 
00454 { 00455 if (it_->word() == NULL) return Next(RIL_BLOCK); 00456 GenericVectorEqEq<int> word_indices; 00457 int this_word_index = LTRWordIndex(); 00458 CalculateTextlineOrder(current_paragraph_is_ltr_, 00459 *this, 00460 &word_indices); 00461 int final_real_index = word_indices.size() - 1; 00462 while (final_real_index > 0 && word_indices[final_real_index] < 0) 00463 final_real_index--; 00464 for (int i = 0; i < final_real_index; i++) { 00465 if (word_indices[i] == this_word_index) { 00466 int j = i + 1; 00467 for (; j < final_real_index && word_indices[j] < 0; j++) { 00468 if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true; 00469 if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false; 00470 } 00471 at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart); 00472 // awesome, we move to word_indices[j] 00473 if (BidiDebug(3)) { 00474 tprintf("Next(RIL_WORD): %d -> %d\n", 00475 this_word_index, word_indices[j]); 00476 } 00477 PageIterator::RestartRow(); 00478 for (int k = 0; k < word_indices[j]; k++) { 00479 PageIterator::Next(RIL_WORD); 00480 } 00481 MoveToLogicalStartOfWord(); 00482 return true; 00483 } 00484 } 00485 if (BidiDebug(3)) { 00486 tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index); 00487 } 00488 // we're going off the end of the text line. 00489 return Next(RIL_TEXTLINE); 00490 } 00491 } 00492 ASSERT_HOST(false); // shouldn't happen. 00493 return false; 00494 } 00495 00496 bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const { 00497 if (it_->block() == NULL) return false; // Already at the end! 00498 if (it_->word() == NULL) return true; // In an image block. 00499 if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol. 00500 00501 bool at_word_start = IsAtFirstSymbolOfWord(); 00502 if (level == RIL_WORD) return at_word_start; 00503 00504 ResultIterator line_start(*this); 00505 // move to the first word in the line... 
00506 line_start.MoveToLogicalStartOfTextline(); 00507 00508 bool at_textline_start = at_word_start && *line_start.it_ == *it_; 00509 if (level == RIL_TEXTLINE) return at_textline_start; 00510 00511 // now we move to the left-most word... 00512 line_start.RestartRow(); 00513 bool at_block_start = at_textline_start && 00514 line_start.it_->block() != line_start.it_->prev_block(); 00515 if (level == RIL_BLOCK) return at_block_start; 00516 00517 bool at_para_start = at_block_start || 00518 (at_textline_start && 00519 line_start.it_->row()->row->para() != 00520 line_start.it_->prev_row()->row->para()); 00521 if (level == RIL_PARA) return at_para_start; 00522 00523 ASSERT_HOST(false); // shouldn't happen. 00524 return false; 00525 } 00526 00532 bool ResultIterator::IsAtFinalElement(PageIteratorLevel level, 00533 PageIteratorLevel element) const { 00534 if (Empty(element)) return true; // Already at the end! 00535 // The result is true if we step forward by element and find we are 00536 // at the the end of the page or at beginning of *all* levels in: 00537 // [level, element). 00538 // When there is more than one level difference between element and level, 00539 // we could for instance move forward one symbol and still be at the first 00540 // word on a line, so we also have to be at the first symbol in a word. 00541 ResultIterator next(*this); 00542 next.Next(element); 00543 if (next.Empty(element)) return true; // Reached the end of the page. 00544 while (element > level) { 00545 element = static_cast<PageIteratorLevel>(element - 1); 00546 if (!next.IsAtBeginningOf(element)) 00547 return false; 00548 } 00549 return true; 00550 } 00551 00556 char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const { 00557 if (it_->word() == NULL) return NULL; // Already at the end! 
00558 STRING text; 00559 switch (level) { 00560 case RIL_BLOCK: 00561 { 00562 ResultIterator pp(*this); 00563 do { 00564 pp.AppendUTF8ParagraphText(&text); 00565 } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block()); 00566 } 00567 break; 00568 case RIL_PARA: 00569 AppendUTF8ParagraphText(&text); 00570 break; 00571 case RIL_TEXTLINE: 00572 { 00573 ResultIterator it(*this); 00574 it.MoveToLogicalStartOfTextline(); 00575 it.IterateAndAppendUTF8TextlineText(&text); 00576 } 00577 break; 00578 case RIL_WORD: 00579 AppendUTF8WordText(&text); 00580 break; 00581 case RIL_SYMBOL: 00582 { 00583 bool reading_direction_is_ltr = 00584 current_paragraph_is_ltr_ ^ in_minor_direction_; 00585 if (at_beginning_of_minor_run_) { 00586 text += reading_direction_is_ltr ? kLRM : kRLM; 00587 } 00588 text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr); 00589 if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text); 00590 } 00591 break; 00592 } 00593 int length = text.length() + 1; 00594 char* result = new char[length]; 00595 strncpy(result, text.string(), length); 00596 return result; 00597 } 00598 00599 void ResultIterator::AppendUTF8WordText(STRING *text) const { 00600 if (!it_->word()) return; 00601 ASSERT_HOST(it_->word()->best_choice != NULL); 00602 bool reading_direction_is_ltr = 00603 current_paragraph_is_ltr_ ^ in_minor_direction_; 00604 if (at_beginning_of_minor_run_) { 00605 *text += reading_direction_is_ltr ? 
kLRM : kRLM; 00606 } 00607 00608 GenericVector<int> blob_order; 00609 CalculateBlobOrder(&blob_order); 00610 for (int i = 0; i < blob_order.size(); i++) { 00611 *text += it_->word()->BestUTF8(blob_order[i], !reading_direction_is_ltr); 00612 } 00613 AppendSuffixMarks(text); 00614 } 00615 00616 void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) { 00617 if (Empty(RIL_WORD)) { 00618 Next(RIL_WORD); 00619 return; 00620 } 00621 if (BidiDebug(1)) { 00622 GenericVectorEqEq<int> textline_order; 00623 GenericVector<StrongScriptDirection> dirs; 00624 CalculateTextlineOrder(current_paragraph_is_ltr_, 00625 *this, &dirs, &textline_order); 00626 tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(), 00627 current_paragraph_is_ltr_ ? "ltr" : "rtl"); 00628 PrintScriptDirs(dirs); 00629 tprintf("Logical textline order [%p/P=%s]: ", it_->row(), 00630 current_paragraph_is_ltr_ ? "ltr" : "rtl"); 00631 for (int i = 0; i < textline_order.size(); i++) { 00632 tprintf("%d ", textline_order[i]); 00633 } 00634 tprintf("\n"); 00635 } 00636 00637 int words_appended = 0; 00638 do { 00639 int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() 00640 : (words_appended > 0); 00641 for (int i = 0; i < numSpaces; ++i) { 00642 *text += " "; 00643 } 00644 AppendUTF8WordText(text); 00645 words_appended++; 00646 } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE)); 00647 if (BidiDebug(1)) { 00648 tprintf("%d words printed\n", words_appended); 00649 } 00650 *text += line_separator_; 00651 // If we just finished a paragraph, add an extra newline. 
00652 if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA)) 00653 *text += paragraph_separator_; 00654 } 00655 00656 void ResultIterator::AppendUTF8ParagraphText(STRING *text) const { 00657 ResultIterator it(*this); 00658 it.RestartParagraph(); 00659 it.MoveToLogicalStartOfTextline(); 00660 if (it.Empty(RIL_WORD)) return; 00661 do { 00662 it.IterateAndAppendUTF8TextlineText(text); 00663 } while (it.it_->block() != NULL && !it.IsAtBeginningOf(RIL_PARA)); 00664 } 00665 00666 bool ResultIterator::BidiDebug(int min_level) const { 00667 int debug_level = 1; 00668 IntParam *p = ParamUtils::FindParam<IntParam>( 00669 "bidi_debug", GlobalParams()->int_params, 00670 tesseract_->params()->int_params); 00671 if (p != NULL) debug_level = (inT32)(*p); 00672 return debug_level >= min_level; 00673 } 00674 00675 } // namespace tesseract.