tesseract 3.04.01

ccmain/resultiterator.cpp

Go to the documentation of this file.
00001 
00002 // File:        resultiterator.cpp
00003 // Description: Iterator for tesseract results that is capable of
00004 //              iterating in proper reading order over Bi Directional
00005 //              (e.g. mixed Hebrew and English) text.
00006 // Author:      David Eger
00007 // Created:     Fri May 27 13:58:06 PST 2011
00008 //
00009 // (C) Copyright 2011, Google Inc.
00010 // Licensed under the Apache License, Version 2.0 (the "License");
00011 // you may not use this file except in compliance with the License.
00012 // You may obtain a copy of the License at
00013 // http://www.apache.org/licenses/LICENSE-2.0
00014 // Unless required by applicable law or agreed to in writing, software
00015 // distributed under the License is distributed on an "AS IS" BASIS,
00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017 // See the License for the specific language governing permissions and
00018 // limitations under the License.
00019 //
00021 
00022 #include "resultiterator.h"
00023 
00024 #include "allheaders.h"
00025 #include "pageres.h"
00026 #include "strngs.h"
00027 #include "tesseractclass.h"
00028 #include "unicharset.h"
00029 #include "unicodes.h"
00030 
00031 namespace tesseract {
00032 
00033 ResultIterator::ResultIterator(const LTRResultIterator &resit)
00034     : LTRResultIterator(resit) {
00035   in_minor_direction_ = false;
00036   at_beginning_of_minor_run_ = false;
00037   preserve_interword_spaces_ = false;
00038 
00039   BoolParam *p = ParamUtils::FindParam<BoolParam>(
00040       "preserve_interword_spaces", GlobalParams()->bool_params,
00041       tesseract_->params()->bool_params);
00042   if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
00043 
00044   current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
00045   MoveToLogicalStartOfTextline();
00046 }
00047 
00048 ResultIterator *ResultIterator::StartOfParagraph(
00049     const LTRResultIterator &resit) {
00050   return new ResultIterator(resit);
00051 }
00052 
00053 bool ResultIterator::ParagraphIsLtr() const {
00054   return current_paragraph_is_ltr_;
00055 }
00056 
00057 bool ResultIterator::CurrentParagraphIsLtr() const {
00058   if (!it_->word())
00059     return true;  // doesn't matter.
00060   LTRResultIterator it(*this);
00061   it.RestartParagraph();
00062   // Try to figure out the ltr-ness of the paragraph.  The rules below
00063   // make more sense in the context of a difficult paragraph example.
00064   // Here we denote {ltr characters, RTL CHARACTERS}:
00065   //
00066   //   "don't go in there!" DAIS EH
00067   //   EHT OTNI DEPMUJ FELSMIH NEHT DNA
00068   //                  .GNIDLIUB GNINRUB
00069   //
00070   // On the first line, the left-most word is LTR and the rightmost word
00071   // is RTL.  Thus, we are better off taking the majority direction for
00072   // the whole paragraph contents.  So instead of "the leftmost word is LTR"
00073   // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
00074   // would not do:  Typically an RTL paragraph would *not* start with an LTR
00075   // word.  So our heuristics are as follows:
00076   //
00077   // (1) If the first text line has an RTL word in the left-most position
00078   //     it is RTL.
00079   // (2) If the first text line has an LTR word in the right-most position
00080   //     it is LTR.
00081   // (3) If neither of the above is true, take the majority count for the
00082   //     paragraph -- if there are more rtl words, it is RTL.  If there
00083   //     are more LTR words, it's LTR.
00084   bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
00085   bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
00086   int num_ltr, num_rtl;
00087   num_rtl = leftmost_rtl ? 1 : 0;
00088   num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
00089   for (it.Next(RIL_WORD);
00090        !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
00091        it.Next(RIL_WORD)) {
00092     StrongScriptDirection dir = it.WordDirection();
00093     rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
00094     num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
00095     num_ltr += rightmost_ltr ? 1 : 0;
00096   }
00097   if (leftmost_rtl)
00098     return false;
00099   if (rightmost_ltr)
00100     return true;
00101   // First line is ambiguous.  Take statistics on the whole paragraph.
00102   if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
00103     StrongScriptDirection dir = it.WordDirection();
00104     num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
00105     num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
00106   } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
00107   return num_ltr >= num_rtl;
00108 }
00109 
00110 const int ResultIterator::kMinorRunStart = -1;
00111 const int ResultIterator::kMinorRunEnd = -2;
00112 const int ResultIterator::kComplexWord = -3;
00113 
00114 void ResultIterator::CalculateBlobOrder(
00115     GenericVector<int> *blob_indices) const {
00116   bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
00117   blob_indices->clear();
00118   if (Empty(RIL_WORD)) return;
00119   if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
00120     // Easy! just return the blobs in order;
00121     for (int i = 0; i < word_length_; i++)
00122       blob_indices->push_back(i);
00123     return;
00124   }
00125 
00126   // The blobs are in left-to-right order, but the current reading context
00127   // is right-to-left.
00128   const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
00129   const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
00130   const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
00131   const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
00132   const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
00133   const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
00134   const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
00135 
00136   // Step 1: Scan for and mark European Number sequences
00137   //   [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
00138   GenericVector<int> letter_types;
00139   for (int i = 0; i < word_length_; i++) {
00140     letter_types.push_back(it_->word()->SymbolDirection(i));
00141   }
00142   // Convert a single separtor sandwiched between two EN's into an EN.
00143   for (int i = 0; i + 2 < word_length_; i++) {
00144     if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
00145         (letter_types[i + 1] == U_EURO_NUM_SEP ||
00146          letter_types[i + 1] == U_COMMON_NUM_SEP)) {
00147       letter_types[i + 1] = U_EURO_NUM;
00148     }
00149   }
00150   // Scan for sequences of European Number Terminators around ENs and convert
00151   // them to ENs.
00152   for (int i = 0; i < word_length_; i++) {
00153     if (letter_types[i] == U_EURO_NUM_TERM) {
00154       int j = i + 1;
00155       while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
00156       if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
00157         // The sequence [i..j] should be converted to all European Numbers.
00158         for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
00159       }
00160       j = i - 1;
00161       while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
00162       if (j > -1 && letter_types[j] == U_EURO_NUM) {
00163         // The sequence [j..i] should be converted to all European Numbers.
00164         for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
00165       }
00166     }
00167   }
00168   // Step 2: Convert all remaining types to either L or R.
00169   // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
00170   // All other are R.
00171   for (int i = 0; i < word_length_;) {
00172     int ti = letter_types[i];
00173     if (ti == U_LTR || ti == U_EURO_NUM) {
00174       // Left to right sequence; scan to the end of it.
00175       int last_good = i;
00176       for (int j = i + 1; j < word_length_; j++) {
00177         int tj = letter_types[j];
00178         if (tj == U_LTR || tj == U_EURO_NUM) {
00179           last_good = j;
00180         } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
00181           // do nothing.
00182         } else {
00183           break;
00184         }
00185       }
00186       // [i..last_good] is the L sequence
00187       for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
00188       i = last_good + 1;
00189     } else {
00190       letter_types[i] = U_RTL;
00191       i++;
00192     }
00193   }
00194 
00195   // At this point, letter_types is entirely U_LTR or U_RTL.
00196   for (int i = word_length_ - 1; i >= 0;) {
00197     if (letter_types[i] == U_RTL) {
00198       blob_indices->push_back(i);
00199       i--;
00200     } else {
00201       // left to right sequence.  scan to the beginning.
00202       int j = i - 1;
00203       for (; j >= 0 && letter_types[j] != U_RTL; j--) { }  // pass
00204       // Now (j, i] is LTR
00205       for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
00206       i = j;
00207     }
00208   }
00209   ASSERT_HOST(blob_indices->size() == word_length_);
00210 }
00211 
00212 static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
00213   for (int i = 0; i < dirs.size(); i++) {
00214     switch (dirs[i]) {
00215       case DIR_NEUTRAL: tprintf ("N "); break;
00216       case DIR_LEFT_TO_RIGHT: tprintf("L "); break;
00217       case DIR_RIGHT_TO_LEFT: tprintf("R "); break;
00218       case DIR_MIX: tprintf("Z "); break;
00219       default: tprintf("? "); break;
00220     }
00221   }
00222   tprintf("\n");
00223 }
00224 
00225 void ResultIterator::CalculateTextlineOrder(
00226     bool paragraph_is_ltr,
00227     const LTRResultIterator &resit,
00228     GenericVectorEqEq<int> *word_indices) const {
00229   GenericVector<StrongScriptDirection> directions;
00230   CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
00231 }
00232 
00233 void ResultIterator::CalculateTextlineOrder(
00234     bool paragraph_is_ltr,
00235     const LTRResultIterator &resit,
00236     GenericVector<StrongScriptDirection> *dirs_arg,
00237     GenericVectorEqEq<int> *word_indices) const {
00238   GenericVector<StrongScriptDirection> dirs;
00239   GenericVector<StrongScriptDirection> *directions;
00240   directions = (dirs_arg != NULL) ? dirs_arg : &dirs;
00241   directions->truncate(0);
00242 
00243   // A LTRResultIterator goes strictly left-to-right word order.
00244   LTRResultIterator ltr_it(resit);
00245   ltr_it.RestartRow();
00246   if (ltr_it.Empty(RIL_WORD)) return;
00247   do {
00248     directions->push_back(ltr_it.WordDirection());
00249   } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
00250 
00251   word_indices->truncate(0);
00252   CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
00253 }
00254 
00255 void ResultIterator::CalculateTextlineOrder(
00256     bool paragraph_is_ltr,
00257     const GenericVector<StrongScriptDirection> &word_dirs,
00258     GenericVectorEqEq<int> *reading_order) {
00259   reading_order->truncate(0);
00260   if (word_dirs.size() == 0) return;
00261 
00262   // Take all of the runs of minor direction words and insert them
00263   // in reverse order.
00264   int minor_direction, major_direction, major_step, start, end;
00265   if (paragraph_is_ltr) {
00266     start = 0;
00267     end = word_dirs.size();
00268     major_step = 1;
00269     major_direction = DIR_LEFT_TO_RIGHT;
00270     minor_direction = DIR_RIGHT_TO_LEFT;
00271   } else {
00272     start = word_dirs.size() - 1;
00273     end = -1;
00274     major_step = -1;
00275     major_direction = DIR_RIGHT_TO_LEFT;
00276     minor_direction = DIR_LEFT_TO_RIGHT;
00277     // Special rule: if there are neutral words at the right most side
00278     //   of a line adjacent to a left-to-right word in the middle of the
00279     //   line, we interpret the end of the line as a single LTR sequence.
00280     if (word_dirs[start] == DIR_NEUTRAL) {
00281       int neutral_end = start;
00282       while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
00283         neutral_end--;
00284       }
00285       if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
00286         // LTR followed by neutrals.
00287         // Scan for the beginning of the minor left-to-right run.
00288         int left = neutral_end;
00289         for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
00290           if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
00291         }
00292         reading_order->push_back(kMinorRunStart);
00293         for (int i = left; i < word_dirs.size(); i++) {
00294           reading_order->push_back(i);
00295           if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
00296         }
00297         reading_order->push_back(kMinorRunEnd);
00298         start = left - 1;
00299       }
00300     }
00301   }
00302   for (int i = start; i != end;) {
00303     if (word_dirs[i] == minor_direction) {
00304       int j = i;
00305       while (j != end && word_dirs[j] != major_direction)
00306         j += major_step;
00307       if (j == end) j -= major_step;
00308       while (j != i && word_dirs[j] != minor_direction)
00309         j -= major_step;
00310       //  [j..i] is a minor direction run.
00311       reading_order->push_back(kMinorRunStart);
00312       for (int k = j; k != i; k -= major_step) {
00313         reading_order->push_back(k);
00314       }
00315       reading_order->push_back(i);
00316       reading_order->push_back(kMinorRunEnd);
00317       i = j + major_step;
00318     } else {
00319       reading_order->push_back(i);
00320       if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
00321       i += major_step;
00322     }
00323   }
00324 }
00325 
00326 int ResultIterator::LTRWordIndex() const {
00327   int this_word_index = 0;
00328   LTRResultIterator textline(*this);
00329   textline.RestartRow();
00330   while (!textline.PositionedAtSameWord(it_)) {
00331     this_word_index++;
00332     textline.Next(RIL_WORD);
00333   }
00334   return this_word_index;
00335 }
00336 
00337 void ResultIterator::MoveToLogicalStartOfWord() {
00338   if (word_length_ == 0) {
00339     BeginWord(0);
00340     return;
00341   }
00342   GenericVector<int> blob_order;
00343   CalculateBlobOrder(&blob_order);
00344   if (blob_order.size() == 0 || blob_order[0] == 0) return;
00345   BeginWord(blob_order[0]);
00346 }
00347 
00348 bool ResultIterator::IsAtFinalSymbolOfWord() const {
00349   if (!it_->word()) return true;
00350   GenericVector<int> blob_order;
00351   CalculateBlobOrder(&blob_order);
00352   return blob_order.size() == 0 || blob_order.back() == blob_index_;
00353 }
00354 
00355 bool ResultIterator::IsAtFirstSymbolOfWord() const {
00356   if (!it_->word()) return true;
00357   GenericVector<int> blob_order;
00358   CalculateBlobOrder(&blob_order);
00359   return blob_order.size() == 0 || blob_order[0] == blob_index_;
00360 }
00361 
00362 void ResultIterator::AppendSuffixMarks(STRING *text) const {
00363   if (!it_->word()) return;
00364   bool reading_direction_is_ltr =
00365       current_paragraph_is_ltr_ ^ in_minor_direction_;
00366   // scan forward to see what meta-information the word ordering algorithm
00367   // left us.
00368   // If this word is at the  *end* of a minor run, insert the other
00369   // direction's mark;  else if this was a complex word, insert the
00370   // current reading order's mark.
00371   GenericVectorEqEq<int> textline_order;
00372   CalculateTextlineOrder(current_paragraph_is_ltr_,
00373                          *this, &textline_order);
00374   int this_word_index = LTRWordIndex();
00375   int i = textline_order.get_index(this_word_index);
00376   if (i < 0) return;
00377 
00378   int last_non_word_mark = 0;
00379   for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
00380     last_non_word_mark = textline_order[i];
00381   }
00382   if (last_non_word_mark == kComplexWord) {
00383     *text += reading_direction_is_ltr ? kLRM : kRLM;
00384   } else if (last_non_word_mark == kMinorRunEnd) {
00385     if (current_paragraph_is_ltr_) {
00386       *text += kLRM;
00387     } else {
00388       *text += kRLM;
00389     }
00390   }
00391 }
00392 
00393 void ResultIterator::MoveToLogicalStartOfTextline() {
00394   GenericVectorEqEq<int> word_indices;
00395   RestartRow();
00396   CalculateTextlineOrder(current_paragraph_is_ltr_,
00397                          dynamic_cast<const LTRResultIterator&>(*this),
00398                          &word_indices);
00399   int i = 0;
00400   for (; i < word_indices.size() && word_indices[i] < 0; i++) {
00401     if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true;
00402     else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false;
00403   }
00404   if (in_minor_direction_) at_beginning_of_minor_run_ = true;
00405   if (i >= word_indices.size()) return;
00406   int first_word_index = word_indices[i];
00407   for (int j = 0; j < first_word_index; j++) {
00408     PageIterator::Next(RIL_WORD);
00409   }
00410   MoveToLogicalStartOfWord();
00411 }
00412 
00413 void ResultIterator::Begin() {
00414   LTRResultIterator::Begin();
00415   current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
00416   in_minor_direction_ = false;
00417   at_beginning_of_minor_run_ = false;
00418   MoveToLogicalStartOfTextline();
00419 }
00420 
00421 bool ResultIterator::Next(PageIteratorLevel level) {
00422   if (it_->block() == NULL) return false; // already at end!
00423   switch (level) {
00424     case RIL_BLOCK:  // explicit fall-through
00425     case RIL_PARA:   // explicit fall-through
00426     case RIL_TEXTLINE:
00427       if (!PageIterator::Next(level)) return false;
00428       if (IsWithinFirstTextlineOfParagraph()) {
00429         // if we've advanced to a new paragraph,
00430         // recalculate current_paragraph_is_ltr_
00431         current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
00432       }
00433       in_minor_direction_ = false;
00434       MoveToLogicalStartOfTextline();
00435       return it_->block() != NULL;
00436     case RIL_SYMBOL:
00437     {
00438       GenericVector<int> blob_order;
00439       CalculateBlobOrder(&blob_order);
00440       int next_blob = 0;
00441       while (next_blob < blob_order.size() &&
00442              blob_index_ != blob_order[next_blob])
00443         next_blob++;
00444       next_blob++;
00445       if (next_blob < blob_order.size()) {
00446         // we're in the same word; simply advance one blob.
00447         BeginWord(blob_order[next_blob]);
00448         at_beginning_of_minor_run_ = false;
00449         return true;
00450       }
00451       level = RIL_WORD;  // we've fallen through to the next word.
00452     }
00453     case RIL_WORD:  // explicit fall-through.
00454     {
00455       if (it_->word() == NULL) return Next(RIL_BLOCK);
00456       GenericVectorEqEq<int> word_indices;
00457       int this_word_index = LTRWordIndex();
00458       CalculateTextlineOrder(current_paragraph_is_ltr_,
00459                              *this,
00460                              &word_indices);
00461       int final_real_index = word_indices.size() - 1;
00462       while (final_real_index > 0 && word_indices[final_real_index] < 0)
00463         final_real_index--;
00464       for (int i = 0; i < final_real_index; i++) {
00465         if (word_indices[i] == this_word_index) {
00466           int j = i + 1;
00467           for (; j < final_real_index && word_indices[j] < 0; j++) {
00468             if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
00469             if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
00470           }
00471           at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
00472           // awesome, we move to word_indices[j]
00473           if (BidiDebug(3)) {
00474             tprintf("Next(RIL_WORD): %d -> %d\n",
00475                     this_word_index, word_indices[j]);
00476           }
00477           PageIterator::RestartRow();
00478           for (int k = 0; k < word_indices[j]; k++) {
00479             PageIterator::Next(RIL_WORD);
00480           }
00481           MoveToLogicalStartOfWord();
00482           return true;
00483         }
00484       }
00485       if (BidiDebug(3)) {
00486         tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
00487       }
00488       // we're going off the end of the text line.
00489       return Next(RIL_TEXTLINE);
00490     }
00491   }
00492   ASSERT_HOST(false);  // shouldn't happen.
00493   return false;
00494 }
00495 
00496 bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
00497   if (it_->block() == NULL) return false;  // Already at the end!
00498   if (it_->word() == NULL) return true;  // In an image block.
00499   if (level == RIL_SYMBOL) return true;  // Always at beginning of a symbol.
00500 
00501   bool at_word_start = IsAtFirstSymbolOfWord();
00502   if (level == RIL_WORD) return at_word_start;
00503 
00504   ResultIterator line_start(*this);
00505   // move to the first word in the line...
00506   line_start.MoveToLogicalStartOfTextline();
00507 
00508   bool at_textline_start = at_word_start && *line_start.it_ == *it_;
00509   if (level == RIL_TEXTLINE) return at_textline_start;
00510 
00511   // now we move to the left-most word...
00512   line_start.RestartRow();
00513   bool at_block_start = at_textline_start &&
00514       line_start.it_->block() != line_start.it_->prev_block();
00515   if (level == RIL_BLOCK) return at_block_start;
00516 
00517   bool at_para_start = at_block_start ||
00518       (at_textline_start &&
00519        line_start.it_->row()->row->para() !=
00520            line_start.it_->prev_row()->row->para());
00521   if (level == RIL_PARA) return at_para_start;
00522 
00523   ASSERT_HOST(false);  // shouldn't happen.
00524   return false;
00525 }
00526 
00532 bool ResultIterator::IsAtFinalElement(PageIteratorLevel level,
00533                                       PageIteratorLevel element) const {
00534   if (Empty(element)) return true;  // Already at the end!
00535   // The result is true if we step forward by element and find we are
00536   // at the the end of the page or at beginning of *all* levels in:
00537   // [level, element).
00538   // When there is more than one level difference between element and level,
00539   // we could for instance move forward one symbol and still be at the first
00540   // word on a line, so we also have to be at the first symbol in a word.
00541   ResultIterator next(*this);
00542   next.Next(element);
00543   if (next.Empty(element)) return true;  // Reached the end of the page.
00544   while (element > level) {
00545     element = static_cast<PageIteratorLevel>(element - 1);
00546     if (!next.IsAtBeginningOf(element))
00547       return false;
00548   }
00549   return true;
00550 }
00551 
00556 char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
00557   if (it_->word() == NULL) return NULL;  // Already at the end!
00558   STRING text;
00559   switch (level) {
00560     case RIL_BLOCK:
00561       {
00562         ResultIterator pp(*this);
00563         do {
00564           pp.AppendUTF8ParagraphText(&text);
00565         } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
00566       }
00567       break;
00568     case RIL_PARA:
00569       AppendUTF8ParagraphText(&text);
00570       break;
00571     case RIL_TEXTLINE:
00572       {
00573         ResultIterator it(*this);
00574         it.MoveToLogicalStartOfTextline();
00575         it.IterateAndAppendUTF8TextlineText(&text);
00576       }
00577       break;
00578     case RIL_WORD:
00579       AppendUTF8WordText(&text);
00580       break;
00581     case RIL_SYMBOL:
00582       {
00583         bool reading_direction_is_ltr =
00584           current_paragraph_is_ltr_ ^ in_minor_direction_;
00585         if (at_beginning_of_minor_run_) {
00586           text += reading_direction_is_ltr ? kLRM : kRLM;
00587         }
00588         text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
00589         if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
00590       }
00591       break;
00592   }
00593   int length = text.length() + 1;
00594   char* result = new char[length];
00595   strncpy(result, text.string(), length);
00596   return result;
00597 }
00598 
00599 void ResultIterator::AppendUTF8WordText(STRING *text) const {
00600   if (!it_->word()) return;
00601   ASSERT_HOST(it_->word()->best_choice != NULL);
00602   bool reading_direction_is_ltr =
00603       current_paragraph_is_ltr_ ^ in_minor_direction_;
00604   if (at_beginning_of_minor_run_) {
00605     *text += reading_direction_is_ltr ? kLRM : kRLM;
00606   }
00607 
00608   GenericVector<int> blob_order;
00609   CalculateBlobOrder(&blob_order);
00610   for (int i = 0; i < blob_order.size(); i++) {
00611     *text += it_->word()->BestUTF8(blob_order[i], !reading_direction_is_ltr);
00612   }
00613   AppendSuffixMarks(text);
00614 }
00615 
00616 void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
00617   if (Empty(RIL_WORD)) {
00618     Next(RIL_WORD);
00619     return;
00620   }
00621   if (BidiDebug(1)) {
00622     GenericVectorEqEq<int> textline_order;
00623     GenericVector<StrongScriptDirection> dirs;
00624     CalculateTextlineOrder(current_paragraph_is_ltr_,
00625                            *this, &dirs, &textline_order);
00626     tprintf("Strong Script dirs     [%p/P=%s]: ", it_->row(),
00627             current_paragraph_is_ltr_ ? "ltr" : "rtl");
00628     PrintScriptDirs(dirs);
00629     tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
00630             current_paragraph_is_ltr_ ? "ltr" : "rtl");
00631     for (int i = 0; i < textline_order.size(); i++) {
00632       tprintf("%d ", textline_order[i]);
00633     }
00634     tprintf("\n");
00635   }
00636 
00637   int words_appended = 0;
00638   do {
00639     int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space()
00640                                                : (words_appended > 0);
00641     for (int i = 0; i < numSpaces; ++i) {
00642       *text += " ";
00643     }
00644     AppendUTF8WordText(text);
00645     words_appended++;
00646   } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
00647   if (BidiDebug(1)) {
00648     tprintf("%d words printed\n", words_appended);
00649   }
00650   *text += line_separator_;
00651   // If we just finished a paragraph, add an extra newline.
00652   if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))
00653     *text += paragraph_separator_;
00654 }
00655 
00656 void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
00657   ResultIterator it(*this);
00658   it.RestartParagraph();
00659   it.MoveToLogicalStartOfTextline();
00660   if (it.Empty(RIL_WORD)) return;
00661   do {
00662     it.IterateAndAppendUTF8TextlineText(text);
00663   } while (it.it_->block() != NULL && !it.IsAtBeginningOf(RIL_PARA));
00664 }
00665 
00666 bool ResultIterator::BidiDebug(int min_level) const {
00667   int debug_level = 1;
00668   IntParam *p = ParamUtils::FindParam<IntParam>(
00669       "bidi_debug", GlobalParams()->int_params,
00670       tesseract_->params()->int_params);
00671   if (p != NULL) debug_level = (inT32)(*p);
00672   return debug_level >= min_level;
00673 }
00674 
00675 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines