tesseract 3.04.01

textord/colpartitionset.cpp

Go to the documentation of this file.
00001 
00002 // File:        colpartitionset.cpp
00003 // Description: Class to hold a list of ColPartitions of the page that
00004 //              correspond roughly to columns.
00005 // Author:      Ray Smith
00006 // Created:     Thu Aug 14 10:54:01 PDT 2008
00007 //
00008 // (C) Copyright 2008, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifdef HAVE_CONFIG_H
00022 #include "config_auto.h"
00023 #endif
00024 
00025 #include "colpartitionset.h"
00026 #include "ndminx.h"
00027 #include "workingpartset.h"
00028 #include "tablefind.h"
00029 
00030 namespace tesseract {
00031 
00032 // Minimum width of a column to be interesting as a multiple of resolution.
00033 const double kMinColumnWidth = 2.0 / 3;
00034 
00035 ELISTIZE(ColPartitionSet)
00036 
00037 ColPartitionSet::ColPartitionSet(ColPartition_LIST* partitions) {
00038   ColPartition_IT it(&parts_);
00039   it.add_list_after(partitions);
00040   ComputeCoverage();
00041 }
00042 
00043 ColPartitionSet::ColPartitionSet(ColPartition* part) {
00044   ColPartition_IT it(&parts_);
00045   it.add_after_then_move(part);
00046   ComputeCoverage();
00047 }
00048 
00049 ColPartitionSet::~ColPartitionSet() {
00050 }
00051 
00052 // Returns the number of columns of good width.
00053 int ColPartitionSet::GoodColumnCount() const {
00054   int num_good_cols = 0;
00055   // This is a read-only iteration of the list.
00056   ColPartition_IT it(const_cast<ColPartition_LIST*>(&parts_));
00057   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00058     if (it.data()->good_width()) ++num_good_cols;
00059   }
00060   return num_good_cols;
00061 }
00062 
00063 // Return an element of the parts_ list from its index.
00064 ColPartition* ColPartitionSet::GetColumnByIndex(int index) {
00065   ColPartition_IT it(&parts_);
00066   it.mark_cycle_pt();
00067   for (int i = 0; i < index && !it.cycled_list(); ++i, it.forward());
00068   if (it.cycled_list())
00069     return NULL;
00070   return it.data();
00071 }
00072 
00073 // Return the ColPartition that contains the given coords, if any, else NULL.
00074 ColPartition* ColPartitionSet::ColumnContaining(int x, int y) {
00075   ColPartition_IT it(&parts_);
00076   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00077     ColPartition* part = it.data();
00078     if (part->ColumnContains(x, y))
00079       return part;
00080   }
00081   return NULL;
00082 }
00083 
00084 // Extract all the parts from the list, relinquishing ownership.
00085 void ColPartitionSet::RelinquishParts() {
00086   ColPartition_IT it(&parts_);
00087   while (!it.empty()) {
00088     it.extract();
00089     it.forward();
00090   }
00091 }
00092 
00093 // Attempt to improve this by adding partitions or expanding partitions.
00094 void ColPartitionSet::ImproveColumnCandidate(WidthCallback* cb,
00095                                              PartSetVector* src_sets) {
00096   int set_size = src_sets->size();
00097   // Iterate over the provided column sets, as each one may have something
00098   // to improve this.
00099   for (int i = 0; i < set_size; ++i) {
00100     ColPartitionSet* column_set = src_sets->get(i);
00101     if (column_set == NULL)
00102       continue;
00103     // Iterate over the parts in this and column_set, adding bigger or
00104     // new parts in column_set to this.
00105     ColPartition_IT part_it(&parts_);
00106     ASSERT_HOST(!part_it.empty());
00107     int prev_right = MIN_INT32;
00108     part_it.mark_cycle_pt();
00109     ColPartition_IT col_it(&column_set->parts_);
00110     for (col_it.mark_cycle_pt(); !col_it.cycled_list(); col_it.forward()) {
00111       ColPartition* col_part = col_it.data();
00112       if (col_part->blob_type() < BRT_UNKNOWN)
00113         continue;  // Ignore image partitions.
00114       int col_left = col_part->left_key();
00115       int col_right = col_part->right_key();
00116       // Sync-up part_it (in this) so it matches the col_part in column_set.
00117       ColPartition* part = part_it.data();
00118       while (!part_it.at_last() && part->right_key() < col_left) {
00119         prev_right = part->right_key();
00120         part_it.forward();
00121         part = part_it.data();
00122       }
00123       int part_left = part->left_key();
00124       int part_right = part->right_key();
00125       if (part_right < col_left || col_right < part_left) {
00126         // There is no overlap so this is a new partition.
00127         AddPartition(col_part->ShallowCopy(), &part_it);
00128         continue;
00129       }
00130       // Check the edges of col_part to see if they can improve part.
00131       bool part_width_ok = cb->Run(part->KeyWidth(part_left, part_right));
00132       if (col_left < part_left && col_left > prev_right) {
00133         // The left edge of the column is better and it doesn't overlap,
00134         // so we can potentially expand it.
00135         int col_box_left = col_part->BoxLeftKey();
00136         bool tab_width_ok = cb->Run(part->KeyWidth(col_left, part_right));
00137         bool box_width_ok = cb->Run(part->KeyWidth(col_box_left, part_right));
00138         if (tab_width_ok || (!part_width_ok )) {
00139           // The tab is leaving the good column metric at least as good as
00140           // it was before, so use the tab.
00141           part->CopyLeftTab(*col_part, false);
00142           part->SetColumnGoodness(cb);
00143         } else if (col_box_left < part_left &&
00144                    (box_width_ok || !part_width_ok)) {
00145           // The box is leaving the good column metric at least as good as
00146           // it was before, so use the box.
00147           part->CopyLeftTab(*col_part, true);
00148           part->SetColumnGoodness(cb);
00149         }
00150         part_left = part->left_key();
00151       }
00152       if (col_right > part_right &&
00153           (part_it.at_last() ||
00154            part_it.data_relative(1)->left_key() > col_right)) {
00155         // The right edge is better, so we can possibly expand it.
00156         int col_box_right = col_part->BoxRightKey();
00157         bool tab_width_ok = cb->Run(part->KeyWidth(part_left, col_right));
00158         bool box_width_ok = cb->Run(part->KeyWidth(part_left, col_box_right));
00159         if (tab_width_ok || (!part_width_ok )) {
00160           // The tab is leaving the good column metric at least as good as
00161           // it was before, so use the tab.
00162           part->CopyRightTab(*col_part, false);
00163           part->SetColumnGoodness(cb);
00164         } else if (col_box_right > part_right &&
00165                    (box_width_ok || !part_width_ok)) {
00166           // The box is leaving the good column metric at least as good as
00167           // it was before, so use the box.
00168           part->CopyRightTab(*col_part, true);
00169           part->SetColumnGoodness(cb);
00170         }
00171       }
00172     }
00173   }
00174   ComputeCoverage();
00175 }
00176 
00177 // If this set is good enough to represent a new partitioning into columns,
00178 // add it to the vector of sets, otherwise delete it.
00179 void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector* column_sets,
00180                                               WidthCallback* cb) {
00181   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
00182                                          bounding_box_.bottom());
00183   if (debug) {
00184     tprintf("Considering new column candidate:\n");
00185     Print();
00186   }
00187   if (!LegalColumnCandidate()) {
00188     if (debug) {
00189       tprintf("Not a legal column candidate:\n");
00190       Print();
00191     }
00192     delete this;
00193     return;
00194   }
00195   for (int i = 0; i < column_sets->size(); ++i) {
00196     ColPartitionSet* columns = column_sets->get(i);
00197     // In ordering the column set candidates, good_coverage_ is king,
00198     // followed by good_column_count_ and then bad_coverage_.
00199     bool better = good_coverage_ > columns->good_coverage_;
00200     if (good_coverage_ == columns->good_coverage_) {
00201       better = good_column_count_ > columns->good_column_count_;
00202       if (good_column_count_ == columns->good_column_count_) {
00203           better = bad_coverage_ > columns->bad_coverage_;
00204       }
00205     }
00206     if (better) {
00207       // The new one is better so add it.
00208       if (debug)
00209         tprintf("Good one\n");
00210       column_sets->insert(this, i);
00211       return;
00212     }
00213     if (columns->CompatibleColumns(false, this, cb)) {
00214       if (debug)
00215         tprintf("Duplicate\n");
00216       delete this;
00217       return;  // It is not unique.
00218     }
00219   }
00220   if (debug)
00221     tprintf("Added to end\n");
00222   column_sets->push_back(this);
00223 }
00224 
00225 // Return true if the partitions in other are all compatible with the columns
00226 // in this.
00227 bool ColPartitionSet::CompatibleColumns(bool debug, ColPartitionSet* other,
00228                                         WidthCallback* cb) {
00229   if (debug) {
00230     tprintf("CompatibleColumns testing compatibility\n");
00231     Print();
00232     other->Print();
00233   }
00234   if (other->parts_.empty()) {
00235     if (debug)
00236       tprintf("CompatibleColumns true due to empty other\n");
00237     return true;
00238   }
00239   ColPartition_IT it(&other->parts_);
00240   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00241     ColPartition* part = it.data();
00242     if (part->blob_type() < BRT_UNKNOWN) {
00243       if (debug) {
00244         tprintf("CompatibleColumns ignoring image partition\n");
00245         part->Print();
00246       }
00247       continue;  // Image partitions are irrelevant to column compatibility.
00248     }
00249     int y = part->MidY();
00250     int left = part->bounding_box().left();
00251     int right = part->bounding_box().right();
00252     ColPartition* left_col = ColumnContaining(left, y);
00253     ColPartition* right_col = ColumnContaining(right, y);
00254     if (right_col == NULL || left_col == NULL) {
00255       if (debug) {
00256         tprintf("CompatibleColumns false due to partition edge outside\n");
00257         part->Print();
00258       }
00259       return false;  // A partition edge lies outside of all columns
00260     }
00261     if (right_col != left_col && cb->Run(right - left)) {
00262       if (debug) {
00263         tprintf("CompatibleColumns false due to good width in multiple cols\n");
00264         part->Print();
00265       }
00266       return false;  // Partition with a good width must be in a single column.
00267     }
00268 
00269     ColPartition_IT it2= it;
00270     while (!it2.at_last()) {
00271       it2.forward();
00272       ColPartition* next_part = it2.data();
00273       if (!BLOBNBOX::IsTextType(next_part->blob_type()))
00274         continue;  // Non-text partitions are irrelevant.
00275       int next_left = next_part->bounding_box().left();
00276       if (next_left == right) {
00277         break;  // They share the same edge, so one must be a pull-out.
00278       }
00279       // Search to see if right and next_left fall within a single column.
00280       ColPartition* next_left_col = ColumnContaining(next_left, y);
00281       if (right_col == next_left_col) {
00282         // There is a column break in this column.
00283         // This can be due to a figure caption within a column, a pull-out
00284         // block, or a simple broken textline that remains to be merged:
00285         // all allowed, or a change in column layout: not allowed.
00286         // If both partitions are of good width, then it is likely
00287         // a change in column layout, otherwise probably an allowed situation.
00288         if (part->good_width() && next_part->good_width()) {
00289           if (debug) {
00290             int next_right = next_part->bounding_box().right();
00291             tprintf("CompatibleColumns false due to 2 parts of good width\n");
00292             tprintf("part1 %d-%d, part2 %d-%d\n",
00293                     left, right, next_left, next_right);
00294             right_col->Print();
00295           }
00296           return false;
00297         }
00298       }
00299       break;
00300     }
00301   }
00302   if (debug)
00303     tprintf("CompatibleColumns true!\n");
00304   return true;
00305 }
00306 
00307 // Returns the total width of all blobs in the part_set that do not lie
00308 // within an approved column. Used as a cost measure for using this
00309 // column set over another that might be compatible.
00310 int ColPartitionSet::UnmatchedWidth(ColPartitionSet* part_set) {
00311   int total_width = 0;
00312   ColPartition_IT it(&part_set->parts_);
00313   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00314     ColPartition* part = it.data();
00315     if (!BLOBNBOX::IsTextType(part->blob_type())) {
00316       continue;  // Non-text partitions are irrelevant to column compatibility.
00317     }
00318     int y = part->MidY();
00319     BLOBNBOX_C_IT box_it(part->boxes());
00320     for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
00321       const TBOX& box = it.data()->bounding_box();
00322       // Assume that the whole blob is outside any column iff its x-middle
00323       // is outside.
00324       int x = (box.left() + box.right()) / 2;
00325       ColPartition* col = ColumnContaining(x, y);
00326       if (col == NULL)
00327         total_width += box.width();
00328     }
00329   }
00330   return total_width;
00331 }
00332 
00333 // Return true if this ColPartitionSet makes a legal column candidate by
00334 // having legal individual partitions and non-overlapping adjacent pairs.
00335 bool ColPartitionSet::LegalColumnCandidate() {
00336   ColPartition_IT it(&parts_);
00337   if (it.empty())
00338     return false;
00339   bool any_text_parts = false;
00340   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00341     ColPartition* part = it.data();
00342     if (BLOBNBOX::IsTextType(part->blob_type())) {
00343       if (!part->IsLegal())
00344         return false;  // Individual partition is illegal.
00345       any_text_parts = true;
00346     }
00347     if (!it.at_last()) {
00348       ColPartition* next_part = it.data_relative(1);
00349       if (next_part->left_key() < part->right_key()) {
00350         return false;
00351       }
00352     }
00353   }
00354   return any_text_parts;
00355 }
00356 
00357 // Return a copy of this. If good_only will only copy the Good ColPartitions.
00358 ColPartitionSet* ColPartitionSet::Copy(bool good_only) {
00359   ColPartition_LIST copy_parts;
00360   ColPartition_IT src_it(&parts_);
00361   ColPartition_IT dest_it(&copy_parts);
00362   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
00363     ColPartition* part = src_it.data();
00364     if (BLOBNBOX::IsTextType(part->blob_type()) &&
00365         (!good_only || part->good_width() || part->good_column()))
00366       dest_it.add_after_then_move(part->ShallowCopy());
00367   }
00368   if (dest_it.empty())
00369     return NULL;
00370   return new ColPartitionSet(&copy_parts);
00371 }
00372 
00373 // Return the bounding boxes of columns at the given y-range
00374 void ColPartitionSet::GetColumnBoxes(int y_bottom, int y_top,
00375                                      ColSegment_LIST *segments) {
00376   ColPartition_IT it(&parts_);
00377   ColSegment_IT col_it(segments);
00378   col_it.move_to_last();
00379   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00380     ColPartition* part = it.data();
00381     ICOORD bot_left(part->LeftAtY(y_top), y_bottom);
00382     ICOORD top_right(part->RightAtY(y_bottom), y_top);
00383     ColSegment *col_seg = new ColSegment();
00384     col_seg->InsertBox(TBOX(bot_left, top_right));
00385     col_it.add_after_then_move(col_seg);
00386   }
00387 }
00388 
00389 // Display the edges of the columns at the given y coords.
00390 void ColPartitionSet::DisplayColumnEdges(int y_bottom, int y_top,
00391                                          ScrollView* win) {
00392 #ifndef GRAPHICS_DISABLED
00393   ColPartition_IT it(&parts_);
00394   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00395     ColPartition* part = it.data();
00396     win->Line(part->LeftAtY(y_top), y_top, part->LeftAtY(y_bottom), y_bottom);
00397     win->Line(part->RightAtY(y_top), y_top, part->RightAtY(y_bottom), y_bottom);
00398   }
00399 #endif  // GRAPHICS_DISABLED
00400 }
00401 
00402 // Return the ColumnSpanningType that best explains the columns overlapped
00403 // by the given coords(left,right,y), with the given margins.
00404 // Also return the first and last column index touched by the coords and
00405 // the leftmost spanned column.
00406 // Column indices are 2n + 1 for real columns (0 based) and even values
00407 // represent the gaps in between columns, with 0 being left of the leftmost.
00408 // resolution refers to the ppi resolution of the image.
00409 ColumnSpanningType ColPartitionSet::SpanningType(int resolution,
00410                                                  int left, int right,
00411                                                  int height, int y,
00412                                                  int left_margin,
00413                                                  int right_margin,
00414                                                  int* first_col,
00415                                                  int* last_col,
00416                                                  int* first_spanned_col) {
00417   *first_col = -1;
00418   *last_col = -1;
00419   *first_spanned_col = -1;
00420   int margin_columns = 0;
00421   ColPartition_IT it(&parts_);
00422   int col_index = 1;
00423   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), col_index += 2) {
00424     ColPartition* part = it.data();
00425     if (part->ColumnContains(left, y) ||
00426         (it.at_first() && part->ColumnContains(left + height, y))) {
00427       // In the default case, first_col is set, but columns_spanned remains
00428       // zero, so first_col will get reset in the first column genuinely
00429       // spanned, but we can tell the difference from a noise partition
00430       // that touches no column.
00431       *first_col = col_index;
00432       if (part->ColumnContains(right, y) ||
00433           (it.at_last() && part->ColumnContains(right - height, y))) {
00434         // Both within a single column.
00435         *last_col = col_index;
00436         return CST_FLOWING;
00437       }
00438       if (left_margin <= part->LeftAtY(y)) {
00439         // It completely spans this column.
00440         *first_spanned_col = col_index;
00441         margin_columns = 1;
00442       }
00443     } else if (part->ColumnContains(right, y) ||
00444                (it.at_last() && part->ColumnContains(right - height, y))) {
00445       if (*first_col < 0) {
00446         // It started in-between.
00447         *first_col = col_index - 1;
00448       }
00449       if (right_margin >= part->RightAtY(y)) {
00450         // It completely spans this column.
00451         if (margin_columns == 0)
00452           *first_spanned_col = col_index;
00453         ++margin_columns;
00454       }
00455       *last_col = col_index;
00456       break;
00457     } else if (left < part->LeftAtY(y) && right > part->RightAtY(y)) {
00458       // Neither left nor right are contained within, so it spans this
00459       // column.
00460       if (*first_col < 0) {
00461         // It started in between the previous column and the current column.
00462         *first_col = col_index - 1;
00463       }
00464       if (margin_columns == 0)
00465         *first_spanned_col = col_index;
00466       *last_col = col_index;
00467     } else if (right < part->LeftAtY(y)) {
00468       // We have gone past the end.
00469       *last_col = col_index - 1;
00470       if (*first_col < 0) {
00471         // It must lie completely between columns =>noise.
00472         *first_col = col_index - 1;
00473       }
00474       break;
00475     }
00476   }
00477   if (*first_col < 0)
00478     *first_col = col_index - 1;  // The last in-between.
00479   if (*last_col < 0)
00480     *last_col = col_index - 1;  // The last in-between.
00481   ASSERT_HOST(*first_col >= 0 && *last_col >= 0);
00482   ASSERT_HOST(*first_col <= *last_col);
00483   if (*first_col == *last_col && right - left < kMinColumnWidth * resolution) {
00484     // Neither end was in a column, and it didn't span any, so it lies
00485     // entirely between columns, therefore noise.
00486     return CST_NOISE;
00487   } else if (margin_columns <= 1) {
00488     // An exception for headings that stick outside of single-column text.
00489     if (margin_columns == 1 && parts_.singleton()) {
00490       return CST_HEADING;
00491     }
00492     // It is a pullout, as left and right were not in the same column, but
00493     // it doesn't go to the edge of its start and end.
00494     return CST_PULLOUT;
00495   }
00496   // Its margins went to the edges of first and last columns => heading.
00497   return CST_HEADING;
00498 }
00499 
00500 // The column_set has changed. Close down all in-progress WorkingPartSets in
00501 // columns that do not match and start new ones for the new columns in this.
00502 // As ColPartitions are turned into BLOCKs, the used ones are put in
00503 // used_parts, as they still need to be referenced in the grid.
00504 void ColPartitionSet::ChangeWorkColumns(const ICOORD& bleft,
00505                                         const ICOORD& tright,
00506                                         int resolution,
00507                                         ColPartition_LIST* used_parts,
00508                                         WorkingPartSet_LIST* working_set_list) {
00509   // Move the input list to a temporary location so we can delete its elements
00510   // as we add them to the output working_set.
00511   WorkingPartSet_LIST work_src;
00512   WorkingPartSet_IT src_it(&work_src);
00513   src_it.add_list_after(working_set_list);
00514   src_it.move_to_first();
00515   WorkingPartSet_IT dest_it(working_set_list);
00516   // Completed blocks and to_blocks are accumulated and given to the first new
00517   // one  whenever we keep a column, or at the end.
00518   BLOCK_LIST completed_blocks;
00519   TO_BLOCK_LIST to_blocks;
00520   WorkingPartSet* first_new_set = NULL;
00521   WorkingPartSet* working_set = NULL;
00522   ColPartition_IT col_it(&parts_);
00523   for (col_it.mark_cycle_pt(); !col_it.cycled_list(); col_it.forward()) {
00524     ColPartition* column = col_it.data();
00525     // Any existing column to the left of column is completed.
00526     while (!src_it.empty() &&
00527            ((working_set = src_it.data())->column() == NULL ||
00528             working_set->column()->right_key() <= column->left_key())) {
00529       src_it.extract();
00530       working_set->ExtractCompletedBlocks(bleft, tright, resolution,
00531                                           used_parts, &completed_blocks,
00532                                           &to_blocks);
00533       delete working_set;
00534       src_it.forward();
00535     }
00536     // Make a new between-column WorkingSet for before the current column.
00537     working_set = new WorkingPartSet(NULL);
00538     dest_it.add_after_then_move(working_set);
00539     if (first_new_set == NULL)
00540       first_new_set = working_set;
00541     // A matching column gets to stay, and first_new_set gets all the
00542     // completed_sets.
00543     working_set = src_it.empty() ? NULL : src_it.data();
00544     if (working_set != NULL &&
00545         working_set->column()->MatchingColumns(*column)) {
00546       working_set->set_column(column);
00547       dest_it.add_after_then_move(src_it.extract());
00548       src_it.forward();
00549       first_new_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
00550       first_new_set = NULL;
00551     } else {
00552       // Just make a new working set for the current column.
00553       working_set = new WorkingPartSet(column);
00554       dest_it.add_after_then_move(working_set);
00555     }
00556   }
00557   // Complete any remaining src working sets.
00558   while (!src_it.empty()) {
00559     working_set = src_it.extract();
00560     working_set->ExtractCompletedBlocks(bleft, tright, resolution,
00561                                         used_parts, &completed_blocks,
00562                                         &to_blocks);
00563     delete working_set;
00564     src_it.forward();
00565   }
00566   // Make a new between-column WorkingSet for after the last column.
00567   working_set = new WorkingPartSet(NULL);
00568   dest_it.add_after_then_move(working_set);
00569   if (first_new_set == NULL)
00570     first_new_set = working_set;
00571   // The first_new_set now gets any accumulated completed_parts/blocks.
00572   first_new_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
00573 }
00574 
00575 // Accumulate the widths and gaps into the given variables.
00576 void ColPartitionSet::AccumulateColumnWidthsAndGaps(int* total_width,
00577                                                     int* width_samples,
00578                                                     int* total_gap,
00579                                                     int* gap_samples) {
00580   ColPartition_IT it(&parts_);
00581   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00582     ColPartition* part = it.data();
00583     *total_width += part->ColumnWidth();
00584     ++*width_samples;
00585     if (!it.at_last()) {
00586       ColPartition* next_part = it.data_relative(1);
00587       int gap = part->KeyWidth(part->right_key(), next_part->left_key());
00588       *total_gap += gap;
00589       ++*gap_samples;
00590     }
00591   }
00592 }
00593 
00594 // Provide debug output for this ColPartitionSet and all the ColPartitions.
00595 void ColPartitionSet::Print() {
00596   ColPartition_IT it(&parts_);
00597   tprintf("Partition set of %d parts, %d good, coverage=%d+%d"
00598           " (%d,%d)->(%d,%d)\n",
00599           it.length(), good_column_count_, good_coverage_, bad_coverage_,
00600           bounding_box_.left(), bounding_box_.bottom(),
00601           bounding_box_.right(), bounding_box_.top());
00602   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00603     ColPartition* part = it.data();
00604     part->Print();
00605   }
00606 }
00607 
00608 // PRIVATE CODE.
00609 
00610 // Add the given partition to the list in the appropriate place.
00611 void ColPartitionSet::AddPartition(ColPartition* new_part,
00612                                    ColPartition_IT* it) {
00613   AddPartitionCoverageAndBox(*new_part);
00614   int new_right = new_part->right_key();
00615   if (it->data()->left_key() >= new_right)
00616     it->add_before_stay_put(new_part);
00617   else
00618     it->add_after_stay_put(new_part);
00619 }
00620 
00621 // Compute the coverage and good column count. Coverage is the amount of the
00622 // width of the page (in pixels) that is covered by ColPartitions, which are
00623 // used to provide candidate column layouts.
00624 // Coverage is split into good and bad. Good coverage is provided by
00625 // ColPartitions of a frequent width (according to the callback function
00626 // provided by TabFinder::WidthCB, which accesses stored statistics on the
00627 // widths of ColParititions) and bad coverage is provided by all other
00628 // ColPartitions, even if they have tab vectors at both sides. Thus:
00629 // |-----------------------------------------------------------------|
00630 // |        Double     width    heading                              |
00631 // |-----------------------------------------------------------------|
00632 // |-------------------------------| |-------------------------------|
00633 // |   Common width ColParition    | |  Common width ColPartition    |
00634 // |-------------------------------| |-------------------------------|
00635 // the layout with two common-width columns has better coverage than the
00636 // double width heading, because the coverage is "good," even though less in
00637 // total coverage than the heading, because the heading coverage is "bad."
00638 void ColPartitionSet::ComputeCoverage() {
00639   // Count the number of good columns and sum their width.
00640   ColPartition_IT it(&parts_);
00641   good_column_count_ = 0;
00642   good_coverage_ = 0;
00643   bad_coverage_ = 0;
00644   bounding_box_ = TBOX();
00645   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00646     ColPartition* part = it.data();
00647     AddPartitionCoverageAndBox(*part);
00648   }
00649 }
00650 
00651 // Adds the coverage, column count and box for a single partition,
00652 // without adding it to the list. (Helper factored from ComputeCoverage.)
00653 void ColPartitionSet::AddPartitionCoverageAndBox(const ColPartition& part) {
00654   bounding_box_ += part.bounding_box();
00655   int coverage = part.ColumnWidth();
00656   if (part.good_width()) {
00657     good_coverage_ += coverage;
00658     good_column_count_ += 2;
00659   } else {
00660     if (part.blob_type() < BRT_UNKNOWN)
00661       coverage /= 2;
00662     if (part.good_column())
00663       ++good_column_count_;
00664     bad_coverage_ += coverage;
00665   }
00666 }
00667 
00668 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines