tesseract 3.04.01

textord/colpartitionset.h

Go to the documentation of this file.
00001 
00002 // File:        colpartitionset.h
00003 // Description: Class to hold a list of ColPartitions of the page that
00004 //              correspond roughly to columns.
00005 // Author:      Ray Smith
00006 // Created:     Thu Aug 14 10:50:01 PDT 2008
00007 //
00008 // (C) Copyright 2008, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifndef TESSERACT_TEXTORD_COLPARTITIONSET_H__
00022 #define TESSERACT_TEXTORD_COLPARTITIONSET_H__
00023 
00024 #include "colpartition.h"   // For ColPartition_LIST.
00025 #include "genericvector.h"  // For GenericVector.
00026 #include "rect.h"           // For TBOX.
00027 #include "tabvector.h"      // For BLOBNBOX_CLIST.
00028 
00029 namespace tesseract {
00030 
00031 class WorkingPartSet_LIST;  // Only named via pointer (ChangeWorkColumns).
00032 class ColSegment_LIST;      // Only named via pointer (GetColumnBoxes).
00033 class ColPartitionSet;      // Declared early so the typedef below can name it.
00034 typedef GenericVector<ColPartitionSet*> PartSetVector;
00035 
00036 // ColPartitionSet is a class that holds a list of ColPartitions.
00037 // Its main use is in holding a candidate partitioning of the width of the
00038 // image into columns, where each member ColPartition is a single column.
00039 // ColPartitionSets are used in building the column layout of a page.
00040 class ColPartitionSet : public ELIST_LINK {
00041  public:
00042   ColPartitionSet()  // Empty set: counters start at zero, never indeterminate.
00043       : good_column_count_(0), good_coverage_(0), bad_coverage_(0) {}
00044   explicit ColPartitionSet(ColPartition_LIST* partitions);
00045   explicit ColPartitionSet(ColPartition* partition);
00046 
00047   ~ColPartitionSet();
00048 
00049   // Simple accessors.
00050   const TBOX& bounding_box() const {
00051     return bounding_box_;
00052   }
00053   bool Empty() const {
00054     return parts_.empty();
00055   }
00056   int ColumnCount() const {
00057     return parts_.length();
00058   }
00059 
00060   // Returns the number of columns of good width.
00061   int GoodColumnCount() const;
00062 
00063   // Return an element of the parts_ list from its index.
00064   ColPartition* GetColumnByIndex(int index);
00065 
00066   // Return the ColPartition that contains the given coords, if any, else NULL.
00067   ColPartition* ColumnContaining(int x, int y);
00068 
00069   // Return the bounding boxes of columns at the given y-range.
00070   void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments);
00071 
00072   // Extract all the parts from the list, relinquishing ownership.
00073   void RelinquishParts();
00074 
00075   // Attempt to improve this by adding partitions or expanding partitions.
00076   void ImproveColumnCandidate(WidthCallback* cb, PartSetVector* src_sets);
00077 
00078   // If this set is good enough to represent a new partitioning into columns,
00079   // add it to the vector of sets, otherwise delete it.
00080   void AddToColumnSetsIfUnique(PartSetVector* column_sets, WidthCallback* cb);
00081 
00082   // Return true if the partitions in other are all compatible with the columns
00083   // in this.
00084   bool CompatibleColumns(bool debug, ColPartitionSet* other, WidthCallback* cb);
00085 
00086   // Returns the total width of all blobs in the part_set that do not lie
00087   // within an approved column. Used as a cost measure for using this
00088   // column set over another that might be compatible.
00089   int UnmatchedWidth(ColPartitionSet* part_set);
00090 
00091   // Return true if this ColPartitionSet makes a legal column candidate by
00092   // having legal individual partitions and non-overlapping adjacent pairs.
00093   bool LegalColumnCandidate();
00094 
00095   // Return a copy of this. If good_only will only copy the Good ColPartitions.
00096   ColPartitionSet* Copy(bool good_only);
00097 
00098   // Display the edges of the columns at the given y coords.
00099   void DisplayColumnEdges(int y_bottom, int y_top, ScrollView* win);
00100 
00101   // Return the ColumnSpanningType that best explains the columns overlapped
00102   // by the given coords(left,right,y), with the given margins.
00103   // Also return the first and last column index touched by the coords and
00104   // the leftmost spanned column.
00105   // Column indices are 2n + 1 for real columns (0 based) and even values
00106   // represent the gaps in between columns, with 0 being left of the leftmost.
00107   // resolution refers to the ppi resolution of the image. It may be 0 if only
00108   // the first_col and last_col are required.
00109   ColumnSpanningType SpanningType(int resolution,
00110                                   int left, int right, int height, int y,
00111                                   int left_margin, int right_margin,
00112                                   int* first_col, int* last_col,
00113                                   int* first_spanned_col);
00114 
00115   // The column_set has changed. Close down all in-progress WorkingPartSets in
00116   // columns that do not match and start new ones for the new columns in this.
00117   // As ColPartitions are turned into BLOCKs, the used ones are put in
00118   // used_parts, as they still need to be referenced in the grid.
00119   void ChangeWorkColumns(const ICOORD& bleft, const ICOORD& tright,
00120                          int resolution, ColPartition_LIST* used_parts,
00121                          WorkingPartSet_LIST* working_set);
00122 
00123   // Accumulate the widths and gaps into the given variables.
00124   void AccumulateColumnWidthsAndGaps(int* total_width, int* width_samples,
00125                                      int* total_gap, int* gap_samples);
00126 
00127   // Provide debug output for this ColPartitionSet and all the ColPartitions.
00128   void Print();
00129 
00130  private:
00131   // Add the given partition to the list in the appropriate place.
00132   void AddPartition(ColPartition* new_part, ColPartition_IT* it);
00133 
00134   // Compute the coverage and good column count. Coverage is the amount of the
00135   // width of the page (in pixels) that is covered by ColPartitions, which are
00136   // used to provide candidate column layouts.
00137   // Coverage is split into good and bad. Good coverage is provided by
00138   // ColPartitions of a frequent width (according to the callback function
00139   // provided by TabFinder::WidthCB, which accesses stored statistics on the
00140   // widths of ColPartitions) and bad coverage is provided by all other
00141   // ColPartitions, even if they have tab vectors at both sides. Thus:
00142   // |-----------------------------------------------------------------|
00143   // |        Double     width    heading                              |
00144   // |-----------------------------------------------------------------|
00145   // |-------------------------------| |-------------------------------|
00146   // |   Common width ColPartition   | |  Common width ColPartition    |
00147   // |-------------------------------| |-------------------------------|
00148   // the layout with two common-width columns has better coverage than the
00149   // double width heading, because the coverage is "good," even though less in
00150   // total coverage than the heading, because the heading coverage is "bad."
00151   void ComputeCoverage();
00152 
00153   // Adds the coverage, column count and box for a single partition,
00154   // without adding it to the list. (Helper factored from ComputeCoverage.)
00155   void AddPartitionCoverageAndBox(const ColPartition& part);
00156 
00157   // The partitions in this column candidate.
00158   ColPartition_LIST parts_;
00159   // The number of partitions that have a frequent column width.
00160   int good_column_count_;
00161   // Total width of all the good ColPartitions.
00162   int good_coverage_;
00163   // Total width of all the bad ColPartitions.
00164   int bad_coverage_;
00165   // Bounding box of all partitions in the set.
00166   TBOX bounding_box_;
00167 };
00168 
00169 ELISTIZEH(ColPartitionSet)  // NOTE(review): presumably declares the ColPartitionSet list/iterator types -- confirm in elst.h.
00170 
00171 }  // namespace tesseract.
00172 
00173 #endif  // TESSERACT_TEXTORD_COLPARTITIONSET_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines