|
tesseract 3.04.01
|
00001 00002 // File: colpartitionset.h 00003 // Description: Class to hold a list of ColPartitions of the page that 00004 // correspond roughly to columns. 00005 // Author: Ray Smith 00006 // Created: Thu Aug 14 10:50:01 PDT 2008 00007 // 00008 // (C) Copyright 2008, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_TEXTORD_COLPARTITIONSET_H__ 00022 #define TESSERACT_TEXTORD_COLPARTITIONSET_H__ 00023 00024 #include "colpartition.h" // For ColPartition_LIST. 00025 #include "genericvector.h" // For GenericVector. 00026 #include "rect.h" // For TBOX. 00027 #include "tabvector.h" // For BLOBNBOX_CLIST. 00028 00029 namespace tesseract { 00030 00031 class WorkingPartSet_LIST; 00032 class ColSegment_LIST; 00033 class ColPartitionSet; 00034 typedef GenericVector<ColPartitionSet*> PartSetVector; 00035 00036 // ColPartitionSet is a class that holds a list of ColPartitions. 00037 // Its main use is in holding a candidate partitioning of the width of the 00038 // image into columns, where each member ColPartition is a single column. 00039 // ColPartitionSets are used in building the column layout of a page. 00040 class ColPartitionSet : public ELIST_LINK { 00041 public: 00042 ColPartitionSet() { 00043 } 00044 explicit ColPartitionSet(ColPartition_LIST* partitions); 00045 explicit ColPartitionSet(ColPartition* partition); 00046 00047 ~ColPartitionSet(); 00048 00049 // Simple accessors. 00050 const TBOX& bounding_box() const { 00051 return bounding_box_; 00052 } 00053 bool Empty() const { 00054 return parts_.empty(); 00055 } 00056 int ColumnCount() const { 00057 return parts_.length(); 00058 } 00059 00060 // Returns the number of columns of good width. 00061 int GoodColumnCount() const; 00062 00063 // Return an element of the parts_ list from its index. 00064 ColPartition* GetColumnByIndex(int index); 00065 00066 // Return the ColPartition that contains the given coords, if any, else NULL. 00067 ColPartition* ColumnContaining(int x, int y); 00068 00069 // Return the bounding boxes of columns at the given y-range 00070 void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments); 00071 00072 // Extract all the parts from the list, relinquishing ownership. 00073 void RelinquishParts(); 00074 00075 // Attempt to improve this by adding partitions or expanding partitions. 00076 void ImproveColumnCandidate(WidthCallback* cb, PartSetVector* src_sets); 00077 00078 // If this set is good enough to represent a new partitioning into columns, 00079 // add it to the vector of sets, otherwise delete it. 00080 void AddToColumnSetsIfUnique(PartSetVector* column_sets, WidthCallback* cb); 00081 00082 // Return true if the partitions in other are all compatible with the columns 00083 // in this. 00084 bool CompatibleColumns(bool debug, ColPartitionSet* other, WidthCallback* cb); 00085 00086 // Returns the total width of all blobs in the part_set that do not lie 00087 // within an approved column. Used as a cost measure for using this 00088 // column set over another that might be compatible. 00089 int UnmatchedWidth(ColPartitionSet* part_set); 00090 00091 // Return true if this ColPartitionSet makes a legal column candidate by 00092 // having legal individual partitions and non-overlapping adjacent pairs. 00093 bool LegalColumnCandidate(); 00094 00095 // Return a copy of this. If good_only will only copy the Good ColPartitions. 00096 ColPartitionSet* Copy(bool good_only); 00097 00098 // Display the edges of the columns at the given y coords. 00099 void DisplayColumnEdges(int y_bottom, int y_top, ScrollView* win); 00100 00101 // Return the ColumnSpanningType that best explains the columns overlapped 00102 // by the given coords(left,right,y), with the given margins. 00103 // Also return the first and last column index touched by the coords and 00104 // the leftmost spanned column. 00105 // Column indices are 2n + 1 for real colums (0 based) and even values 00106 // represent the gaps in between columns, with 0 being left of the leftmost. 00107 // resolution refers to the ppi resolution of the image. It may be 0 if only 00108 // the first_col and last_col are required. 00109 ColumnSpanningType SpanningType(int resolution, 00110 int left, int right, int height, int y, 00111 int left_margin, int right_margin, 00112 int* first_col, int* last_col, 00113 int* first_spanned_col); 00114 00115 // The column_set has changed. Close down all in-progress WorkingPartSets in 00116 // columns that do not match and start new ones for the new columns in this. 00117 // As ColPartitions are turned into BLOCKs, the used ones are put in 00118 // used_parts, as they still need to be referenced in the grid. 00119 void ChangeWorkColumns(const ICOORD& bleft, const ICOORD& tright, 00120 int resolution, ColPartition_LIST* used_parts, 00121 WorkingPartSet_LIST* working_set); 00122 00123 // Accumulate the widths and gaps into the given variables. 00124 void AccumulateColumnWidthsAndGaps(int* total_width, int* width_samples, 00125 int* total_gap, int* gap_samples); 00126 00127 // Provide debug output for this ColPartitionSet and all the ColPartitions. 00128 void Print(); 00129 00130 private: 00131 // Add the given partition to the list in the appropriate place. 00132 void AddPartition(ColPartition* new_part, ColPartition_IT* it); 00133 00134 // Compute the coverage and good column count. Coverage is the amount of the 00135 // width of the page (in pixels) that is covered by ColPartitions, which are 00136 // used to provide candidate column layouts. 00137 // Coverage is split into good and bad. Good coverage is provided by 00138 // ColPartitions of a frequent width (according to the callback function 00139 // provided by TabFinder::WidthCB, which accesses stored statistics on the 00140 // widths of ColParititions) and bad coverage is provided by all other 00141 // ColPartitions, even if they have tab vectors at both sides. Thus: 00142 // |-----------------------------------------------------------------| 00143 // | Double width heading | 00144 // |-----------------------------------------------------------------| 00145 // |-------------------------------| |-------------------------------| 00146 // | Common width ColParition | | Common width ColPartition | 00147 // |-------------------------------| |-------------------------------| 00148 // the layout with two common-width columns has better coverage than the 00149 // double width heading, because the coverage is "good," even though less in 00150 // total coverage than the heading, because the heading coverage is "bad." 00151 void ComputeCoverage(); 00152 00153 // Adds the coverage, column count and box for a single partition, 00154 // without adding it to the list. (Helper factored from ComputeCoverage.) 00155 void AddPartitionCoverageAndBox(const ColPartition& part); 00156 00157 // The partitions in this column candidate. 00158 ColPartition_LIST parts_; 00159 // The number of partitions that have a frequent column width. 00160 int good_column_count_; 00161 // Total width of all the good ColPartitions. 00162 int good_coverage_; 00163 // Total width of all the bad ColPartitions. 00164 int bad_coverage_; 00165 // Bounding box of all partitions in the set. 00166 TBOX bounding_box_; 00167 }; 00168 00169 ELISTIZEH(ColPartitionSet) 00170 00171 } // namespace tesseract. 00172 00173 #endif // TESSERACT_TEXTORD_COLPARTITION_H__