tesseract 3.04.01

textord/tablerecog.cpp

Go to the documentation of this file.
00001 
00002 // File:        tablerecog.cpp
00003 // Description: Helper class to help structure table areas. Given an bounding
00004 //              box from TableFinder, the TableRecognizer should give a
00005 //              StructuredTable (maybe a list in the future) of "good" tables
00006 //              in that area.
00007 // Author:      Nicholas Beato
00008 // Created:     Friday, Aug. 20, 2010
00009 //
00010 // (C) Copyright 2009, Google Inc.
00011 // Licensed under the Apache License, Version 2.0 (the "License");
00012 // you may not use this file except in compliance with the License.
00013 // You may obtain a copy of the License at
00014 // http://www.apache.org/licenses/LICENSE-2.0
00015 // Unless required by applicable law or agreed to in writing, software
00016 // distributed under the License is distributed on an "AS IS" BASIS,
00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00018 // See the License for the specific language governing permissions and
00019 // limitations under the License.
00020 //
00022 
00023 #ifdef HAVE_CONFIG_H
00024 #include "config_auto.h"
00025 #endif
00026 
00027 #include "tablerecog.h"
00028 
00029 namespace tesseract {
00030 
00031 // The amount of space required between the ColPartitions in 2 columns
00032 // of a non-lined table as a multiple of the median width.
00033 const double kHorizontalSpacing = 0.30;
00034 // The amount of space required between the ColPartitions in 2 rows
00035 // of a non-lined table as multiples of the median height.
00036 const double kVerticalSpacing = -0.2;
00037 // The number of cells that the grid lines may intersect.
00038 // See FindCellSplitLocations for explanation.
00039 const int kCellSplitRowThreshold = 0;
00040 const int kCellSplitColumnThreshold = 0;
00041 // For "lined tables", the number of required lines. Currently a guess.
00042 const int kLinedTableMinVerticalLines = 3;
00043 const int kLinedTableMinHorizontalLines = 3;
00044 // Number of columns required, as a fraction of the most columns found.
00045 // None of these are tweaked at all.
00046 const double kRequiredColumns = 0.7;
00047 // The tolerance for comparing margins of potential tables.
00048 const double kMarginFactor = 1.1;
00049 // The first and last row should be consistent cell height.
00050 // This factor is the first and last row cell height max.
00051 const double kMaxRowSize = 2.5;
00052 // Number of filled columns required to form a strong table row.
00053 // For small tables, this is an absolute number.
00054 const double kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 };
00055 const int kGoodRowNumberOfColumnsSmallSize = 
00056     sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1;
00057 // For large tables, it is a relative number
00058 const double kGoodRowNumberOfColumnsLarge = 0.7;
00059 // The amount of area that must be covered in a cell by ColPartitions to
00060 // be considered "filled"
00061 const double kMinFilledArea = 0.35;
00062 
00066 
00067 StructuredTable::StructuredTable()
00068     : text_grid_(NULL),
00069       line_grid_(NULL),
00070       is_lined_(false),
00071       space_above_(0),
00072       space_below_(0),
00073       space_left_(0),
00074       space_right_(0),
00075       median_cell_height_(0),
00076       median_cell_width_(0),
00077       max_text_height_(MAX_INT32) {
00078 }
00079 
00080 StructuredTable::~StructuredTable() {
00081 }
00082 
00083 void StructuredTable::Init() {
00084 }
00085 
00086 void StructuredTable::set_text_grid(ColPartitionGrid* text_grid) {
00087   text_grid_ = text_grid;
00088 }
00089 void StructuredTable::set_line_grid(ColPartitionGrid* line_grid) {
00090   line_grid_ = line_grid;
00091 }
00092 void StructuredTable::set_max_text_height(int height) {
00093   max_text_height_ = height;
00094 }
00095 bool StructuredTable::is_lined() const {
00096   return is_lined_;
00097 }
00098 int StructuredTable::row_count() const {
00099   return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1;
00100 }
00101 int StructuredTable::column_count() const {
00102   return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1;
00103 }
00104 int StructuredTable::cell_count() const {
00105   return row_count() * column_count();
00106 }
00107 void StructuredTable::set_bounding_box(const TBOX& box) {
00108   bounding_box_ = box;
00109 }
00110 const TBOX& StructuredTable::bounding_box() const {
00111   return bounding_box_;
00112 }
00113 int StructuredTable::median_cell_height() {
00114   return median_cell_height_;
00115 }
00116 int StructuredTable::median_cell_width() {
00117   return median_cell_width_;
00118 }
00119 int StructuredTable::row_height(int row) const {
00120   ASSERT_HOST(0 <= row && row < row_count());
00121   return cell_y_[row + 1] - cell_y_[row];
00122 }
00123 int StructuredTable::column_width(int column) const {
00124   ASSERT_HOST(0 <= column && column < column_count());
00125   return cell_x_[column + 1] - cell_x_[column];
00126 }
00127 int StructuredTable::space_above() const {
00128   return space_above_;
00129 }
00130 int StructuredTable::space_below() const {
00131   return space_below_;
00132 }
00133 
00134 // At this point, we know that the lines are contained
00135 // by the box (by FindLinesBoundingBox).
00136 // So try to find the cell structure and make sure it works out.
00137 // The assumption is that all lines span the table. If this
00138 // assumption fails, the VerifyLinedTable method will
00139 // abort the lined table. The TableRecognizer will fall
00140 // back on FindWhitespacedStructure.
00141 bool StructuredTable::FindLinedStructure() {
00142   ClearStructure();
00143 
00144   // Search for all of the lines in the current box.
00145   // Update the cellular structure with the exact lines.
00146   ColPartitionGridSearch box_search(line_grid_);
00147   box_search.SetUniqueMode(true);
00148   box_search.StartRectSearch(bounding_box_);
00149   ColPartition* line = NULL;
00150 
00151   while ((line = box_search.NextRectSearch()) != NULL) {
00152     if (line->IsHorizontalLine())
00153       cell_y_.push_back(line->MidY());
00154     if (line->IsVerticalLine())
00155       cell_x_.push_back(line->MidX());
00156   }
00157 
00158   // HasSignificantLines should guarantee cells.
00159   // Because that code is a different class, just gracefully
00160   // return false. This could be an assert.
00161   if (cell_x_.length() < 3 || cell_y_.length() < 3)
00162     return false;
00163 
00164   cell_x_.sort();
00165   cell_y_.sort();
00166 
00167   // Remove duplicates that may have occurred due to split lines.
00168   cell_x_.compact_sorted();
00169   cell_y_.compact_sorted();
00170 
00171   // The border should be the extents of line boxes, not middle.
00172   cell_x_[0] = bounding_box_.left();
00173   cell_x_[cell_x_.length() - 1] = bounding_box_.right();
00174   cell_y_[0] = bounding_box_.bottom();
00175   cell_y_[cell_y_.length() - 1] = bounding_box_.top();
00176 
00177   // Remove duplicates that may have occurred due to moving the borders.
00178   cell_x_.compact_sorted();
00179   cell_y_.compact_sorted();
00180 
00181   CalculateMargins();
00182   CalculateStats();
00183   is_lined_ = VerifyLinedTableCells();
00184   return is_lined_;
00185 }
00186 
00187 // Finds the cellular structure given a particular box.
00188 bool StructuredTable::FindWhitespacedStructure() {
00189   ClearStructure();
00190   FindWhitespacedColumns();
00191   FindWhitespacedRows();
00192 
00193   if (!VerifyWhitespacedTable()) {
00194     return false;
00195   } else {
00196     bounding_box_.set_left(cell_x_[0]);
00197     bounding_box_.set_right(cell_x_[cell_x_.length() - 1]);
00198     bounding_box_.set_bottom(cell_y_[0]);
00199     bounding_box_.set_top(cell_y_[cell_y_.length() - 1]);
00200     AbsorbNearbyLines();
00201     CalculateMargins();
00202     CalculateStats();
00203     return true;
00204   }
00205 }
00206 
00207 // Tests if a partition fits inside the table structure.
00208 // Partitions must fully span a grid line in order to intersect it.
00209 // This means that a partition does not intersect a line
00210 // that it "just" touches. This is mainly because the assumption
00211 // throughout the code is that "0" distance is a very very small space.
00212 bool StructuredTable::DoesPartitionFit(const ColPartition& part) const {
00213   const TBOX& box = part.bounding_box();
00214   for (int i = 0; i < cell_x_.length(); ++i)
00215     if (box.left() < cell_x_[i] && cell_x_[i] < box.right())
00216       return false;
00217   for (int i = 0; i < cell_y_.length(); ++i)
00218     if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top())
00219       return false;
00220   return true;
00221 }
00222 
00223 // Checks if a sub-table has multiple data cells filled.
00224 int StructuredTable::CountFilledCells() {
00225   return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
00226 }
00227 int StructuredTable::CountFilledCellsInRow(int row) {
00228   return CountFilledCells(row, row, 0, column_count() - 1);
00229 }
00230 int StructuredTable::CountFilledCellsInColumn(int column) {
00231   return CountFilledCells(0, row_count() - 1, column, column);
00232 }
00233 int StructuredTable::CountFilledCells(int row_start, int row_end,
00234                             int column_start, int column_end) {
00235   ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
00236   ASSERT_HOST(0 <= column_start && column_start <= column_end &&
00237               column_end < column_count());
00238   int cell_count = 0;
00239   TBOX cell_box;
00240   for (int row = row_start; row <= row_end; ++row) {
00241     cell_box.set_bottom(cell_y_[row]);
00242     cell_box.set_top(cell_y_[row + 1]);
00243     for (int col = column_start; col <= column_end; ++col) {
00244       cell_box.set_left(cell_x_[col]);
00245       cell_box.set_right(cell_x_[col + 1]);
00246       if (CountPartitions(cell_box) > 0)
00247         ++cell_count;
00248     }
00249   }
00250   return cell_count;
00251 }
00252 
00253 // Makes sure that at least one cell in a row has substantial area filled.
00254 // This can filter out large whitespace caused by growing tables too far
00255 // and page numbers.
00256 bool StructuredTable::VerifyRowFilled(int row) {
00257   for (int i = 0; i < column_count(); ++i) {
00258     double area_filled = CalculateCellFilledPercentage(row, i);
00259     if (area_filled >= kMinFilledArea)
00260       return true;
00261   }
00262   return false;
00263 }
00264 
00265 // Finds the filled area in a cell.
00266 // Assume ColPartitions do not overlap for simplicity (even though they do).
00267 double StructuredTable::CalculateCellFilledPercentage(int row, int column) {
00268   ASSERT_HOST(0 <= row && row <= row_count());
00269   ASSERT_HOST(0 <= column && column <= column_count());
00270   const TBOX kCellBox(cell_x_[column], cell_y_[row],
00271                       cell_x_[column + 1], cell_y_[row + 1]);
00272   ASSERT_HOST(!kCellBox.null_box());
00273 
00274   ColPartitionGridSearch gsearch(text_grid_);
00275   gsearch.SetUniqueMode(true);
00276   gsearch.StartRectSearch(kCellBox);
00277   double area_covered = 0;
00278   ColPartition* text = NULL;
00279   while ((text = gsearch.NextRectSearch()) != NULL) {
00280     if (text->IsTextType())
00281       area_covered += text->bounding_box().intersection(kCellBox).area();
00282   }
00283   const inT32 current_area = kCellBox.area();
00284   if (current_area == 0) {
00285     return 1.0;
00286   }
00287   return MIN(1.0, area_covered / current_area);
00288 }
00289 
00290 void StructuredTable::Display(ScrollView* window, ScrollView::Color color) {
00291 #ifndef GRAPHICS_DISABLED
00292   window->Brush(ScrollView::NONE);
00293   window->Pen(color);
00294   window->Rectangle(bounding_box_.left(), bounding_box_.bottom(),
00295                     bounding_box_.right(), bounding_box_.top());
00296   for (int i = 0; i < cell_x_.length(); i++) {
00297     window->Line(cell_x_[i], bounding_box_.bottom(),
00298                  cell_x_[i], bounding_box_.top());
00299   }
00300   for (int i = 0; i < cell_y_.length(); i++) {
00301     window->Line(bounding_box_.left(), cell_y_[i],
00302                  bounding_box_.right(), cell_y_[i]);
00303   }
00304   window->UpdateWindow();
00305 #endif
00306 }
00307 
00308 // Clear structure information.
00309 void StructuredTable::ClearStructure() {
00310   cell_x_.clear();
00311   cell_y_.clear();
00312   is_lined_ = false;
00313   space_above_ = 0;
00314   space_below_ = 0;
00315   space_left_ = 0;
00316   space_right_ = 0;
00317   median_cell_height_ = 0;
00318   median_cell_width_ = 0;
00319 }
00320 
00321 // When a table has lines, the lines should not intersect any partitions.
00322 // The following function makes sure the previous assumption is met.
00323 bool StructuredTable::VerifyLinedTableCells() {
00324   // Function only called when lines exist.
00325   ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2);
00326   for (int i = 0; i < cell_y_.length(); ++i) {
00327     if (CountHorizontalIntersections(cell_y_[i]) > 0)
00328       return false;
00329   }
00330   for (int i = 0; i < cell_x_.length(); ++i) {
00331     if (CountVerticalIntersections(cell_x_[i]) > 0)
00332       return false;
00333   }
00334   return true;
00335 }
00336 
00337 // TODO(nbeato): Could be much better than this.
00338 // Examples:
00339 //   - Caclulate the percentage of filled cells.
00340 //   - Calculate the average number of ColPartitions per cell.
00341 //   - Calculate the number of cells per row with partitions.
00342 //   - Check if ColPartitions in adjacent cells are similar.
00343 //   - Check that all columns are at least a certain width.
00344 //   - etc.
00345 bool StructuredTable::VerifyWhitespacedTable() {
00346   // criteria for a table, must be at least 2x3 or 3x2
00347   return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
00348 }
00349 
00350 // Finds vertical splits in the ColPartitions of text_grid_ by considering
00351 // all possible "good" guesses. A good guess is just the left/right sides of
00352 // the partitions, since these locations will uniquely define where the
00353 // extremal values where the splits can occur. The split happens
00354 // in the middle of the two nearest partitions.
00355 void StructuredTable::FindWhitespacedColumns() {
00356   // Set of the extents of all partitions on the page.
00357   GenericVectorEqEq<int> left_sides;
00358   GenericVectorEqEq<int> right_sides;
00359 
00360   // Look at each text partition. We want to find the partitions
00361   // that have extremal left/right sides. These will give us a basis
00362   // for the table columns.
00363   ColPartitionGridSearch gsearch(text_grid_);
00364   gsearch.SetUniqueMode(true);
00365   gsearch.StartRectSearch(bounding_box_);
00366   ColPartition* text = NULL;
00367   while ((text = gsearch.NextRectSearch()) != NULL) {
00368     if (!text->IsTextType())
00369       continue;
00370 
00371     ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
00372     int spacing = static_cast<int>(text->median_width() *
00373                                    kHorizontalSpacing / 2.0 + 0.5);
00374     left_sides.push_back(text->bounding_box().left() - spacing);
00375     right_sides.push_back(text->bounding_box().right() + spacing);
00376   }
00377   // It causes disaster below, so avoid it!
00378   if (left_sides.length() == 0 || right_sides.length() == 0)
00379     return;
00380 
00381   // Since data may be inserted in grid order, we sort the left/right sides.
00382   left_sides.sort();
00383   right_sides.sort();
00384 
00385   // At this point, in the "merged list", we expect to have a left side,
00386   // followed by either more left sides or a right side. The last number
00387   // should be a right side. We find places where the splits occur by looking
00388   // for "valleys". If we want to force gap sizes or allow overlap, change
00389   // the spacing above. If you want to let lines "slice" partitions as long
00390   // as it is infrequent, change the following function.
00391   FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold,
00392                          &cell_x_);
00393 }
00394 
00395 // Finds horizontal splits in the ColPartitions of text_grid_ by considering
00396 // all possible "good" guesses. A good guess is just the bottom/top sides of
00397 // the partitions, since these locations will uniquely define where the
00398 // extremal values where the splits can occur. The split happens
00399 // in the middle of the two nearest partitions.
00400 void StructuredTable::FindWhitespacedRows() {
00401   // Set of the extents of all partitions on the page.
00402   GenericVectorEqEq<int> bottom_sides;
00403   GenericVectorEqEq<int> top_sides;
00404   // We will be "shrinking" partitions, so keep the min/max around to
00405   // make sure the bottom/top lines do not intersect text.
00406   int min_bottom = MAX_INT32;
00407   int max_top = MIN_INT32;
00408 
00409   // Look at each text partition. We want to find the partitions
00410   // that have extremal bottom/top sides. These will give us a basis
00411   // for the table rows. Because the textlines can be skewed and close due
00412   // to warping, the height of the partitions is toned down a little bit.
00413   ColPartitionGridSearch gsearch(text_grid_);
00414   gsearch.SetUniqueMode(true);
00415   gsearch.StartRectSearch(bounding_box_);
00416   ColPartition* text = NULL;
00417   while ((text = gsearch.NextRectSearch()) != NULL) {
00418     if (!text->IsTextType())
00419       continue;
00420 
00421     ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
00422     min_bottom = MIN(min_bottom, text->bounding_box().bottom());
00423     max_top = MAX(max_top, text->bounding_box().top());
00424 
00425     // Ignore "tall" text partitions, as these are usually false positive
00426     // vertical text or multiple lines pulled together.
00427     if (text->bounding_box().height() > max_text_height_)
00428       continue;
00429 
00430     int spacing = static_cast<int>(text->bounding_box().height() *
00431                                    kVerticalSpacing / 2.0 + 0.5);
00432     int bottom = text->bounding_box().bottom() - spacing;
00433     int top = text->bounding_box().top() + spacing;
00434     // For horizontal text, the factor can be negative. This should
00435     // probably cause a warning or failure. I haven't actually checked if
00436     // it happens.
00437     if (bottom >= top)
00438       continue;
00439 
00440     bottom_sides.push_back(bottom);
00441     top_sides.push_back(top);
00442   }
00443   // It causes disaster below, so avoid it!
00444   if (bottom_sides.length() == 0 || top_sides.length() == 0)
00445     return;
00446 
00447   // Since data may be inserted in grid order, we sort the bottom/top sides.
00448   bottom_sides.sort();
00449   top_sides.sort();
00450 
00451   // At this point, in the "merged list", we expect to have a bottom side,
00452   // followed by either more bottom sides or a top side. The last number
00453   // should be a top side. We find places where the splits occur by looking
00454   // for "valleys". If we want to force gap sizes or allow overlap, change
00455   // the spacing above. If you want to let lines "slice" partitions as long
00456   // as it is infrequent, change the following function.
00457   FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold,
00458                          &cell_y_);
00459 
00460   // Recover the min/max correctly since it was shifted.
00461   cell_y_[0] = min_bottom;
00462   cell_y_[cell_y_.length() - 1] = max_top;
00463 }
00464 
00465 void StructuredTable::CalculateMargins() {
00466   space_above_ = MAX_INT32;
00467   space_below_ = MAX_INT32;
00468   space_right_ = MAX_INT32;
00469   space_left_ = MAX_INT32;
00470   UpdateMargins(text_grid_);
00471   UpdateMargins(line_grid_);
00472 }
00473 // Finds the nearest partition in grid to the table
00474 // boundaries and updates the margin.
00475 void StructuredTable::UpdateMargins(ColPartitionGrid* grid) {
00476   int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
00477   space_below_ = MIN(space_below_, below);
00478   int above = FindVerticalMargin(grid, bounding_box_.top(), false);
00479   space_above_ = MIN(space_above_, above);
00480   int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
00481   space_left_ = MIN(space_left_, left);
00482   int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
00483   space_right_ = MIN(space_right_, right);
00484 }
00485 int StructuredTable::FindVerticalMargin(ColPartitionGrid* grid, int border,
00486                                         bool decrease) const {
00487   ColPartitionGridSearch gsearch(grid);
00488   gsearch.SetUniqueMode(true);
00489   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
00490                               border);
00491   ColPartition* part = NULL;
00492   while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) {
00493     if (!part->IsTextType() && !part->IsHorizontalLine())
00494       continue;
00495     int distance = decrease ? border - part->bounding_box().top()
00496                             : part->bounding_box().bottom() - border;
00497     if (distance >= 0)
00498       return distance;
00499   }
00500   return MAX_INT32;
00501 }
00502 int StructuredTable::FindHorizontalMargin(ColPartitionGrid* grid, int border,
00503                                           bool decrease) const {
00504   ColPartitionGridSearch gsearch(grid);
00505   gsearch.SetUniqueMode(true);
00506   gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
00507   ColPartition* part = NULL;
00508   while ((part = gsearch.NextSideSearch(decrease)) != NULL) {
00509     if (!part->IsTextType() && !part->IsVerticalLine())
00510       continue;
00511     int distance = decrease ? border - part->bounding_box().right()
00512                             : part->bounding_box().left() - border;
00513     if (distance >= 0)
00514       return distance;
00515   }
00516   return MAX_INT32;
00517 }
00518 
00519 void StructuredTable::CalculateStats() {
00520   const int kMaxCellHeight = 1000;
00521   const int kMaxCellWidth = 1000;
00522   STATS height_stats(0, kMaxCellHeight + 1);
00523   STATS width_stats(0, kMaxCellWidth + 1);
00524 
00525   for (int i = 0; i < row_count(); ++i)
00526     height_stats.add(row_height(i), column_count());
00527   for (int i = 0; i < column_count(); ++i)
00528     width_stats.add(column_width(i), row_count());
00529 
00530   median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
00531   median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
00532 }
00533 
00534 // Looks for grid lines near the current bounding box and
00535 // grows the bounding box to include them if no intersections
00536 // will occur as a result. This is necessary because the margins
00537 // are calculated relative to the closest line/text. If the
00538 // line isn't absorbed, the margin will be the distance to the line.
00539 void StructuredTable::AbsorbNearbyLines() {
00540   ColPartitionGridSearch gsearch(line_grid_);
00541   gsearch.SetUniqueMode(true);
00542 
00543   // Is the closest line above good? Loop multiple times for tables with
00544   // multi-line (sometimes 2) borders. Limit the number of lines by
00545   // making sure they stay within a table cell or so.
00546   ColPartition* line = NULL;
00547   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
00548                               bounding_box_.top());
00549   while ((line = gsearch.NextVerticalSearch(false)) != NULL) {
00550     if (!line->IsHorizontalLine())
00551       break;
00552     TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
00553                      bounding_box_.right(), line->MidY());
00554     if (text_search.height() > median_cell_height_ * 2)
00555       break;
00556     if (CountPartitions(text_search) > 0)
00557       break;
00558     bounding_box_.set_top(line->MidY());
00559   }
00560   // As above, is the closest line below good?
00561   line = NULL;
00562   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
00563                               bounding_box_.bottom());
00564   while ((line = gsearch.NextVerticalSearch(true)) != NULL) {
00565     if (!line->IsHorizontalLine())
00566       break;
00567     TBOX text_search(bounding_box_.left(), line->MidY(),
00568                      bounding_box_.right(), bounding_box_.bottom() - 1);
00569     if (text_search.height() > median_cell_height_ * 2)
00570       break;
00571     if (CountPartitions(text_search) > 0)
00572       break;
00573     bounding_box_.set_bottom(line->MidY());
00574   }
00575   // TODO(nbeato): vertical lines
00576 }
00577 
00578 
00579 // This function will find all "0 valleys" (of any length) given two
00580 // arrays. The arrays are the mins and maxes of partitions (either
00581 // left and right or bottom and top). Since the min/max lists are generated
00582 // with pairs of increasing integers, we can make some assumptions in
00583 // the function about ordering of the overall list, which are shown in the
00584 // asserts.
00585 // The algorithm works as follows:
00586 //   While there are numbers to process, take the smallest number.
00587 //     If it is from the min_list, increment the "hill" counter.
00588 //     Otherwise, decrement the "hill" counter.
00589 //     In the process of doing this, keep track of "crossing" the
00590 //     desired height.
00591 // The first/last items are extremal values of the list and known.
00592 // NOTE: This function assumes the lists are sorted!
00593 void StructuredTable::FindCellSplitLocations(const GenericVector<int>& min_list,
00594                                              const GenericVector<int>& max_list,
00595                                              int max_merged,
00596                                              GenericVector<int>* locations) {
00597   locations->clear();
00598   ASSERT_HOST(min_list.length() == max_list.length());
00599   if (min_list.length() == 0)
00600     return;
00601   ASSERT_HOST(min_list.get(0) < max_list.get(0));
00602   ASSERT_HOST(min_list.get(min_list.length() - 1) <
00603               max_list.get(max_list.length() - 1));
00604 
00605   locations->push_back(min_list.get(0));
00606   int min_index = 0;
00607   int max_index = 0;
00608   int stacked_partitions = 0;
00609   int last_cross_position = MAX_INT32;
00610   // max_index will expire after min_index.
00611   // However, we can't "increase" the hill size if min_index expired.
00612   // So finish processing when min_index expires.
00613   while (min_index < min_list.length()) {
00614     // Increase the hill count.
00615     if (min_list[min_index] < max_list[max_index]) {
00616       ++stacked_partitions;
00617       if (last_cross_position != MAX_INT32 &&
00618           stacked_partitions > max_merged) {
00619         int mid = (last_cross_position + min_list[min_index]) / 2;
00620         locations->push_back(mid);
00621         last_cross_position = MAX_INT32;
00622       }
00623       ++min_index;
00624     } else {
00625       // Decrease the hill count.
00626       --stacked_partitions;
00627       if (last_cross_position == MAX_INT32 &&
00628           stacked_partitions <= max_merged) {
00629         last_cross_position = max_list[max_index];
00630       }
00631       ++max_index;
00632     }
00633   }
00634   locations->push_back(max_list.get(max_list.length() - 1));
00635 }
00636 
00637 // Counts the number of partitions in the table
00638 // box that intersection the given x value.
00639 int StructuredTable::CountVerticalIntersections(int x) {
00640   int count = 0;
00641   // Make a small box to keep the search time down.
00642   const int kGridSize = text_grid_->gridsize();
00643   TBOX vertical_box = bounding_box_;
00644   vertical_box.set_left(x - kGridSize);
00645   vertical_box.set_right(x + kGridSize);
00646 
00647   ColPartitionGridSearch gsearch(text_grid_);
00648   gsearch.SetUniqueMode(true);
00649   gsearch.StartRectSearch(vertical_box);
00650   ColPartition* text = NULL;
00651   while ((text = gsearch.NextRectSearch()) != NULL) {
00652     if (!text->IsTextType())
00653       continue;
00654     const TBOX& box = text->bounding_box();
00655     if (box.left() < x && x < box.right())
00656       ++count;
00657   }
00658   return count;
00659 }
00660 
00661 // Counts the number of partitions in the table
00662 // box that intersection the given y value.
00663 int StructuredTable::CountHorizontalIntersections(int y) {
00664   int count = 0;
00665   // Make a small box to keep the search time down.
00666   const int kGridSize = text_grid_->gridsize();
00667   TBOX horizontal_box = bounding_box_;
00668   horizontal_box.set_bottom(y - kGridSize);
00669   horizontal_box.set_top(y + kGridSize);
00670 
00671   ColPartitionGridSearch gsearch(text_grid_);
00672   gsearch.SetUniqueMode(true);
00673   gsearch.StartRectSearch(horizontal_box);
00674   ColPartition* text = NULL;
00675   while ((text = gsearch.NextRectSearch()) != NULL) {
00676     if (!text->IsTextType())
00677       continue;
00678 
00679     const TBOX& box = text->bounding_box();
00680     if (box.bottom() < y && y < box.top())
00681       ++count;
00682   }
00683   return count;
00684 }
00685 
00686 // Counts how many text partitions are in this box.
00687 // This is used to count partitons in cells, as that can indicate
00688 // how "strong" a potential table row/column (or even full table) actually is.
00689 int StructuredTable::CountPartitions(const TBOX& box) {
00690   ColPartitionGridSearch gsearch(text_grid_);
00691   gsearch.SetUniqueMode(true);
00692   gsearch.StartRectSearch(box);
00693   int count = 0;
00694   ColPartition* text = NULL;
00695   while ((text = gsearch.NextRectSearch()) != NULL) {
00696     if (text->IsTextType())
00697       ++count;
00698   }
00699   return count;
00700 }
00701 
00705 
00706 TableRecognizer::TableRecognizer()
00707     : text_grid_(NULL),
00708       line_grid_(NULL),
00709       min_height_(0),
00710       min_width_(0),
00711       max_text_height_(MAX_INT32) {
00712 }
00713 
00714 TableRecognizer::~TableRecognizer() {
00715 }
00716 
00717 void TableRecognizer::Init() {
00718 }
00719 
00720 void TableRecognizer::set_text_grid(ColPartitionGrid* text_grid) {
00721   text_grid_ = text_grid;
00722 }
00723 void TableRecognizer::set_line_grid(ColPartitionGrid* line_grid) {
00724   line_grid_ = line_grid;
00725 }
00726 void TableRecognizer::set_min_height(int height) {
00727   min_height_ = height;
00728 }
00729 void TableRecognizer::set_min_width(int width) {
00730   min_width_ = width;
00731 }
00732 void TableRecognizer::set_max_text_height(int height) {
00733   max_text_height_ = height;
00734 }
00735 
00736 StructuredTable* TableRecognizer::RecognizeTable(const TBOX& guess) {
00737   StructuredTable* table = new StructuredTable();
00738   table->Init();
00739   table->set_text_grid(text_grid_);
00740   table->set_line_grid(line_grid_);
00741   table->set_max_text_height(max_text_height_);
00742 
00743   // Try to solve this simple case, a table with *both*
00744   // vertical and horizontal lines.
00745   if (RecognizeLinedTable(guess, table))
00746     return table;
00747 
00748   // Fallback to whitespace if that failed.
00749   // TODO(nbeato): Break this apart to take advantage of horizontal
00750   // lines or vertical lines when present.
00751   if (RecognizeWhitespacedTable(guess, table))
00752     return table;
00753 
00754   // No table found...
00755   delete table;
00756   return NULL;
00757 }
00758 
00759 bool TableRecognizer::RecognizeLinedTable(const TBOX& guess_box,
00760                                           StructuredTable* table) {
00761   if (!HasSignificantLines(guess_box))
00762     return false;
00763   TBOX line_bound = guess_box;
00764   if (!FindLinesBoundingBox(&line_bound))
00765     return false;
00766   table->set_bounding_box(line_bound);
00767   return table->FindLinedStructure();
00768 }
00769 
00770 // Quick implementation. Just count the number of lines in the box.
00771 // A better implementation would counter intersections and look for connected
00772 // components. It could even go as far as finding similar length lines.
00773 // To account for these possible issues, the VerifyLinedTableCells function
00774 // will reject lined tables that cause intersections with text on the page.
00775 // TODO(nbeato): look for "better" lines
00776 bool TableRecognizer::HasSignificantLines(const TBOX& guess) {
00777   ColPartitionGridSearch box_search(line_grid_);
00778   box_search.SetUniqueMode(true);
00779   box_search.StartRectSearch(guess);
00780   ColPartition* line = NULL;
00781   int vertical_count = 0;
00782   int horizontal_count = 0;
00783 
00784   while ((line = box_search.NextRectSearch()) != NULL) {
00785     if (line->IsHorizontalLine())
00786       ++horizontal_count;
00787     if (line->IsVerticalLine())
00788       ++vertical_count;
00789   }
00790 
00791   return vertical_count >= kLinedTableMinVerticalLines &&
00792          horizontal_count >= kLinedTableMinHorizontalLines;
00793 }
00794 
00795 // Given a bounding box with a bunch of horizontal / vertical lines,
00796 // we just find the extents of all of these lines iteratively.
00797 // The box will be at least as large as guess. This
00798 // could possibly be a bad assumption.
00799 // It is guaranteed to halt in at least O(n * gridarea) where n
00800 // is the number of lines.
00801 // The assumption is that growing the box iteratively will add lines
00802 // several times, but eventually we'll find the extents.
00803 //
00804 // For tables, the approach is a bit aggressive, a single line (which could be
00805 // noise or a column ruling) can destroy the table inside.
00806 //
00807 // TODO(nbeato): This is a quick first implementation.
00808 // A better implementation would actually look for consistency
00809 // in extents of the lines and find the extents using lines
00810 // that clearly describe the table. This would allow the
00811 // lines to "vote" for height/width. An approach like
00812 // this would solve issues with page layout rulings.
00813 // I haven't looked for these issues yet, so I can't even
00814 // say they happen confidently.
00815 bool TableRecognizer::FindLinesBoundingBox(TBOX* bounding_box) {
00816   // The first iteration will tell us if there are lines
00817   // present and shrink the box to a minimal iterative size.
00818   if (!FindLinesBoundingBoxIteration(bounding_box))
00819     return false;
00820 
00821   // Keep growing until the area of the table stabilizes.
00822   // The box can only get bigger, increasing area.
00823   bool changed = true;
00824   while (changed) {
00825     changed = false;
00826     int old_area = bounding_box->area();
00827     bool check = FindLinesBoundingBoxIteration(bounding_box);
00828     // At this point, the function will return true.
00829     ASSERT_HOST(check);
00830     ASSERT_HOST(bounding_box->area() >= old_area);
00831     changed = (bounding_box->area() > old_area);
00832   }
00833 
00834   return true;
00835 }
00836 
00837 bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX* bounding_box) {
00838   // Search for all of the lines in the current box, keeping track of extents.
00839   ColPartitionGridSearch box_search(line_grid_);
00840   box_search.SetUniqueMode(true);
00841   box_search.StartRectSearch(*bounding_box);
00842   ColPartition* line = NULL;
00843   bool first_line = true;
00844 
00845   while ((line = box_search.NextRectSearch()) != NULL) {
00846     if (line->IsLineType()) {
00847       if (first_line) {
00848         // The first iteration can shrink the box.
00849         *bounding_box = line->bounding_box();
00850         first_line = false;
00851       } else {
00852         *bounding_box += line->bounding_box();
00853       }
00854     }
00855   }
00856   return !first_line;
00857 }
00858 
00859 // The goal of this function is to move the table boundaries around and find
00860 // a table that maximizes the whitespace around the table while maximizing
00861 // the cellular structure. As a result, it gets confused by headers, footers,
00862 // and merged columns (text that crosses columns). There is a tolerance
00863 // that allows a few partitions to count towards potential cell merges.
00864 // It's the max_merged parameter to FindPartitionLocations.
00865 // It can work, but it needs some false positive remove on boundaries.
00866 // For now, the grid structure must not intersect any partitions.
00867 // Also, small tolerance is added to the horizontal lines for tightly packed
00868 // tables. The tolerance is added by adjusting the bounding boxes of the
00869 // partitions (in FindHorizontalPartitions). The current implementation
00870 // only adjusts the vertical extents of the table.
00871 //
00872 // Also note. This was hacked at a lot. It could probably use some
00873 // more hacking at to find a good set of border conditions and then a
00874 // nice clean up.
00875 bool TableRecognizer::RecognizeWhitespacedTable(const TBOX& guess_box,
00876                                                 StructuredTable* table) {
00877   TBOX best_box = guess_box;  // Best borders known.
00878   int best_below = 0;         // Margin size above best table.
00879   int best_above = 0;         // Margin size below best table.
00880   TBOX adjusted = guess_box;  // The search box.
00881 
00882   // We assume that the guess box is somewhat accurate, so we don't allow
00883   // the adjusted border to pass half of the guessed area. This prevents
00884   // "negative" tables from forming.
00885   const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
00886   // Keeps track of the most columns in an accepted table. The resulting table
00887   // may be less than the max, but we don't want to stray too far.
00888   int best_cols = 0;
00889   // Make sure we find a good border.
00890   bool found_good_border = false;
00891 
00892   // Find the bottom of the table by trying a few different locations. For
00893   // each location, the top, left, and right are fixed. We start the search
00894   // in a smaller table to favor best_cols getting a good estimate sooner.
00895   int last_bottom = MAX_INT32;
00896   int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00897                                    kMidGuessY - min_height_ / 2, true);
00898   int top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00899                                 kMidGuessY + min_height_ / 2, false);
00900   adjusted.set_top(top);
00901 
00902   // Headers/footers can be spaced far from everything.
00903   // Make sure that the space below is greater than the space above
00904   // the lowest row.
00905   int previous_below = 0;
00906   const int kMaxChances = 10;
00907   int chances = kMaxChances;
00908   while (bottom != last_bottom) {
00909     adjusted.set_bottom(bottom);
00910 
00911     if (adjusted.height() >= min_height_) {
00912       // Try to fit the grid on the current box. We give it a chance
00913       // if the number of columns didn't significantly drop.
00914       table->set_bounding_box(adjusted);
00915       if (table->FindWhitespacedStructure() &&
00916           table->column_count() >= best_cols * kRequiredColumns) {
00917         if (false && IsWeakTableRow(table, 0)) {
00918           // Currently buggy, but was looking promising so disabled.
00919           --chances;
00920         } else {
00921           // We favor 2 things,
00922           //   1- Adding rows that have partitioned data.
00923           //   2- Better margins (to find header/footer).
00924           // For better tables, we just look for multiple cells in the
00925           // bottom row with data in them.
00926           // For margins, the space below the last row should
00927           // be better than a table with the last row removed.
00928           chances = kMaxChances;
00929           double max_row_height = kMaxRowSize * table->median_cell_height();
00930           if ((table->space_below() * kMarginFactor >= best_below &&
00931                table->space_below() >= previous_below) ||
00932               (table->CountFilledCellsInRow(0) > 1 &&
00933                table->row_height(0) < max_row_height)) {
00934             best_box.set_bottom(bottom);
00935             best_below = table->space_below();
00936             best_cols = MAX(table->column_count(), best_cols);
00937             found_good_border = true;
00938           }
00939         }
00940         previous_below = table->space_below();
00941       } else {
00942        --chances;
00943       }
00944     }
00945     if (chances <= 0)
00946       break;
00947 
00948     last_bottom = bottom;
00949     bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00950                                  last_bottom, true);
00951   }
00952   if (!found_good_border)
00953     return false;
00954 
00955   // TODO(nbeato) comments: follow modified code above... put it in a function!
00956   found_good_border = false;
00957   int last_top = MIN_INT32;
00958   top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00959                             kMidGuessY + min_height_ / 2, false);
00960   int previous_above = 0;
00961   chances = kMaxChances;
00962 
00963   adjusted.set_bottom(best_box.bottom());
00964   while (last_top != top) {
00965     adjusted.set_top(top);
00966     if (adjusted.height() >= min_height_) {
00967       table->set_bounding_box(adjusted);
00968       if (table->FindWhitespacedStructure() &&
00969           table->column_count() >= best_cols * kRequiredColumns) {
00970         int last_row = table->row_count() - 1;
00971         if (false && IsWeakTableRow(table, last_row)) {
00972           // Currently buggy, but was looking promising so disabled.
00973           --chances;
00974         } else {
00975           chances = kMaxChances;
00976           double max_row_height = kMaxRowSize * table->median_cell_height();
00977           if ((table->space_above() * kMarginFactor >= best_above &&
00978                table->space_above() >= previous_above) ||
00979               (table->CountFilledCellsInRow(last_row) > 1 &&
00980                table->row_height(last_row) < max_row_height)) {
00981             best_box.set_top(top);
00982             best_above = table->space_above();
00983             best_cols = MAX(table->column_count(), best_cols);
00984             found_good_border = true;
00985           }
00986         }
00987         previous_above = table->space_above();
00988       } else {
00989        --chances;
00990       }
00991     }
00992     if (chances <= 0)
00993       break;
00994 
00995     last_top = top;
00996     top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00997                               last_top, false);
00998   }
00999 
01000   if (!found_good_border)
01001     return false;
01002 
01003   // If we get here, this shouldn't happen. It can be an assert, but
01004   // I haven't tested it enough to make it crash things.
01005   if (best_box.null_box())
01006     return false;
01007 
01008   // Given the best locations, fit the box to those locations.
01009   table->set_bounding_box(best_box);
01010   return table->FindWhitespacedStructure();
01011 }
01012 
01013 // Finds the closest value to y that can safely cause a horizontal
01014 // split in the partitions.
01015 // This function has been buggy and not as reliable as I would've
01016 // liked. I suggest finding all of the splits using the
01017 // FindPartitionLocations once and then just keeping the results
01018 // of that function cached somewhere.
01019 int TableRecognizer::NextHorizontalSplit(int left, int right, int y,
01020                                          bool top_to_bottom) {
01021   ColPartitionGridSearch gsearch(text_grid_);
01022   gsearch.SetUniqueMode(true);
01023   gsearch.StartVerticalSearch(left, right, y);
01024   ColPartition* text = NULL;
01025   int last_y = y;
01026   while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != NULL) {
01027     if (!text->IsTextType() || !text->IsHorizontalType())
01028       continue;
01029     if (text->bounding_box().height() > max_text_height_)
01030       continue;
01031 
01032     const TBOX& text_box = text->bounding_box();
01033     if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
01034       last_y = MIN(last_y, text_box.bottom());
01035       continue;
01036     }
01037     if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
01038       last_y = MAX(last_y, text_box.top());
01039       continue;
01040     }
01041 
01042     return last_y;
01043   }
01044   // If none is found, we at least want to preserve the min/max,
01045   // which defines the overlap of y with the last partition in the grid.
01046   return last_y;
01047 }
01048 
01049 // Code is buggy right now. It is disabled in the calling function.
01050 // It seems like sometimes the row that is passed in is not correct
01051 // sometimes (like a phantom row is introduced). There's something going
01052 // on in the cell_y_ data member before this is called... not certain.
01053 bool TableRecognizer::IsWeakTableRow(StructuredTable* table, int row) {
01054   if (!table->VerifyRowFilled(row))
01055     return false;
01056 
01057   double threshold = 0.0;
01058   if (table->column_count() > kGoodRowNumberOfColumnsSmallSize)
01059     threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
01060   else
01061     threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
01062 
01063   return table->CountFilledCellsInRow(row) < threshold;
01064 }
01065 
01066 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines