|
tesseract 3.04.01
|
00001 00002 // File: tablerecog.cpp 00003 // Description: Helper class to help structure table areas. Given an bounding 00004 // box from TableFinder, the TableRecognizer should give a 00005 // StructuredTable (maybe a list in the future) of "good" tables 00006 // in that area. 00007 // Author: Nicholas Beato 00008 // Created: Friday, Aug. 20, 2010 00009 // 00010 // (C) Copyright 2009, Google Inc. 00011 // Licensed under the Apache License, Version 2.0 (the "License"); 00012 // you may not use this file except in compliance with the License. 00013 // You may obtain a copy of the License at 00014 // http://www.apache.org/licenses/LICENSE-2.0 00015 // Unless required by applicable law or agreed to in writing, software 00016 // distributed under the License is distributed on an "AS IS" BASIS, 00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00018 // See the License for the specific language governing permissions and 00019 // limitations under the License. 00020 // 00022 00023 #ifdef HAVE_CONFIG_H 00024 #include "config_auto.h" 00025 #endif 00026 00027 #include "tablerecog.h" 00028 00029 namespace tesseract { 00030 00031 // The amount of space required between the ColPartitions in 2 columns 00032 // of a non-lined table as a multiple of the median width. 00033 const double kHorizontalSpacing = 0.30; 00034 // The amount of space required between the ColPartitions in 2 rows 00035 // of a non-lined table as multiples of the median height. 00036 const double kVerticalSpacing = -0.2; 00037 // The number of cells that the grid lines may intersect. 00038 // See FindCellSplitLocations for explanation. 00039 const int kCellSplitRowThreshold = 0; 00040 const int kCellSplitColumnThreshold = 0; 00041 // For "lined tables", the number of required lines. Currently a guess. 00042 const int kLinedTableMinVerticalLines = 3; 00043 const int kLinedTableMinHorizontalLines = 3; 00044 // Number of columns required, as a fraction of the most columns found. 00045 // None of these are tweaked at all. 00046 const double kRequiredColumns = 0.7; 00047 // The tolerance for comparing margins of potential tables. 00048 const double kMarginFactor = 1.1; 00049 // The first and last row should be consistent cell height. 00050 // This factor is the first and last row cell height max. 00051 const double kMaxRowSize = 2.5; 00052 // Number of filled columns required to form a strong table row. 00053 // For small tables, this is an absolute number. 00054 const double kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 }; 00055 const int kGoodRowNumberOfColumnsSmallSize = 00056 sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1; 00057 // For large tables, it is a relative number 00058 const double kGoodRowNumberOfColumnsLarge = 0.7; 00059 // The amount of area that must be covered in a cell by ColPartitions to 00060 // be considered "filled" 00061 const double kMinFilledArea = 0.35; 00062 00066 00067 StructuredTable::StructuredTable() 00068 : text_grid_(NULL), 00069 line_grid_(NULL), 00070 is_lined_(false), 00071 space_above_(0), 00072 space_below_(0), 00073 space_left_(0), 00074 space_right_(0), 00075 median_cell_height_(0), 00076 median_cell_width_(0), 00077 max_text_height_(MAX_INT32) { 00078 } 00079 00080 StructuredTable::~StructuredTable() { 00081 } 00082 00083 void StructuredTable::Init() { 00084 } 00085 00086 void StructuredTable::set_text_grid(ColPartitionGrid* text_grid) { 00087 text_grid_ = text_grid; 00088 } 00089 void StructuredTable::set_line_grid(ColPartitionGrid* line_grid) { 00090 line_grid_ = line_grid; 00091 } 00092 void StructuredTable::set_max_text_height(int height) { 00093 max_text_height_ = height; 00094 } 00095 bool StructuredTable::is_lined() const { 00096 return is_lined_; 00097 } 00098 int StructuredTable::row_count() const { 00099 return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1; 00100 } 00101 int StructuredTable::column_count() const { 00102 return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1; 00103 } 00104 int StructuredTable::cell_count() const { 00105 return row_count() * column_count(); 00106 } 00107 void StructuredTable::set_bounding_box(const TBOX& box) { 00108 bounding_box_ = box; 00109 } 00110 const TBOX& StructuredTable::bounding_box() const { 00111 return bounding_box_; 00112 } 00113 int StructuredTable::median_cell_height() { 00114 return median_cell_height_; 00115 } 00116 int StructuredTable::median_cell_width() { 00117 return median_cell_width_; 00118 } 00119 int StructuredTable::row_height(int row) const { 00120 ASSERT_HOST(0 <= row && row < row_count()); 00121 return cell_y_[row + 1] - cell_y_[row]; 00122 } 00123 int StructuredTable::column_width(int column) const { 00124 ASSERT_HOST(0 <= column && column < column_count()); 00125 return cell_x_[column + 1] - cell_x_[column]; 00126 } 00127 int StructuredTable::space_above() const { 00128 return space_above_; 00129 } 00130 int StructuredTable::space_below() const { 00131 return space_below_; 00132 } 00133 00134 // At this point, we know that the lines are contained 00135 // by the box (by FindLinesBoundingBox). 00136 // So try to find the cell structure and make sure it works out. 00137 // The assumption is that all lines span the table. If this 00138 // assumption fails, the VerifyLinedTable method will 00139 // abort the lined table. The TableRecognizer will fall 00140 // back on FindWhitespacedStructure. 00141 bool StructuredTable::FindLinedStructure() { 00142 ClearStructure(); 00143 00144 // Search for all of the lines in the current box. 00145 // Update the cellular structure with the exact lines. 00146 ColPartitionGridSearch box_search(line_grid_); 00147 box_search.SetUniqueMode(true); 00148 box_search.StartRectSearch(bounding_box_); 00149 ColPartition* line = NULL; 00150 00151 while ((line = box_search.NextRectSearch()) != NULL) { 00152 if (line->IsHorizontalLine()) 00153 cell_y_.push_back(line->MidY()); 00154 if (line->IsVerticalLine()) 00155 cell_x_.push_back(line->MidX()); 00156 } 00157 00158 // HasSignificantLines should guarantee cells. 00159 // Because that code is a different class, just gracefully 00160 // return false. This could be an assert. 00161 if (cell_x_.length() < 3 || cell_y_.length() < 3) 00162 return false; 00163 00164 cell_x_.sort(); 00165 cell_y_.sort(); 00166 00167 // Remove duplicates that may have occurred due to split lines. 00168 cell_x_.compact_sorted(); 00169 cell_y_.compact_sorted(); 00170 00171 // The border should be the extents of line boxes, not middle. 00172 cell_x_[0] = bounding_box_.left(); 00173 cell_x_[cell_x_.length() - 1] = bounding_box_.right(); 00174 cell_y_[0] = bounding_box_.bottom(); 00175 cell_y_[cell_y_.length() - 1] = bounding_box_.top(); 00176 00177 // Remove duplicates that may have occurred due to moving the borders. 00178 cell_x_.compact_sorted(); 00179 cell_y_.compact_sorted(); 00180 00181 CalculateMargins(); 00182 CalculateStats(); 00183 is_lined_ = VerifyLinedTableCells(); 00184 return is_lined_; 00185 } 00186 00187 // Finds the cellular structure given a particular box. 00188 bool StructuredTable::FindWhitespacedStructure() { 00189 ClearStructure(); 00190 FindWhitespacedColumns(); 00191 FindWhitespacedRows(); 00192 00193 if (!VerifyWhitespacedTable()) { 00194 return false; 00195 } else { 00196 bounding_box_.set_left(cell_x_[0]); 00197 bounding_box_.set_right(cell_x_[cell_x_.length() - 1]); 00198 bounding_box_.set_bottom(cell_y_[0]); 00199 bounding_box_.set_top(cell_y_[cell_y_.length() - 1]); 00200 AbsorbNearbyLines(); 00201 CalculateMargins(); 00202 CalculateStats(); 00203 return true; 00204 } 00205 } 00206 00207 // Tests if a partition fits inside the table structure. 00208 // Partitions must fully span a grid line in order to intersect it. 00209 // This means that a partition does not intersect a line 00210 // that it "just" touches. This is mainly because the assumption 00211 // throughout the code is that "0" distance is a very very small space. 00212 bool StructuredTable::DoesPartitionFit(const ColPartition& part) const { 00213 const TBOX& box = part.bounding_box(); 00214 for (int i = 0; i < cell_x_.length(); ++i) 00215 if (box.left() < cell_x_[i] && cell_x_[i] < box.right()) 00216 return false; 00217 for (int i = 0; i < cell_y_.length(); ++i) 00218 if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top()) 00219 return false; 00220 return true; 00221 } 00222 00223 // Checks if a sub-table has multiple data cells filled. 00224 int StructuredTable::CountFilledCells() { 00225 return CountFilledCells(0, row_count() - 1, 0, column_count() - 1); 00226 } 00227 int StructuredTable::CountFilledCellsInRow(int row) { 00228 return CountFilledCells(row, row, 0, column_count() - 1); 00229 } 00230 int StructuredTable::CountFilledCellsInColumn(int column) { 00231 return CountFilledCells(0, row_count() - 1, column, column); 00232 } 00233 int StructuredTable::CountFilledCells(int row_start, int row_end, 00234 int column_start, int column_end) { 00235 ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count()); 00236 ASSERT_HOST(0 <= column_start && column_start <= column_end && 00237 column_end < column_count()); 00238 int cell_count = 0; 00239 TBOX cell_box; 00240 for (int row = row_start; row <= row_end; ++row) { 00241 cell_box.set_bottom(cell_y_[row]); 00242 cell_box.set_top(cell_y_[row + 1]); 00243 for (int col = column_start; col <= column_end; ++col) { 00244 cell_box.set_left(cell_x_[col]); 00245 cell_box.set_right(cell_x_[col + 1]); 00246 if (CountPartitions(cell_box) > 0) 00247 ++cell_count; 00248 } 00249 } 00250 return cell_count; 00251 } 00252 00253 // Makes sure that at least one cell in a row has substantial area filled. 00254 // This can filter out large whitespace caused by growing tables too far 00255 // and page numbers. 00256 bool StructuredTable::VerifyRowFilled(int row) { 00257 for (int i = 0; i < column_count(); ++i) { 00258 double area_filled = CalculateCellFilledPercentage(row, i); 00259 if (area_filled >= kMinFilledArea) 00260 return true; 00261 } 00262 return false; 00263 } 00264 00265 // Finds the filled area in a cell. 00266 // Assume ColPartitions do not overlap for simplicity (even though they do). 00267 double StructuredTable::CalculateCellFilledPercentage(int row, int column) { 00268 ASSERT_HOST(0 <= row && row <= row_count()); 00269 ASSERT_HOST(0 <= column && column <= column_count()); 00270 const TBOX kCellBox(cell_x_[column], cell_y_[row], 00271 cell_x_[column + 1], cell_y_[row + 1]); 00272 ASSERT_HOST(!kCellBox.null_box()); 00273 00274 ColPartitionGridSearch gsearch(text_grid_); 00275 gsearch.SetUniqueMode(true); 00276 gsearch.StartRectSearch(kCellBox); 00277 double area_covered = 0; 00278 ColPartition* text = NULL; 00279 while ((text = gsearch.NextRectSearch()) != NULL) { 00280 if (text->IsTextType()) 00281 area_covered += text->bounding_box().intersection(kCellBox).area(); 00282 } 00283 const inT32 current_area = kCellBox.area(); 00284 if (current_area == 0) { 00285 return 1.0; 00286 } 00287 return MIN(1.0, area_covered / current_area); 00288 } 00289 00290 void StructuredTable::Display(ScrollView* window, ScrollView::Color color) { 00291 #ifndef GRAPHICS_DISABLED 00292 window->Brush(ScrollView::NONE); 00293 window->Pen(color); 00294 window->Rectangle(bounding_box_.left(), bounding_box_.bottom(), 00295 bounding_box_.right(), bounding_box_.top()); 00296 for (int i = 0; i < cell_x_.length(); i++) { 00297 window->Line(cell_x_[i], bounding_box_.bottom(), 00298 cell_x_[i], bounding_box_.top()); 00299 } 00300 for (int i = 0; i < cell_y_.length(); i++) { 00301 window->Line(bounding_box_.left(), cell_y_[i], 00302 bounding_box_.right(), cell_y_[i]); 00303 } 00304 window->UpdateWindow(); 00305 #endif 00306 } 00307 00308 // Clear structure information. 00309 void StructuredTable::ClearStructure() { 00310 cell_x_.clear(); 00311 cell_y_.clear(); 00312 is_lined_ = false; 00313 space_above_ = 0; 00314 space_below_ = 0; 00315 space_left_ = 0; 00316 space_right_ = 0; 00317 median_cell_height_ = 0; 00318 median_cell_width_ = 0; 00319 } 00320 00321 // When a table has lines, the lines should not intersect any partitions. 00322 // The following function makes sure the previous assumption is met. 00323 bool StructuredTable::VerifyLinedTableCells() { 00324 // Function only called when lines exist. 00325 ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2); 00326 for (int i = 0; i < cell_y_.length(); ++i) { 00327 if (CountHorizontalIntersections(cell_y_[i]) > 0) 00328 return false; 00329 } 00330 for (int i = 0; i < cell_x_.length(); ++i) { 00331 if (CountVerticalIntersections(cell_x_[i]) > 0) 00332 return false; 00333 } 00334 return true; 00335 } 00336 00337 // TODO(nbeato): Could be much better than this. 00338 // Examples: 00339 // - Caclulate the percentage of filled cells. 00340 // - Calculate the average number of ColPartitions per cell. 00341 // - Calculate the number of cells per row with partitions. 00342 // - Check if ColPartitions in adjacent cells are similar. 00343 // - Check that all columns are at least a certain width. 00344 // - etc. 00345 bool StructuredTable::VerifyWhitespacedTable() { 00346 // criteria for a table, must be at least 2x3 or 3x2 00347 return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6; 00348 } 00349 00350 // Finds vertical splits in the ColPartitions of text_grid_ by considering 00351 // all possible "good" guesses. A good guess is just the left/right sides of 00352 // the partitions, since these locations will uniquely define where the 00353 // extremal values where the splits can occur. The split happens 00354 // in the middle of the two nearest partitions. 00355 void StructuredTable::FindWhitespacedColumns() { 00356 // Set of the extents of all partitions on the page. 00357 GenericVectorEqEq<int> left_sides; 00358 GenericVectorEqEq<int> right_sides; 00359 00360 // Look at each text partition. We want to find the partitions 00361 // that have extremal left/right sides. These will give us a basis 00362 // for the table columns. 00363 ColPartitionGridSearch gsearch(text_grid_); 00364 gsearch.SetUniqueMode(true); 00365 gsearch.StartRectSearch(bounding_box_); 00366 ColPartition* text = NULL; 00367 while ((text = gsearch.NextRectSearch()) != NULL) { 00368 if (!text->IsTextType()) 00369 continue; 00370 00371 ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right()); 00372 int spacing = static_cast<int>(text->median_width() * 00373 kHorizontalSpacing / 2.0 + 0.5); 00374 left_sides.push_back(text->bounding_box().left() - spacing); 00375 right_sides.push_back(text->bounding_box().right() + spacing); 00376 } 00377 // It causes disaster below, so avoid it! 00378 if (left_sides.length() == 0 || right_sides.length() == 0) 00379 return; 00380 00381 // Since data may be inserted in grid order, we sort the left/right sides. 00382 left_sides.sort(); 00383 right_sides.sort(); 00384 00385 // At this point, in the "merged list", we expect to have a left side, 00386 // followed by either more left sides or a right side. The last number 00387 // should be a right side. We find places where the splits occur by looking 00388 // for "valleys". If we want to force gap sizes or allow overlap, change 00389 // the spacing above. If you want to let lines "slice" partitions as long 00390 // as it is infrequent, change the following function. 00391 FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, 00392 &cell_x_); 00393 } 00394 00395 // Finds horizontal splits in the ColPartitions of text_grid_ by considering 00396 // all possible "good" guesses. A good guess is just the bottom/top sides of 00397 // the partitions, since these locations will uniquely define where the 00398 // extremal values where the splits can occur. The split happens 00399 // in the middle of the two nearest partitions. 00400 void StructuredTable::FindWhitespacedRows() { 00401 // Set of the extents of all partitions on the page. 00402 GenericVectorEqEq<int> bottom_sides; 00403 GenericVectorEqEq<int> top_sides; 00404 // We will be "shrinking" partitions, so keep the min/max around to 00405 // make sure the bottom/top lines do not intersect text. 00406 int min_bottom = MAX_INT32; 00407 int max_top = MIN_INT32; 00408 00409 // Look at each text partition. We want to find the partitions 00410 // that have extremal bottom/top sides. These will give us a basis 00411 // for the table rows. Because the textlines can be skewed and close due 00412 // to warping, the height of the partitions is toned down a little bit. 00413 ColPartitionGridSearch gsearch(text_grid_); 00414 gsearch.SetUniqueMode(true); 00415 gsearch.StartRectSearch(bounding_box_); 00416 ColPartition* text = NULL; 00417 while ((text = gsearch.NextRectSearch()) != NULL) { 00418 if (!text->IsTextType()) 00419 continue; 00420 00421 ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top()); 00422 min_bottom = MIN(min_bottom, text->bounding_box().bottom()); 00423 max_top = MAX(max_top, text->bounding_box().top()); 00424 00425 // Ignore "tall" text partitions, as these are usually false positive 00426 // vertical text or multiple lines pulled together. 00427 if (text->bounding_box().height() > max_text_height_) 00428 continue; 00429 00430 int spacing = static_cast<int>(text->bounding_box().height() * 00431 kVerticalSpacing / 2.0 + 0.5); 00432 int bottom = text->bounding_box().bottom() - spacing; 00433 int top = text->bounding_box().top() + spacing; 00434 // For horizontal text, the factor can be negative. This should 00435 // probably cause a warning or failure. I haven't actually checked if 00436 // it happens. 00437 if (bottom >= top) 00438 continue; 00439 00440 bottom_sides.push_back(bottom); 00441 top_sides.push_back(top); 00442 } 00443 // It causes disaster below, so avoid it! 00444 if (bottom_sides.length() == 0 || top_sides.length() == 0) 00445 return; 00446 00447 // Since data may be inserted in grid order, we sort the bottom/top sides. 00448 bottom_sides.sort(); 00449 top_sides.sort(); 00450 00451 // At this point, in the "merged list", we expect to have a bottom side, 00452 // followed by either more bottom sides or a top side. The last number 00453 // should be a top side. We find places where the splits occur by looking 00454 // for "valleys". If we want to force gap sizes or allow overlap, change 00455 // the spacing above. If you want to let lines "slice" partitions as long 00456 // as it is infrequent, change the following function. 00457 FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, 00458 &cell_y_); 00459 00460 // Recover the min/max correctly since it was shifted. 00461 cell_y_[0] = min_bottom; 00462 cell_y_[cell_y_.length() - 1] = max_top; 00463 } 00464 00465 void StructuredTable::CalculateMargins() { 00466 space_above_ = MAX_INT32; 00467 space_below_ = MAX_INT32; 00468 space_right_ = MAX_INT32; 00469 space_left_ = MAX_INT32; 00470 UpdateMargins(text_grid_); 00471 UpdateMargins(line_grid_); 00472 } 00473 // Finds the nearest partition in grid to the table 00474 // boundaries and updates the margin. 00475 void StructuredTable::UpdateMargins(ColPartitionGrid* grid) { 00476 int below = FindVerticalMargin(grid, bounding_box_.bottom(), true); 00477 space_below_ = MIN(space_below_, below); 00478 int above = FindVerticalMargin(grid, bounding_box_.top(), false); 00479 space_above_ = MIN(space_above_, above); 00480 int left = FindHorizontalMargin(grid, bounding_box_.left(), true); 00481 space_left_ = MIN(space_left_, left); 00482 int right = FindHorizontalMargin(grid, bounding_box_.right(), false); 00483 space_right_ = MIN(space_right_, right); 00484 } 00485 int StructuredTable::FindVerticalMargin(ColPartitionGrid* grid, int border, 00486 bool decrease) const { 00487 ColPartitionGridSearch gsearch(grid); 00488 gsearch.SetUniqueMode(true); 00489 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), 00490 border); 00491 ColPartition* part = NULL; 00492 while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) { 00493 if (!part->IsTextType() && !part->IsHorizontalLine()) 00494 continue; 00495 int distance = decrease ? border - part->bounding_box().top() 00496 : part->bounding_box().bottom() - border; 00497 if (distance >= 0) 00498 return distance; 00499 } 00500 return MAX_INT32; 00501 } 00502 int StructuredTable::FindHorizontalMargin(ColPartitionGrid* grid, int border, 00503 bool decrease) const { 00504 ColPartitionGridSearch gsearch(grid); 00505 gsearch.SetUniqueMode(true); 00506 gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top()); 00507 ColPartition* part = NULL; 00508 while ((part = gsearch.NextSideSearch(decrease)) != NULL) { 00509 if (!part->IsTextType() && !part->IsVerticalLine()) 00510 continue; 00511 int distance = decrease ? border - part->bounding_box().right() 00512 : part->bounding_box().left() - border; 00513 if (distance >= 0) 00514 return distance; 00515 } 00516 return MAX_INT32; 00517 } 00518 00519 void StructuredTable::CalculateStats() { 00520 const int kMaxCellHeight = 1000; 00521 const int kMaxCellWidth = 1000; 00522 STATS height_stats(0, kMaxCellHeight + 1); 00523 STATS width_stats(0, kMaxCellWidth + 1); 00524 00525 for (int i = 0; i < row_count(); ++i) 00526 height_stats.add(row_height(i), column_count()); 00527 for (int i = 0; i < column_count(); ++i) 00528 width_stats.add(column_width(i), row_count()); 00529 00530 median_cell_height_ = static_cast<int>(height_stats.median() + 0.5); 00531 median_cell_width_ = static_cast<int>(width_stats.median() + 0.5); 00532 } 00533 00534 // Looks for grid lines near the current bounding box and 00535 // grows the bounding box to include them if no intersections 00536 // will occur as a result. This is necessary because the margins 00537 // are calculated relative to the closest line/text. If the 00538 // line isn't absorbed, the margin will be the distance to the line. 00539 void StructuredTable::AbsorbNearbyLines() { 00540 ColPartitionGridSearch gsearch(line_grid_); 00541 gsearch.SetUniqueMode(true); 00542 00543 // Is the closest line above good? Loop multiple times for tables with 00544 // multi-line (sometimes 2) borders. Limit the number of lines by 00545 // making sure they stay within a table cell or so. 00546 ColPartition* line = NULL; 00547 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), 00548 bounding_box_.top()); 00549 while ((line = gsearch.NextVerticalSearch(false)) != NULL) { 00550 if (!line->IsHorizontalLine()) 00551 break; 00552 TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1, 00553 bounding_box_.right(), line->MidY()); 00554 if (text_search.height() > median_cell_height_ * 2) 00555 break; 00556 if (CountPartitions(text_search) > 0) 00557 break; 00558 bounding_box_.set_top(line->MidY()); 00559 } 00560 // As above, is the closest line below good? 00561 line = NULL; 00562 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), 00563 bounding_box_.bottom()); 00564 while ((line = gsearch.NextVerticalSearch(true)) != NULL) { 00565 if (!line->IsHorizontalLine()) 00566 break; 00567 TBOX text_search(bounding_box_.left(), line->MidY(), 00568 bounding_box_.right(), bounding_box_.bottom() - 1); 00569 if (text_search.height() > median_cell_height_ * 2) 00570 break; 00571 if (CountPartitions(text_search) > 0) 00572 break; 00573 bounding_box_.set_bottom(line->MidY()); 00574 } 00575 // TODO(nbeato): vertical lines 00576 } 00577 00578 00579 // This function will find all "0 valleys" (of any length) given two 00580 // arrays. The arrays are the mins and maxes of partitions (either 00581 // left and right or bottom and top). Since the min/max lists are generated 00582 // with pairs of increasing integers, we can make some assumptions in 00583 // the function about ordering of the overall list, which are shown in the 00584 // asserts. 00585 // The algorithm works as follows: 00586 // While there are numbers to process, take the smallest number. 00587 // If it is from the min_list, increment the "hill" counter. 00588 // Otherwise, decrement the "hill" counter. 00589 // In the process of doing this, keep track of "crossing" the 00590 // desired height. 00591 // The first/last items are extremal values of the list and known. 00592 // NOTE: This function assumes the lists are sorted! 00593 void StructuredTable::FindCellSplitLocations(const GenericVector<int>& min_list, 00594 const GenericVector<int>& max_list, 00595 int max_merged, 00596 GenericVector<int>* locations) { 00597 locations->clear(); 00598 ASSERT_HOST(min_list.length() == max_list.length()); 00599 if (min_list.length() == 0) 00600 return; 00601 ASSERT_HOST(min_list.get(0) < max_list.get(0)); 00602 ASSERT_HOST(min_list.get(min_list.length() - 1) < 00603 max_list.get(max_list.length() - 1)); 00604 00605 locations->push_back(min_list.get(0)); 00606 int min_index = 0; 00607 int max_index = 0; 00608 int stacked_partitions = 0; 00609 int last_cross_position = MAX_INT32; 00610 // max_index will expire after min_index. 00611 // However, we can't "increase" the hill size if min_index expired. 00612 // So finish processing when min_index expires. 00613 while (min_index < min_list.length()) { 00614 // Increase the hill count. 00615 if (min_list[min_index] < max_list[max_index]) { 00616 ++stacked_partitions; 00617 if (last_cross_position != MAX_INT32 && 00618 stacked_partitions > max_merged) { 00619 int mid = (last_cross_position + min_list[min_index]) / 2; 00620 locations->push_back(mid); 00621 last_cross_position = MAX_INT32; 00622 } 00623 ++min_index; 00624 } else { 00625 // Decrease the hill count. 00626 --stacked_partitions; 00627 if (last_cross_position == MAX_INT32 && 00628 stacked_partitions <= max_merged) { 00629 last_cross_position = max_list[max_index]; 00630 } 00631 ++max_index; 00632 } 00633 } 00634 locations->push_back(max_list.get(max_list.length() - 1)); 00635 } 00636 00637 // Counts the number of partitions in the table 00638 // box that intersection the given x value. 00639 int StructuredTable::CountVerticalIntersections(int x) { 00640 int count = 0; 00641 // Make a small box to keep the search time down. 00642 const int kGridSize = text_grid_->gridsize(); 00643 TBOX vertical_box = bounding_box_; 00644 vertical_box.set_left(x - kGridSize); 00645 vertical_box.set_right(x + kGridSize); 00646 00647 ColPartitionGridSearch gsearch(text_grid_); 00648 gsearch.SetUniqueMode(true); 00649 gsearch.StartRectSearch(vertical_box); 00650 ColPartition* text = NULL; 00651 while ((text = gsearch.NextRectSearch()) != NULL) { 00652 if (!text->IsTextType()) 00653 continue; 00654 const TBOX& box = text->bounding_box(); 00655 if (box.left() < x && x < box.right()) 00656 ++count; 00657 } 00658 return count; 00659 } 00660 00661 // Counts the number of partitions in the table 00662 // box that intersection the given y value. 00663 int StructuredTable::CountHorizontalIntersections(int y) { 00664 int count = 0; 00665 // Make a small box to keep the search time down. 00666 const int kGridSize = text_grid_->gridsize(); 00667 TBOX horizontal_box = bounding_box_; 00668 horizontal_box.set_bottom(y - kGridSize); 00669 horizontal_box.set_top(y + kGridSize); 00670 00671 ColPartitionGridSearch gsearch(text_grid_); 00672 gsearch.SetUniqueMode(true); 00673 gsearch.StartRectSearch(horizontal_box); 00674 ColPartition* text = NULL; 00675 while ((text = gsearch.NextRectSearch()) != NULL) { 00676 if (!text->IsTextType()) 00677 continue; 00678 00679 const TBOX& box = text->bounding_box(); 00680 if (box.bottom() < y && y < box.top()) 00681 ++count; 00682 } 00683 return count; 00684 } 00685 00686 // Counts how many text partitions are in this box. 00687 // This is used to count partitons in cells, as that can indicate 00688 // how "strong" a potential table row/column (or even full table) actually is. 00689 int StructuredTable::CountPartitions(const TBOX& box) { 00690 ColPartitionGridSearch gsearch(text_grid_); 00691 gsearch.SetUniqueMode(true); 00692 gsearch.StartRectSearch(box); 00693 int count = 0; 00694 ColPartition* text = NULL; 00695 while ((text = gsearch.NextRectSearch()) != NULL) { 00696 if (text->IsTextType()) 00697 ++count; 00698 } 00699 return count; 00700 } 00701 00705 00706 TableRecognizer::TableRecognizer() 00707 : text_grid_(NULL), 00708 line_grid_(NULL), 00709 min_height_(0), 00710 min_width_(0), 00711 max_text_height_(MAX_INT32) { 00712 } 00713 00714 TableRecognizer::~TableRecognizer() { 00715 } 00716 00717 void TableRecognizer::Init() { 00718 } 00719 00720 void TableRecognizer::set_text_grid(ColPartitionGrid* text_grid) { 00721 text_grid_ = text_grid; 00722 } 00723 void TableRecognizer::set_line_grid(ColPartitionGrid* line_grid) { 00724 line_grid_ = line_grid; 00725 } 00726 void TableRecognizer::set_min_height(int height) { 00727 min_height_ = height; 00728 } 00729 void TableRecognizer::set_min_width(int width) { 00730 min_width_ = width; 00731 } 00732 void TableRecognizer::set_max_text_height(int height) { 00733 max_text_height_ = height; 00734 } 00735 00736 StructuredTable* TableRecognizer::RecognizeTable(const TBOX& guess) { 00737 StructuredTable* table = new StructuredTable(); 00738 table->Init(); 00739 table->set_text_grid(text_grid_); 00740 table->set_line_grid(line_grid_); 00741 table->set_max_text_height(max_text_height_); 00742 00743 // Try to solve this simple case, a table with *both* 00744 // vertical and horizontal lines. 00745 if (RecognizeLinedTable(guess, table)) 00746 return table; 00747 00748 // Fallback to whitespace if that failed. 00749 // TODO(nbeato): Break this apart to take advantage of horizontal 00750 // lines or vertical lines when present. 00751 if (RecognizeWhitespacedTable(guess, table)) 00752 return table; 00753 00754 // No table found... 00755 delete table; 00756 return NULL; 00757 } 00758 00759 bool TableRecognizer::RecognizeLinedTable(const TBOX& guess_box, 00760 StructuredTable* table) { 00761 if (!HasSignificantLines(guess_box)) 00762 return false; 00763 TBOX line_bound = guess_box; 00764 if (!FindLinesBoundingBox(&line_bound)) 00765 return false; 00766 table->set_bounding_box(line_bound); 00767 return table->FindLinedStructure(); 00768 } 00769 00770 // Quick implementation. Just count the number of lines in the box. 00771 // A better implementation would counter intersections and look for connected 00772 // components. It could even go as far as finding similar length lines. 00773 // To account for these possible issues, the VerifyLinedTableCells function 00774 // will reject lined tables that cause intersections with text on the page. 00775 // TODO(nbeato): look for "better" lines 00776 bool TableRecognizer::HasSignificantLines(const TBOX& guess) { 00777 ColPartitionGridSearch box_search(line_grid_); 00778 box_search.SetUniqueMode(true); 00779 box_search.StartRectSearch(guess); 00780 ColPartition* line = NULL; 00781 int vertical_count = 0; 00782 int horizontal_count = 0; 00783 00784 while ((line = box_search.NextRectSearch()) != NULL) { 00785 if (line->IsHorizontalLine()) 00786 ++horizontal_count; 00787 if (line->IsVerticalLine()) 00788 ++vertical_count; 00789 } 00790 00791 return vertical_count >= kLinedTableMinVerticalLines && 00792 horizontal_count >= kLinedTableMinHorizontalLines; 00793 } 00794 00795 // Given a bounding box with a bunch of horizontal / vertical lines, 00796 // we just find the extents of all of these lines iteratively. 00797 // The box will be at least as large as guess. This 00798 // could possibly be a bad assumption. 00799 // It is guaranteed to halt in at least O(n * gridarea) where n 00800 // is the number of lines. 00801 // The assumption is that growing the box iteratively will add lines 00802 // several times, but eventually we'll find the extents. 00803 // 00804 // For tables, the approach is a bit aggressive, a single line (which could be 00805 // noise or a column ruling) can destroy the table inside. 00806 // 00807 // TODO(nbeato): This is a quick first implementation. 00808 // A better implementation would actually look for consistency 00809 // in extents of the lines and find the extents using lines 00810 // that clearly describe the table. This would allow the 00811 // lines to "vote" for height/width. An approach like 00812 // this would solve issues with page layout rulings. 00813 // I haven't looked for these issues yet, so I can't even 00814 // say they happen confidently. 00815 bool TableRecognizer::FindLinesBoundingBox(TBOX* bounding_box) { 00816 // The first iteration will tell us if there are lines 00817 // present and shrink the box to a minimal iterative size. 00818 if (!FindLinesBoundingBoxIteration(bounding_box)) 00819 return false; 00820 00821 // Keep growing until the area of the table stabilizes. 00822 // The box can only get bigger, increasing area. 00823 bool changed = true; 00824 while (changed) { 00825 changed = false; 00826 int old_area = bounding_box->area(); 00827 bool check = FindLinesBoundingBoxIteration(bounding_box); 00828 // At this point, the function will return true. 00829 ASSERT_HOST(check); 00830 ASSERT_HOST(bounding_box->area() >= old_area); 00831 changed = (bounding_box->area() > old_area); 00832 } 00833 00834 return true; 00835 } 00836 00837 bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX* bounding_box) { 00838 // Search for all of the lines in the current box, keeping track of extents. 00839 ColPartitionGridSearch box_search(line_grid_); 00840 box_search.SetUniqueMode(true); 00841 box_search.StartRectSearch(*bounding_box); 00842 ColPartition* line = NULL; 00843 bool first_line = true; 00844 00845 while ((line = box_search.NextRectSearch()) != NULL) { 00846 if (line->IsLineType()) { 00847 if (first_line) { 00848 // The first iteration can shrink the box. 00849 *bounding_box = line->bounding_box(); 00850 first_line = false; 00851 } else { 00852 *bounding_box += line->bounding_box(); 00853 } 00854 } 00855 } 00856 return !first_line; 00857 } 00858 00859 // The goal of this function is to move the table boundaries around and find 00860 // a table that maximizes the whitespace around the table while maximizing 00861 // the cellular structure. As a result, it gets confused by headers, footers, 00862 // and merged columns (text that crosses columns). There is a tolerance 00863 // that allows a few partitions to count towards potential cell merges. 00864 // It's the max_merged parameter to FindPartitionLocations. 00865 // It can work, but it needs some false positive remove on boundaries. 00866 // For now, the grid structure must not intersect any partitions. 00867 // Also, small tolerance is added to the horizontal lines for tightly packed 00868 // tables. The tolerance is added by adjusting the bounding boxes of the 00869 // partitions (in FindHorizontalPartitions). The current implementation 00870 // only adjusts the vertical extents of the table. 00871 // 00872 // Also note. This was hacked at a lot. It could probably use some 00873 // more hacking at to find a good set of border conditions and then a 00874 // nice clean up. 00875 bool TableRecognizer::RecognizeWhitespacedTable(const TBOX& guess_box, 00876 StructuredTable* table) { 00877 TBOX best_box = guess_box; // Best borders known. 00878 int best_below = 0; // Margin size above best table. 00879 int best_above = 0; // Margin size below best table. 00880 TBOX adjusted = guess_box; // The search box. 00881 00882 // We assume that the guess box is somewhat accurate, so we don't allow 00883 // the adjusted border to pass half of the guessed area. This prevents 00884 // "negative" tables from forming. 00885 const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2; 00886 // Keeps track of the most columns in an accepted table. The resulting table 00887 // may be less than the max, but we don't want to stray too far. 00888 int best_cols = 0; 00889 // Make sure we find a good border. 00890 bool found_good_border = false; 00891 00892 // Find the bottom of the table by trying a few different locations. For 00893 // each location, the top, left, and right are fixed. We start the search 00894 // in a smaller table to favor best_cols getting a good estimate sooner. 00895 int last_bottom = MAX_INT32; 00896 int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00897 kMidGuessY - min_height_ / 2, true); 00898 int top = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00899 kMidGuessY + min_height_ / 2, false); 00900 adjusted.set_top(top); 00901 00902 // Headers/footers can be spaced far from everything. 00903 // Make sure that the space below is greater than the space above 00904 // the lowest row. 00905 int previous_below = 0; 00906 const int kMaxChances = 10; 00907 int chances = kMaxChances; 00908 while (bottom != last_bottom) { 00909 adjusted.set_bottom(bottom); 00910 00911 if (adjusted.height() >= min_height_) { 00912 // Try to fit the grid on the current box. We give it a chance 00913 // if the number of columns didn't significantly drop. 00914 table->set_bounding_box(adjusted); 00915 if (table->FindWhitespacedStructure() && 00916 table->column_count() >= best_cols * kRequiredColumns) { 00917 if (false && IsWeakTableRow(table, 0)) { 00918 // Currently buggy, but was looking promising so disabled. 00919 --chances; 00920 } else { 00921 // We favor 2 things, 00922 // 1- Adding rows that have partitioned data. 00923 // 2- Better margins (to find header/footer). 00924 // For better tables, we just look for multiple cells in the 00925 // bottom row with data in them. 00926 // For margins, the space below the last row should 00927 // be better than a table with the last row removed. 00928 chances = kMaxChances; 00929 double max_row_height = kMaxRowSize * table->median_cell_height(); 00930 if ((table->space_below() * kMarginFactor >= best_below && 00931 table->space_below() >= previous_below) || 00932 (table->CountFilledCellsInRow(0) > 1 && 00933 table->row_height(0) < max_row_height)) { 00934 best_box.set_bottom(bottom); 00935 best_below = table->space_below(); 00936 best_cols = MAX(table->column_count(), best_cols); 00937 found_good_border = true; 00938 } 00939 } 00940 previous_below = table->space_below(); 00941 } else { 00942 --chances; 00943 } 00944 } 00945 if (chances <= 0) 00946 break; 00947 00948 last_bottom = bottom; 00949 bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00950 last_bottom, true); 00951 } 00952 if (!found_good_border) 00953 return false; 00954 00955 // TODO(nbeato) comments: follow modified code above... put it in a function! 00956 found_good_border = false; 00957 int last_top = MIN_INT32; 00958 top = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00959 kMidGuessY + min_height_ / 2, false); 00960 int previous_above = 0; 00961 chances = kMaxChances; 00962 00963 adjusted.set_bottom(best_box.bottom()); 00964 while (last_top != top) { 00965 adjusted.set_top(top); 00966 if (adjusted.height() >= min_height_) { 00967 table->set_bounding_box(adjusted); 00968 if (table->FindWhitespacedStructure() && 00969 table->column_count() >= best_cols * kRequiredColumns) { 00970 int last_row = table->row_count() - 1; 00971 if (false && IsWeakTableRow(table, last_row)) { 00972 // Currently buggy, but was looking promising so disabled. 00973 --chances; 00974 } else { 00975 chances = kMaxChances; 00976 double max_row_height = kMaxRowSize * table->median_cell_height(); 00977 if ((table->space_above() * kMarginFactor >= best_above && 00978 table->space_above() >= previous_above) || 00979 (table->CountFilledCellsInRow(last_row) > 1 && 00980 table->row_height(last_row) < max_row_height)) { 00981 best_box.set_top(top); 00982 best_above = table->space_above(); 00983 best_cols = MAX(table->column_count(), best_cols); 00984 found_good_border = true; 00985 } 00986 } 00987 previous_above = table->space_above(); 00988 } else { 00989 --chances; 00990 } 00991 } 00992 if (chances <= 0) 00993 break; 00994 00995 last_top = top; 00996 top = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00997 last_top, false); 00998 } 00999 01000 if (!found_good_border) 01001 return false; 01002 01003 // If we get here, this shouldn't happen. It can be an assert, but 01004 // I haven't tested it enough to make it crash things. 01005 if (best_box.null_box()) 01006 return false; 01007 01008 // Given the best locations, fit the box to those locations. 01009 table->set_bounding_box(best_box); 01010 return table->FindWhitespacedStructure(); 01011 } 01012 01013 // Finds the closest value to y that can safely cause a horizontal 01014 // split in the partitions. 01015 // This function has been buggy and not as reliable as I would've 01016 // liked. I suggest finding all of the splits using the 01017 // FindPartitionLocations once and then just keeping the results 01018 // of that function cached somewhere. 01019 int TableRecognizer::NextHorizontalSplit(int left, int right, int y, 01020 bool top_to_bottom) { 01021 ColPartitionGridSearch gsearch(text_grid_); 01022 gsearch.SetUniqueMode(true); 01023 gsearch.StartVerticalSearch(left, right, y); 01024 ColPartition* text = NULL; 01025 int last_y = y; 01026 while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != NULL) { 01027 if (!text->IsTextType() || !text->IsHorizontalType()) 01028 continue; 01029 if (text->bounding_box().height() > max_text_height_) 01030 continue; 01031 01032 const TBOX& text_box = text->bounding_box(); 01033 if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) { 01034 last_y = MIN(last_y, text_box.bottom()); 01035 continue; 01036 } 01037 if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) { 01038 last_y = MAX(last_y, text_box.top()); 01039 continue; 01040 } 01041 01042 return last_y; 01043 } 01044 // If none is found, we at least want to preserve the min/max, 01045 // which defines the overlap of y with the last partition in the grid. 01046 return last_y; 01047 } 01048 01049 // Code is buggy right now. It is disabled in the calling function. 01050 // It seems like sometimes the row that is passed in is not correct 01051 // sometimes (like a phantom row is introduced). There's something going 01052 // on in the cell_y_ data member before this is called... not certain. 01053 bool TableRecognizer::IsWeakTableRow(StructuredTable* table, int row) { 01054 if (!table->VerifyRowFilled(row)) 01055 return false; 01056 01057 double threshold = 0.0; 01058 if (table->column_count() > kGoodRowNumberOfColumnsSmallSize) 01059 threshold = table->column_count() * kGoodRowNumberOfColumnsLarge; 01060 else 01061 threshold = kGoodRowNumberOfColumnsSmall[table->column_count()]; 01062 01063 return table->CountFilledCellsInRow(row) < threshold; 01064 } 01065 01066 } // namespace tesseract