tesseract 3.04.01

textord/tordmain.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        tordmain.cpp  (Formerly textordp.c)
00003  * Description: C++ top level textord code.
00004  * Author:                  Ray Smith
00005  * Created:                 Tue Jul 28 17:12:33 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef HAVE_CONFIG_H
00021 #include "config_auto.h"
00022 #endif
00023 
00024 #ifdef __UNIX__
00025 #include <assert.h>
00026 #endif
00027 #include "stderr.h"
00028 #include "globaloc.h"
00029 #include "blread.h"
00030 #include "blobbox.h"
00031 #include "ccstruct.h"
00032 #include "edgblob.h"
00033 #include "drawtord.h"
00034 #include "makerow.h"
00035 #include "wordseg.h"
00036 #include "textord.h"
00037 #include "tordmain.h"
00038 
00039 #include "allheaders.h"
00040 
// Grid size for the word grid used when reassigning diacritics to words.
// The exact value is not critical.
const int kWordGridSize = 50;

// Redefine EXTERN as empty for the rest of this translation unit.
#undef EXTERN
#define EXTERN

// Upper bound of the blob-height histogram (STATS range) built in
// Textord::filter_noise_blobs.
#define MAX_NEAREST_DIST  600
00048 
00049 namespace tesseract {
00050 
// NOTE(review): presumably expands to the list support code for WordWithBox;
// confirm against the definition of the CLISTIZE macro.
CLISTIZE(WordWithBox)
00052 
00053 /**********************************************************************
00054  * SetBlobStrokeWidth
00055  *
00056  * Set the horizontal and vertical stroke widths in the blob.
00057  **********************************************************************/
00058 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
00059   // Cut the blob rectangle into a Pix.
00060   int pix_height = pixGetHeight(pix);
00061   const TBOX& box = blob->bounding_box();
00062   int width = box.width();
00063   int height = box.height();
00064   Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
00065                                 width, height);
00066   Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL);
00067   boxDestroy(&blob_pix_box);
00068   Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
00069   pixDestroy(&pix_blob);
00070   // Compute the stroke widths.
00071   uinT32* data = pixGetData(dist_pix);
00072   int wpl = pixGetWpl(dist_pix);
00073   // Horizontal width of stroke.
00074   STATS h_stats(0, width + 1);
00075   for (int y = 0; y < height; ++y) {
00076     uinT32* pixels = data + y*wpl;
00077     int prev_pixel = 0;
00078     int pixel = GET_DATA_BYTE(pixels, 0);
00079     for (int x = 1; x < width; ++x) {
00080       int next_pixel = GET_DATA_BYTE(pixels, x);
00081       // We are looking for a pixel that is equal to its vertical neighbours,
00082       // yet greater than its left neighbour.
00083       if (prev_pixel < pixel &&
00084           (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
00085           (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
00086         if (pixel > next_pixel) {
00087           // Single local max, so an odd width.
00088           h_stats.add(pixel * 2 - 1, 1);
00089         } else if (pixel == next_pixel && x + 1 < width &&
00090                  pixel > GET_DATA_BYTE(pixels, x + 1)) {
00091           // Double local max, so an even width.
00092           h_stats.add(pixel * 2, 1);
00093         }
00094       }
00095       prev_pixel = pixel;
00096       pixel = next_pixel;
00097     }
00098   }
00099   // Vertical width of stroke.
00100   STATS v_stats(0, height + 1);
00101   for (int x = 0; x < width; ++x) {
00102     int prev_pixel = 0;
00103     int pixel = GET_DATA_BYTE(data, x);
00104     for (int y = 1; y < height; ++y) {
00105       uinT32* pixels = data + y*wpl;
00106       int next_pixel = GET_DATA_BYTE(pixels, x);
00107       // We are looking for a pixel that is equal to its horizontal neighbours,
00108       // yet greater than its upper neighbour.
00109       if (prev_pixel < pixel &&
00110           (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
00111           (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
00112         if (pixel > next_pixel) {
00113           // Single local max, so an odd width.
00114           v_stats.add(pixel * 2 - 1, 1);
00115         } else if (pixel == next_pixel && y + 1 < height &&
00116                  pixel > GET_DATA_BYTE(pixels + wpl, x)) {
00117           // Double local max, so an even width.
00118           v_stats.add(pixel * 2, 1);
00119         }
00120       }
00121       prev_pixel = pixel;
00122       pixel = next_pixel;
00123     }
00124   }
00125   pixDestroy(&dist_pix);
00126   // Store the horizontal and vertical width in the blob, keeping both
00127   // widths if there is enough information, otherwse only the one with
00128   // the most samples.
00129   // If there are insufficent samples, store zero, rather than using
00130   // 2*area/perimeter, as the numbers that gives do not match the numbers
00131   // from the distance method.
00132   if (h_stats.get_total() >= (width + height) / 4) {
00133     blob->set_horz_stroke_width(h_stats.ile(0.5f));
00134     if (v_stats.get_total() >= (width + height) / 4)
00135       blob->set_vert_stroke_width(v_stats.ile(0.5f));
00136     else
00137       blob->set_vert_stroke_width(0.0f);
00138   } else {
00139     if (v_stats.get_total() >= (width + height) / 4 ||
00140         v_stats.get_total() > h_stats.get_total()) {
00141       blob->set_horz_stroke_width(0.0f);
00142       blob->set_vert_stroke_width(v_stats.ile(0.5f));
00143     } else {
00144       blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
00145                                                           : 0.0f);
00146       blob->set_vert_stroke_width(0.0f);
00147     }
00148   }
00149 }
00150 
00151 /**********************************************************************
00152  * assign_blobs_to_blocks2
00153  *
00154  * Make a list of TO_BLOCKs for portrait and landscape orientation.
00155  **********************************************************************/
00156 
00157 void assign_blobs_to_blocks2(Pix* pix,
00158                              BLOCK_LIST *blocks,          // blocks to process
00159                              TO_BLOCK_LIST *port_blocks) {  // output list
00160   BLOCK *block;                  // current block
00161   BLOBNBOX *newblob;             // created blob
00162   C_BLOB *blob;                  // current blob
00163   BLOCK_IT block_it = blocks;
00164   C_BLOB_IT blob_it;             // iterator
00165   BLOBNBOX_IT port_box_it;       // iterator
00166                                  // destination iterator
00167   TO_BLOCK_IT port_block_it = port_blocks;
00168   TO_BLOCK *port_block;          // created block
00169 
00170   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
00171     block = block_it.data();
00172     port_block = new TO_BLOCK(block);
00173 
00174     // Convert the good outlines to block->blob_list
00175     port_box_it.set_to_list(&port_block->blobs);
00176     blob_it.set_to_list(block->blob_list());
00177     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00178       blob = blob_it.extract();
00179       newblob = new BLOBNBOX(blob);  // Convert blob to BLOBNBOX.
00180       SetBlobStrokeWidth(pix, newblob);
00181       port_box_it.add_after_then_move(newblob);
00182     }
00183 
00184     // Put the rejected outlines in block->noise_blobs, which allows them to
00185     // be reconsidered and sorted back into rows and recover outlines mistakenly
00186     // rejected.
00187     port_box_it.set_to_list(&port_block->noise_blobs);
00188     blob_it.set_to_list(block->reject_blobs());
00189     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00190       blob = blob_it.extract();
00191       newblob = new BLOBNBOX(blob);  // Convert blob to BLOBNBOX.
00192       SetBlobStrokeWidth(pix, newblob);
00193       port_box_it.add_after_then_move(newblob);
00194     }
00195 
00196     port_block_it.add_after_then_move(port_block);
00197   }
00198 }
00199 
00200 /**********************************************************************
00201  * find_components
00202  *
00203  * Find the C_OUTLINEs of the connected components in each block, put them
00204  * in C_BLOBs, and filter them by size, putting the different size
00205  * grades on different lists in the matching TO_BLOCK in to_blocks.
00206  **********************************************************************/
00207 
00208 void Textord::find_components(Pix* pix, BLOCK_LIST *blocks,
00209                               TO_BLOCK_LIST *to_blocks) {
00210   int width = pixGetWidth(pix);
00211   int height = pixGetHeight(pix);
00212   if (width > MAX_INT16 || height > MAX_INT16) {
00213     tprintf("Input image too large! (%d, %d)\n", width, height);
00214     return;  // Can't handle it.
00215   }
00216 
00217   set_global_loc_code(LOC_EDGE_PROG);
00218 
00219   BLOCK_IT block_it(blocks);    // iterator
00220   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00221        block_it.forward()) {
00222     BLOCK* block = block_it.data();
00223     if (block->poly_block() == NULL || block->poly_block()->IsText()) {
00224       extract_edges(pix, block);
00225     }
00226   }
00227 
00228   assign_blobs_to_blocks2(pix, blocks, to_blocks);
00229   ICOORD page_tr(width, height);
00230   filter_blobs(page_tr, to_blocks, !textord_test_landscape);
00231 }
00232 
00233 /**********************************************************************
00234  * filter_blobs
00235  *
00236  * Sort the blobs into sizes in all the blocks for later work.
00237  **********************************************************************/
00238 
00239 void Textord::filter_blobs(ICOORD page_tr,         // top right
00240                            TO_BLOCK_LIST *blocks,  // output list
00241                            BOOL8 testing_on) {     // for plotting
00242   TO_BLOCK_IT block_it = blocks;          // destination iterator
00243   TO_BLOCK *block;                        // created block
00244 
00245   #ifndef GRAPHICS_DISABLED
00246   if (to_win != NULL)
00247     to_win->Clear();
00248   #endif  // GRAPHICS_DISABLED
00249 
00250   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00251        block_it.forward()) {
00252     block = block_it.data();
00253     block->line_size = filter_noise_blobs(&block->blobs,
00254       &block->noise_blobs,
00255       &block->small_blobs,
00256       &block->large_blobs);
00257     block->line_spacing = block->line_size *
00258         (tesseract::CCStruct::kDescenderFraction +
00259          tesseract::CCStruct::kXHeightFraction +
00260          2 * tesseract::CCStruct::kAscenderFraction) /
00261          tesseract::CCStruct::kXHeightFraction;
00262     block->line_size *= textord_min_linesize;
00263     block->max_blob_size = block->line_size * textord_excess_blobsize;
00264 
00265     #ifndef GRAPHICS_DISABLED
00266     if (textord_show_blobs && testing_on) {
00267       if (to_win == NULL)
00268         create_to_win(page_tr);
00269       block->plot_graded_blobs(to_win);
00270     }
00271     if (textord_show_boxes && testing_on) {
00272       if (to_win == NULL)
00273         create_to_win(page_tr);
00274       plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE);
00275       plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE);
00276       plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE);
00277       plot_box_list(to_win, &block->blobs, ScrollView::WHITE);
00278     }
00279     #endif  // GRAPHICS_DISABLED
00280   }
00281 }
00282 
00283 /**********************************************************************
00284  * filter_noise_blobs
00285  *
00286  * Move small blobs to a separate list.
00287  **********************************************************************/
00288 
00289 float Textord::filter_noise_blobs(
00290     BLOBNBOX_LIST *src_list,      // original list
00291     BLOBNBOX_LIST *noise_list,    // noise list
00292     BLOBNBOX_LIST *small_list,    // small blobs
00293     BLOBNBOX_LIST *large_list) {  // large blobs
00294   inT16 height;                  //height of blob
00295   inT16 width;                   //of blob
00296   BLOBNBOX *blob;                //current blob
00297   float initial_x;               //first guess
00298   BLOBNBOX_IT src_it = src_list; //iterators
00299   BLOBNBOX_IT noise_it = noise_list;
00300   BLOBNBOX_IT small_it = small_list;
00301   BLOBNBOX_IT large_it = large_list;
00302   STATS size_stats (0, MAX_NEAREST_DIST);
00303   //blob heights
00304   float min_y;                   //size limits
00305   float max_y;
00306   float max_x;
00307   float max_height;              //of good blobs
00308 
00309   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
00310     blob = src_it.data();
00311     if (blob->bounding_box().height() < textord_max_noise_size)
00312       noise_it.add_after_then_move(src_it.extract());
00313     else if (blob->enclosed_area() >= blob->bounding_box().height()
00314       * blob->bounding_box().width() * textord_noise_area_ratio)
00315       small_it.add_after_then_move(src_it.extract());
00316   }
00317   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
00318     size_stats.add(src_it.data()->bounding_box().height(), 1);
00319   }
00320   initial_x = size_stats.ile(textord_initialx_ile);
00321   max_y = ceil(initial_x *
00322                (tesseract::CCStruct::kDescenderFraction +
00323                 tesseract::CCStruct::kXHeightFraction +
00324                 2 * tesseract::CCStruct::kAscenderFraction) /
00325                tesseract::CCStruct::kXHeightFraction);
00326   min_y = floor (initial_x / 2);
00327   max_x = ceil (initial_x * textord_width_limit);
00328   small_it.move_to_first ();
00329   for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
00330   small_it.forward ()) {
00331     height = small_it.data()->bounding_box().height();
00332     if (height > max_y)
00333       large_it.add_after_then_move(small_it.extract ());
00334     else if (height >= min_y)
00335       src_it.add_after_then_move(small_it.extract ());
00336   }
00337   size_stats.clear ();
00338   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00339     height = src_it.data ()->bounding_box ().height ();
00340     width = src_it.data ()->bounding_box ().width ();
00341     if (height < min_y)
00342       small_it.add_after_then_move (src_it.extract ());
00343     else if (height > max_y || width > max_x)
00344       large_it.add_after_then_move (src_it.extract ());
00345     else
00346       size_stats.add (height, 1);
00347   }
00348   max_height = size_stats.ile (textord_initialasc_ile);
00349   //      tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
00350   //              max_y,min_y,initial_x,max_height);
00351   max_height *= tesseract::CCStruct::kXHeightCapRatio;
00352   if (max_height > initial_x)
00353     initial_x = max_height;
00354   //      tprintf(" ret=%g\n",initial_x);
00355   return initial_x;
00356 }
00357 
00358 // Fixes the block so it obeys all the rules:
00359 // Must have at least one ROW.
00360 // Must have at least one WERD.
00361 // WERDs contain a fake blob.
00362 void Textord::cleanup_nontext_block(BLOCK* block) {
00363   // Non-text blocks must contain at least one row.
00364   ROW_IT row_it(block->row_list());
00365   if (row_it.empty()) {
00366     TBOX box = block->bounding_box();
00367     float height = box.height();
00368     inT32 xstarts[2] = {box.left(), box.right()};
00369     double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
00370     ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
00371                        height / 4.0f, 0, 1);
00372     row_it.add_after_then_move(row);
00373   }
00374   // Each row must contain at least one word.
00375   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00376     ROW* row = row_it.data();
00377     WERD_IT w_it(row->word_list());
00378     if (w_it.empty()) {
00379       // Make a fake blob to put in the word.
00380       TBOX box = block->row_list()->singleton() ? block->bounding_box()
00381                                                 : row->bounding_box();
00382       C_BLOB* blob = C_BLOB::FakeBlob(box);
00383       C_BLOB_LIST blobs;
00384       C_BLOB_IT blob_it(&blobs);
00385       blob_it.add_after_then_move(blob);
00386       WERD* word = new WERD(&blobs, 0, NULL);
00387       w_it.add_after_then_move(word);
00388     }
00389     // Each word must contain a fake blob.
00390     for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
00391       WERD* word = w_it.data();
00392       // Just assert that this is true, as it would be useful to find
00393       // out why it isn't.
00394       ASSERT_HOST(!word->cblob_list()->empty());
00395     }
00396     row->recalc_bounding_box();
00397   }
00398 }
00399 
00400 /**********************************************************************
00401  * cleanup_blocks
00402  *
00403  * Delete empty blocks, rows from the page.
00404  **********************************************************************/
00405 
00406 void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) {
00407   BLOCK_IT block_it = blocks;    //iterator
00408   ROW_IT row_it;                 //row iterator
00409 
00410   int num_rows = 0;
00411   int num_rows_all = 0;
00412   int num_blocks = 0;
00413   int num_blocks_all = 0;
00414   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00415        block_it.forward()) {
00416     BLOCK* block = block_it.data();
00417     if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
00418       cleanup_nontext_block(block);
00419       continue;
00420     }
00421     num_rows = 0;
00422     num_rows_all = 0;
00423     if (clean_noise) {
00424       row_it.set_to_list(block->row_list());
00425       for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00426         ROW* row = row_it.data();
00427         ++num_rows_all;
00428         clean_small_noise_from_words(row);
00429         if ((textord_noise_rejrows && !row->word_list()->empty() &&
00430              clean_noise_from_row(row)) ||
00431             row->word_list()->empty()) {
00432           delete row_it.extract();  // lose empty row.
00433         } else {
00434           if (textord_noise_rejwords)
00435             clean_noise_from_words(row_it.data());
00436           if (textord_blshift_maxshift >= 0)
00437             tweak_row_baseline(row, textord_blshift_maxshift,
00438                                textord_blshift_xfraction);
00439           ++num_rows;
00440         }
00441       }
00442     }
00443     if (block->row_list()->empty()) {
00444       delete block_it.extract();  // Lose empty text blocks.
00445     } else {
00446       ++num_blocks;
00447     }
00448     ++num_blocks_all;
00449     if (textord_noise_debug)
00450       tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
00451   }
00452   if (textord_noise_debug)
00453     tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
00454 }
00455 
00456 
00457 /**********************************************************************
00458  * clean_noise_from_row
00459  *
00460  * Move blobs of words from rows of garbage into the reject blobs list.
00461  **********************************************************************/
00462 
00463 BOOL8 Textord::clean_noise_from_row(          //remove empties
00464                                     ROW *row  //row to clean
00465                                    ) {
00466   BOOL8 testing_on;
00467   TBOX blob_box;                  //bounding box
00468   C_BLOB *blob;                  //current blob
00469   C_OUTLINE *outline;            //current outline
00470   WERD *word;                    //current word
00471   inT32 blob_size;               //biggest size
00472   inT32 trans_count = 0;         //no of transitions
00473   inT32 trans_threshold;         //noise tolerance
00474   inT32 dot_count;               //small objects
00475   inT32 norm_count;              //normal objects
00476   inT32 super_norm_count;        //real char-like
00477                                  //words of row
00478   WERD_IT word_it = row->word_list ();
00479   C_BLOB_IT blob_it;             //blob iterator
00480   C_OUTLINE_IT out_it;           //outline iterator
00481 
00482   if (textord_test_y > row->base_line (textord_test_x)
00483     && textord_show_blobs
00484     && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
00485     testing_on = TRUE;
00486   else
00487     testing_on = FALSE;
00488   dot_count = 0;
00489   norm_count = 0;
00490   super_norm_count = 0;
00491   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00492     word = word_it.data ();      //current word
00493                                  //blobs in word
00494     blob_it.set_to_list (word->cblob_list ());
00495     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00496     blob_it.forward ()) {
00497       blob = blob_it.data ();
00498       if (!word->flag (W_DONT_CHOP)) {
00499                                  //get outlines
00500         out_it.set_to_list (blob->out_list ());
00501         for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
00502         out_it.forward ()) {
00503           outline = out_it.data ();
00504           blob_box = outline->bounding_box ();
00505           blob_size =
00506             blob_box.width () >
00507             blob_box.height ()? blob_box.width () : blob_box.
00508             height();
00509           if (blob_size < textord_noise_sizelimit * row->x_height ())
00510             dot_count++;         //count smal outlines
00511           if (!outline->child ()->empty ()
00512             && blob_box.height () <
00513             (1 + textord_noise_syfract) * row->x_height ()
00514             && blob_box.height () >
00515             (1 - textord_noise_syfract) * row->x_height ()
00516             && blob_box.width () <
00517             (1 + textord_noise_sxfract) * row->x_height ()
00518             && blob_box.width () >
00519             (1 - textord_noise_sxfract) * row->x_height ())
00520             super_norm_count++;  //count smal outlines
00521         }
00522       }
00523       else
00524         super_norm_count++;
00525       blob_box = blob->bounding_box ();
00526       blob_size =
00527         blob_box.width () >
00528         blob_box.height ()? blob_box.width () : blob_box.height ();
00529       if (blob_size >= textord_noise_sizelimit * row->x_height ()
00530           && blob_size < row->x_height () * 2) {
00531         trans_threshold = blob_size / textord_noise_sizefraction;
00532         trans_count = blob->count_transitions (trans_threshold);
00533         if (trans_count < textord_noise_translimit)
00534           norm_count++;
00535       }
00536       else if (blob_box.height () > row->x_height () * 2
00537         && (!word_it.at_first () || !blob_it.at_first ()))
00538         dot_count += 2;
00539       if (testing_on) {
00540         tprintf
00541           ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
00542           blob_box.left (), blob_box.bottom (), blob_box.right (),
00543           blob_box.top (), blob->out_list ()->length (), trans_count,
00544           blob_box.bottom () - row->base_line (blob_box.left ()));
00545       }
00546     }
00547   }
00548   if (textord_noise_debug) {
00549     tprintf ("Row ending at (%d,%g):",
00550       blob_box.right (), row->base_line (blob_box.right ()));
00551     tprintf (" R=%g, dc=%d, nc=%d, %s\n",
00552       norm_count > 0 ? (float) dot_count / norm_count : 9999,
00553       dot_count, norm_count,
00554       dot_count > norm_count * textord_noise_normratio
00555       && dot_count > 2 ? "REJECTED" : "ACCEPTED");
00556   }
00557   return super_norm_count < textord_noise_sncount
00558     && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
00559 }
00560 
00561 /**********************************************************************
00562  * clean_noise_from_words
00563  *
00564  * Move blobs of words from rows of garbage into the reject blobs list.
00565  **********************************************************************/
00566 
00567 void Textord::clean_noise_from_words(          //remove empties
00568                                      ROW *row  //row to clean
00569                                     ) {
00570   TBOX blob_box;                  //bounding box
00571   inT8 *word_dud;                //was it chucked
00572   C_BLOB *blob;                  //current blob
00573   C_OUTLINE *outline;            //current outline
00574   WERD *word;                    //current word
00575   inT32 blob_size;               //biggest size
00576   inT32 trans_count;             //no of transitions
00577   inT32 trans_threshold;         //noise tolerance
00578   inT32 dot_count;               //small objects
00579   inT32 norm_count;              //normal objects
00580   inT32 dud_words;               //number discarded
00581   inT32 ok_words;                //number remaining
00582   inT32 word_index;              //current word
00583                                  //words of row
00584   WERD_IT word_it = row->word_list ();
00585   C_BLOB_IT blob_it;             //blob iterator
00586   C_OUTLINE_IT out_it;           //outline iterator
00587 
00588   ok_words = word_it.length ();
00589   if (ok_words == 0 || textord_no_rejects)
00590     return;
00591   word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8));
00592   dud_words = 0;
00593   ok_words = 0;
00594   word_index = 0;
00595   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00596     word = word_it.data ();      //current word
00597     dot_count = 0;
00598     norm_count = 0;
00599                                  //blobs in word
00600     blob_it.set_to_list (word->cblob_list ());
00601     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00602     blob_it.forward ()) {
00603       blob = blob_it.data ();
00604       if (!word->flag (W_DONT_CHOP)) {
00605                                  //get outlines
00606         out_it.set_to_list (blob->out_list ());
00607         for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
00608         out_it.forward ()) {
00609           outline = out_it.data ();
00610           blob_box = outline->bounding_box ();
00611           blob_size =
00612             blob_box.width () >
00613             blob_box.height ()? blob_box.width () : blob_box.
00614             height();
00615           if (blob_size < textord_noise_sizelimit * row->x_height ())
00616             dot_count++;         //count smal outlines
00617           if (!outline->child ()->empty ()
00618             && blob_box.height () <
00619             (1 + textord_noise_syfract) * row->x_height ()
00620             && blob_box.height () >
00621             (1 - textord_noise_syfract) * row->x_height ()
00622             && blob_box.width () <
00623             (1 + textord_noise_sxfract) * row->x_height ()
00624             && blob_box.width () >
00625             (1 - textord_noise_sxfract) * row->x_height ())
00626             norm_count++;        //count smal outlines
00627         }
00628       }
00629       else
00630         norm_count++;
00631       blob_box = blob->bounding_box ();
00632       blob_size =
00633         blob_box.width () >
00634         blob_box.height ()? blob_box.width () : blob_box.height ();
00635       if (blob_size >= textord_noise_sizelimit * row->x_height ()
00636       && blob_size < row->x_height () * 2) {
00637         trans_threshold = blob_size / textord_noise_sizefraction;
00638         trans_count = blob->count_transitions (trans_threshold);
00639         if (trans_count < textord_noise_translimit)
00640           norm_count++;
00641       }
00642       else if (blob_box.height () > row->x_height () * 2
00643         && (!word_it.at_first () || !blob_it.at_first ()))
00644         dot_count += 2;
00645     }
00646     if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
00647       if (dot_count > norm_count * textord_noise_normratio * 2)
00648         word_dud[word_index] = 2;
00649       else if (dot_count > norm_count * textord_noise_normratio)
00650         word_dud[word_index] = 1;
00651       else
00652         word_dud[word_index] = 0;
00653     } else {
00654       word_dud[word_index] = 0;
00655     }
00656     if (word_dud[word_index] == 2)
00657       dud_words++;
00658     else
00659       ok_words++;
00660     word_index++;
00661   }
00662 
00663   word_index = 0;
00664   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00665     if (word_dud[word_index] == 2
00666     || (word_dud[word_index] == 1 && dud_words > ok_words)) {
00667       word = word_it.data();  // Current word.
00668       // Previously we threw away the entire word.
00669       // Now just aggressively throw all small blobs into the reject list, where
00670       // the classifier can decide whether they are actually needed.
00671       word->CleanNoise(textord_noise_sizelimit * row->x_height());
00672     }
00673     word_index++;
00674   }
00675   free_mem(word_dud);
00676 }
00677 
00678 // Remove outlines that are a tiny fraction in either width or height
00679 // of the word height.
00680 void Textord::clean_small_noise_from_words(ROW *row) {
00681   WERD_IT word_it(row->word_list());
00682   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00683     WERD* word = word_it.data();
00684     int min_size = static_cast<int>(
00685       textord_noise_hfract * word->bounding_box().height() + 0.5);
00686     C_BLOB_IT blob_it(word->cblob_list());
00687     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00688       C_BLOB* blob = blob_it.data();
00689       C_OUTLINE_IT out_it(blob->out_list());
00690       for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
00691         C_OUTLINE* outline = out_it.data();
00692         outline->RemoveSmallRecursive(min_size, &out_it);
00693       }
00694       if (blob->out_list()->empty()) {
00695         delete blob_it.extract();
00696       }
00697     }
00698     if (word->cblob_list()->empty()) {
00699       if (!word_it.at_last()) {
00700         // The next word is no longer a fuzzy non space if it was before,
00701         // since the word before is about to be deleted.
00702         WERD* next_word = word_it.data_relative(1);
00703         if (next_word->flag(W_FUZZY_NON)) {
00704           next_word->set_flag(W_FUZZY_NON, false);
00705         }
00706       }
00707       delete word_it.extract();
00708     }
00709   }
00710 }
00711 
00712 // Local struct to hold a group of blocks.
00713 struct BlockGroup {
00714   BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}
00715   explicit BlockGroup(BLOCK* block)
00716       : bounding_box(block->bounding_box()),
00717         rotation(block->re_rotation()),
00718         angle(block->re_rotation().angle()),
00719         min_xheight(block->x_height()) {
00720     blocks.push_back(block);
00721   }
00722   // Union of block bounding boxes.
00723   TBOX bounding_box;
00724   // Common rotation of the blocks.
00725   FCOORD rotation;
00726   // Angle of rotation.
00727   float angle;
00728   // Min xheight of the blocks.
00729   float min_xheight;
00730   // Collection of borrowed pointers to the blocks in the group.
00731   GenericVector<BLOCK*> blocks;
00732 };
00733 
00734 // Groups blocks by rotation, then, for each group, makes a WordGrid and calls
00735 // TransferDiacriticsToWords to copy the diacritic blobs to the most
00736 // appropriate words in the group of blocks. Source blobs are not touched.
00737 void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
00738                                               BLOCK_LIST* blocks) {
00739   // Angle difference larger than this is too much to consider equal.
00740   // They should only be in multiples of M_PI/2 anyway.
00741   const double kMaxAngleDiff = 0.01;  // About 0.6 degrees.
00742   PointerVector<BlockGroup> groups;
00743   BLOCK_IT bk_it(blocks);
00744   for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
00745     BLOCK* block = bk_it.data();
00746     if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
00747       continue;
00748     }
00749     // Linear search of the groups to find a matching rotation.
00750     float block_angle = block->re_rotation().angle();
00751     int best_g = 0;
00752     float best_angle_diff = MAX_FLOAT32;
00753     for (int g = 0; g < groups.size(); ++g) {
00754       double angle_diff = fabs(block_angle - groups[g]->angle);
00755       if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI);
00756       if (angle_diff < best_angle_diff) {
00757         best_angle_diff = angle_diff;
00758         best_g = g;
00759       }
00760     }
00761     if (best_angle_diff > kMaxAngleDiff) {
00762       groups.push_back(new BlockGroup(block));
00763     } else {
00764       groups[best_g]->blocks.push_back(block);
00765       groups[best_g]->bounding_box += block->bounding_box();
00766       float x_height = block->x_height();
00767       if (x_height < groups[best_g]->min_xheight)
00768         groups[best_g]->min_xheight = x_height;
00769     }
00770   }
00771   // Now process each group of blocks.
00772   PointerVector<WordWithBox> word_ptrs;
00773   for (int g = 0; g < groups.size(); ++g) {
00774     const BlockGroup* group = groups[g];
00775     WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
00776                        group->bounding_box.topright());
00777     for (int b = 0; b < group->blocks.size(); ++b) {
00778       ROW_IT row_it(group->blocks[b]->row_list());
00779       for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00780         ROW* row = row_it.data();
00781         // Put the words of the row into the grid.
00782         WERD_IT w_it(row->word_list());
00783         for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
00784           WERD* word = w_it.data();
00785           WordWithBox* box_word = new WordWithBox(word);
00786           word_grid.InsertBBox(true, true, box_word);
00787           // Save the pointer where it will be auto-deleted.
00788           word_ptrs.push_back(box_word);
00789         }
00790       }
00791     }
00792     FCOORD rotation = group->rotation;
00793     // Make it a forward rotation that will transform blob coords to block.
00794     rotation.set_y(-rotation.y());
00795     TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
00796   }
00797 }
00798 
00799 // Places a copy of blobs that are near a word (after applying rotation to the
00800 // blob) in the most appropriate word, unless there is doubt, in which case a
00801 // blob can end up in two words. Source blobs are not touched.
00802 void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs,
00803                                         const FCOORD& rotation,
00804                                         WordGrid* word_grid) {
00805   WordSearch ws(word_grid);
00806   BLOBNBOX_IT b_it(diacritic_blobs);
00807   // Apply rotation to each blob before finding the nearest words. The rotation
00808   // allows us to only consider above/below placement and not left/right on
00809   // vertical text, because all text is horizontal here.
00810   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00811     BLOBNBOX* blobnbox = b_it.data();
00812     TBOX blob_box = blobnbox->bounding_box();
00813     blob_box.rotate(rotation);
00814     ws.StartRectSearch(blob_box);
00815     // Above/below refer to word position relative to diacritic. Since some
00816     // scripts eg Kannada/Telugu habitually put diacritics below words, and
00817     // others eg Thai/Vietnamese/Latin put most diacritics above words, try
00818     // for both if there isn't much in it.
00819     WordWithBox* best_above_word = NULL;
00820     WordWithBox* best_below_word = NULL;
00821     int best_above_distance = 0;
00822     int best_below_distance = 0;
00823     for (WordWithBox* word = ws.NextRectSearch(); word != NULL;
00824          word = ws.NextRectSearch()) {
00825       if (word->word()->flag(W_REP_CHAR)) continue;
00826       TBOX word_box = word->true_bounding_box();
00827       int x_distance = blob_box.x_gap(word_box);
00828       int y_distance = blob_box.y_gap(word_box);
00829       if (x_distance > 0) {
00830         // Arbitrarily divide x-distance by 2 if there is a major y overlap,
00831         // and the word is to the left of the diacritic. If the
00832         // diacritic is a dropped broken character between two words, this will
00833         // help send all the pieces to a single word, instead of splitting them
00834         // over the 2 words.
00835         if (word_box.major_y_overlap(blob_box) &&
00836             blob_box.left() > word_box.right()) {
00837           x_distance /= 2;
00838         }
00839         y_distance += x_distance;
00840       }
00841       if (word_box.y_middle() > blob_box.y_middle() &&
00842           (best_above_word == NULL || y_distance < best_above_distance)) {
00843         best_above_word = word;
00844         best_above_distance = y_distance;
00845       }
00846       if (word_box.y_middle() <= blob_box.y_middle() &&
00847           (best_below_word == NULL || y_distance < best_below_distance)) {
00848         best_below_word = word;
00849         best_below_distance = y_distance;
00850       }
00851     }
00852     bool above_good =
00853         best_above_word != NULL &&
00854         (best_below_word == NULL ||
00855          best_above_distance < best_below_distance + blob_box.height());
00856     bool below_good =
00857         best_below_word != NULL && best_below_word != best_above_word &&
00858         (best_above_word == NULL ||
00859          best_below_distance < best_above_distance + blob_box.height());
00860     if (below_good) {
00861       C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
00862       copied_blob->rotate(rotation);
00863       // Put the blob into the word's reject blobs list.
00864       C_BLOB_IT blob_it(best_below_word->RejBlobs());
00865       blob_it.add_to_end(copied_blob);
00866     }
00867     if (above_good) {
00868       C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
00869       copied_blob->rotate(rotation);
00870       // Put the blob into the word's reject blobs list.
00871       C_BLOB_IT blob_it(best_above_word->RejBlobs());
00872       blob_it.add_to_end(copied_blob);
00873     }
00874   }
00875 }
00876 
00877 }  // tesseract
00878 
00879 /**********************************************************************
00880  * tweak_row_baseline
00881  *
00882  * Shift baseline to fit the blobs more accurately where they are
00883  * close enough.
00884  **********************************************************************/
00885 
00886 void tweak_row_baseline(ROW *row,
00887                         double blshift_maxshift,
00888                         double blshift_xfraction) {
00889   TBOX blob_box;                 //bounding box
00890   C_BLOB *blob;                  //current blob
00891   WERD *word;                    //current word
00892   inT32 blob_count;              //no of blobs
00893   inT32 src_index;               //source segment
00894   inT32 dest_index;              //destination segment
00895   inT32 *xstarts;                //spline segments
00896   double *coeffs;                //spline coeffs
00897   float ydiff;                   //baseline error
00898   float x_centre;                //centre of blob
00899                                  //words of row
00900   WERD_IT word_it = row->word_list ();
00901   C_BLOB_IT blob_it;             //blob iterator
00902 
00903   blob_count = 0;
00904   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00905     word = word_it.data ();      //current word
00906                                  //get total blobs
00907     blob_count += word->cblob_list ()->length ();
00908   }
00909   if (blob_count == 0)
00910     return;
00911   xstarts =
00912     (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
00913     sizeof (inT32));
00914   coeffs =
00915     (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
00916     sizeof (double));
00917 
00918   src_index = 0;
00919   dest_index = 0;
00920   xstarts[0] = row->baseline.xcoords[0];
00921   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00922     word = word_it.data ();      //current word
00923                                  //blobs in word
00924     blob_it.set_to_list (word->cblob_list ());
00925     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00926     blob_it.forward ()) {
00927       blob = blob_it.data ();
00928       blob_box = blob->bounding_box ();
00929       x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
00930       ydiff = blob_box.bottom () - row->base_line (x_centre);
00931       if (ydiff < 0)
00932         ydiff = -ydiff / row->x_height ();
00933       else
00934         ydiff = ydiff / row->x_height ();
00935       if (ydiff < blshift_maxshift
00936         && blob_box.height () / row->x_height () > blshift_xfraction) {
00937         if (xstarts[dest_index] >= x_centre)
00938           xstarts[dest_index] = blob_box.left ();
00939         coeffs[dest_index * 3] = 0;
00940         coeffs[dest_index * 3 + 1] = 0;
00941         coeffs[dest_index * 3 + 2] = blob_box.bottom ();
00942         //shift it
00943         dest_index++;
00944         xstarts[dest_index] = blob_box.right () + 1;
00945       }
00946       else {
00947         if (xstarts[dest_index] <= x_centre) {
00948           while (row->baseline.xcoords[src_index + 1] <= x_centre
00949           && src_index < row->baseline.segments - 1) {
00950             if (row->baseline.xcoords[src_index + 1] >
00951             xstarts[dest_index]) {
00952               coeffs[dest_index * 3] =
00953                 row->baseline.quadratics[src_index].a;
00954               coeffs[dest_index * 3 + 1] =
00955                 row->baseline.quadratics[src_index].b;
00956               coeffs[dest_index * 3 + 2] =
00957                 row->baseline.quadratics[src_index].c;
00958               dest_index++;
00959               xstarts[dest_index] =
00960                 row->baseline.xcoords[src_index + 1];
00961             }
00962             src_index++;
00963           }
00964           coeffs[dest_index * 3] =
00965             row->baseline.quadratics[src_index].a;
00966           coeffs[dest_index * 3 + 1] =
00967             row->baseline.quadratics[src_index].b;
00968           coeffs[dest_index * 3 + 2] =
00969             row->baseline.quadratics[src_index].c;
00970           dest_index++;
00971           xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
00972         }
00973       }
00974     }
00975   }
00976   while (src_index < row->baseline.segments
00977     && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
00978     src_index++;
00979   while (src_index < row->baseline.segments) {
00980     coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
00981     coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
00982     coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
00983     dest_index++;
00984     src_index++;
00985     xstarts[dest_index] = row->baseline.xcoords[src_index];
00986   }
00987                                  //turn to spline
00988   row->baseline = QSPLINE (dest_index, xstarts, coeffs);
00989   free_mem(xstarts);
00990   free_mem(coeffs);
00991 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines