|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: tordmain.cpp (Formerly textordp.c) 00003 * Description: C++ top level textord code. 00004 * Author: Ray Smith 00005 * Created: Tue Jul 28 17:12:33 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef HAVE_CONFIG_H 00021 #include "config_auto.h" 00022 #endif 00023 00024 #ifdef __UNIX__ 00025 #include <assert.h> 00026 #endif 00027 #include "stderr.h" 00028 #include "globaloc.h" 00029 #include "blread.h" 00030 #include "blobbox.h" 00031 #include "ccstruct.h" 00032 #include "edgblob.h" 00033 #include "drawtord.h" 00034 #include "makerow.h" 00035 #include "wordseg.h" 00036 #include "textord.h" 00037 #include "tordmain.h" 00038 00039 #include "allheaders.h" 00040 00041 // Gridsize for word grid when reassigning diacritics to words. Not critical. 00042 const int kWordGridSize = 50; 00043 00044 #undef EXTERN 00045 #define EXTERN 00046 00047 #define MAX_NEAREST_DIST 600 //for block skew stats 00048 00049 namespace tesseract { 00050 00051 CLISTIZE(WordWithBox) 00052 00053 /********************************************************************** 00054 * SetBlobStrokeWidth 00055 * 00056 * Set the horizontal and vertical stroke widths in the blob. 00057 **********************************************************************/ 00058 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) { 00059 // Cut the blob rectangle into a Pix. 00060 int pix_height = pixGetHeight(pix); 00061 const TBOX& box = blob->bounding_box(); 00062 int width = box.width(); 00063 int height = box.height(); 00064 Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(), 00065 width, height); 00066 Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL); 00067 boxDestroy(&blob_pix_box); 00068 Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG); 00069 pixDestroy(&pix_blob); 00070 // Compute the stroke widths. 00071 uinT32* data = pixGetData(dist_pix); 00072 int wpl = pixGetWpl(dist_pix); 00073 // Horizontal width of stroke. 00074 STATS h_stats(0, width + 1); 00075 for (int y = 0; y < height; ++y) { 00076 uinT32* pixels = data + y*wpl; 00077 int prev_pixel = 0; 00078 int pixel = GET_DATA_BYTE(pixels, 0); 00079 for (int x = 1; x < width; ++x) { 00080 int next_pixel = GET_DATA_BYTE(pixels, x); 00081 // We are looking for a pixel that is equal to its vertical neighbours, 00082 // yet greater than its left neighbour. 00083 if (prev_pixel < pixel && 00084 (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && 00085 (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) { 00086 if (pixel > next_pixel) { 00087 // Single local max, so an odd width. 00088 h_stats.add(pixel * 2 - 1, 1); 00089 } else if (pixel == next_pixel && x + 1 < width && 00090 pixel > GET_DATA_BYTE(pixels, x + 1)) { 00091 // Double local max, so an even width. 00092 h_stats.add(pixel * 2, 1); 00093 } 00094 } 00095 prev_pixel = pixel; 00096 pixel = next_pixel; 00097 } 00098 } 00099 // Vertical width of stroke. 00100 STATS v_stats(0, height + 1); 00101 for (int x = 0; x < width; ++x) { 00102 int prev_pixel = 0; 00103 int pixel = GET_DATA_BYTE(data, x); 00104 for (int y = 1; y < height; ++y) { 00105 uinT32* pixels = data + y*wpl; 00106 int next_pixel = GET_DATA_BYTE(pixels, x); 00107 // We are looking for a pixel that is equal to its horizontal neighbours, 00108 // yet greater than its upper neighbour. 00109 if (prev_pixel < pixel && 00110 (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && 00111 (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) { 00112 if (pixel > next_pixel) { 00113 // Single local max, so an odd width. 00114 v_stats.add(pixel * 2 - 1, 1); 00115 } else if (pixel == next_pixel && y + 1 < height && 00116 pixel > GET_DATA_BYTE(pixels + wpl, x)) { 00117 // Double local max, so an even width. 00118 v_stats.add(pixel * 2, 1); 00119 } 00120 } 00121 prev_pixel = pixel; 00122 pixel = next_pixel; 00123 } 00124 } 00125 pixDestroy(&dist_pix); 00126 // Store the horizontal and vertical width in the blob, keeping both 00127 // widths if there is enough information, otherwse only the one with 00128 // the most samples. 00129 // If there are insufficent samples, store zero, rather than using 00130 // 2*area/perimeter, as the numbers that gives do not match the numbers 00131 // from the distance method. 00132 if (h_stats.get_total() >= (width + height) / 4) { 00133 blob->set_horz_stroke_width(h_stats.ile(0.5f)); 00134 if (v_stats.get_total() >= (width + height) / 4) 00135 blob->set_vert_stroke_width(v_stats.ile(0.5f)); 00136 else 00137 blob->set_vert_stroke_width(0.0f); 00138 } else { 00139 if (v_stats.get_total() >= (width + height) / 4 || 00140 v_stats.get_total() > h_stats.get_total()) { 00141 blob->set_horz_stroke_width(0.0f); 00142 blob->set_vert_stroke_width(v_stats.ile(0.5f)); 00143 } else { 00144 blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f) 00145 : 0.0f); 00146 blob->set_vert_stroke_width(0.0f); 00147 } 00148 } 00149 } 00150 00151 /********************************************************************** 00152 * assign_blobs_to_blocks2 00153 * 00154 * Make a list of TO_BLOCKs for portrait and landscape orientation. 00155 **********************************************************************/ 00156 00157 void assign_blobs_to_blocks2(Pix* pix, 00158 BLOCK_LIST *blocks, // blocks to process 00159 TO_BLOCK_LIST *port_blocks) { // output list 00160 BLOCK *block; // current block 00161 BLOBNBOX *newblob; // created blob 00162 C_BLOB *blob; // current blob 00163 BLOCK_IT block_it = blocks; 00164 C_BLOB_IT blob_it; // iterator 00165 BLOBNBOX_IT port_box_it; // iterator 00166 // destination iterator 00167 TO_BLOCK_IT port_block_it = port_blocks; 00168 TO_BLOCK *port_block; // created block 00169 00170 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { 00171 block = block_it.data(); 00172 port_block = new TO_BLOCK(block); 00173 00174 // Convert the good outlines to block->blob_list 00175 port_box_it.set_to_list(&port_block->blobs); 00176 blob_it.set_to_list(block->blob_list()); 00177 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00178 blob = blob_it.extract(); 00179 newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. 00180 SetBlobStrokeWidth(pix, newblob); 00181 port_box_it.add_after_then_move(newblob); 00182 } 00183 00184 // Put the rejected outlines in block->noise_blobs, which allows them to 00185 // be reconsidered and sorted back into rows and recover outlines mistakenly 00186 // rejected. 00187 port_box_it.set_to_list(&port_block->noise_blobs); 00188 blob_it.set_to_list(block->reject_blobs()); 00189 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00190 blob = blob_it.extract(); 00191 newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. 00192 SetBlobStrokeWidth(pix, newblob); 00193 port_box_it.add_after_then_move(newblob); 00194 } 00195 00196 port_block_it.add_after_then_move(port_block); 00197 } 00198 } 00199 00200 /********************************************************************** 00201 * find_components 00202 * 00203 * Find the C_OUTLINEs of the connected components in each block, put them 00204 * in C_BLOBs, and filter them by size, putting the different size 00205 * grades on different lists in the matching TO_BLOCK in to_blocks. 00206 **********************************************************************/ 00207 00208 void Textord::find_components(Pix* pix, BLOCK_LIST *blocks, 00209 TO_BLOCK_LIST *to_blocks) { 00210 int width = pixGetWidth(pix); 00211 int height = pixGetHeight(pix); 00212 if (width > MAX_INT16 || height > MAX_INT16) { 00213 tprintf("Input image too large! (%d, %d)\n", width, height); 00214 return; // Can't handle it. 00215 } 00216 00217 set_global_loc_code(LOC_EDGE_PROG); 00218 00219 BLOCK_IT block_it(blocks); // iterator 00220 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00221 block_it.forward()) { 00222 BLOCK* block = block_it.data(); 00223 if (block->poly_block() == NULL || block->poly_block()->IsText()) { 00224 extract_edges(pix, block); 00225 } 00226 } 00227 00228 assign_blobs_to_blocks2(pix, blocks, to_blocks); 00229 ICOORD page_tr(width, height); 00230 filter_blobs(page_tr, to_blocks, !textord_test_landscape); 00231 } 00232 00233 /********************************************************************** 00234 * filter_blobs 00235 * 00236 * Sort the blobs into sizes in all the blocks for later work. 00237 **********************************************************************/ 00238 00239 void Textord::filter_blobs(ICOORD page_tr, // top right 00240 TO_BLOCK_LIST *blocks, // output list 00241 BOOL8 testing_on) { // for plotting 00242 TO_BLOCK_IT block_it = blocks; // destination iterator 00243 TO_BLOCK *block; // created block 00244 00245 #ifndef GRAPHICS_DISABLED 00246 if (to_win != NULL) 00247 to_win->Clear(); 00248 #endif // GRAPHICS_DISABLED 00249 00250 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00251 block_it.forward()) { 00252 block = block_it.data(); 00253 block->line_size = filter_noise_blobs(&block->blobs, 00254 &block->noise_blobs, 00255 &block->small_blobs, 00256 &block->large_blobs); 00257 block->line_spacing = block->line_size * 00258 (tesseract::CCStruct::kDescenderFraction + 00259 tesseract::CCStruct::kXHeightFraction + 00260 2 * tesseract::CCStruct::kAscenderFraction) / 00261 tesseract::CCStruct::kXHeightFraction; 00262 block->line_size *= textord_min_linesize; 00263 block->max_blob_size = block->line_size * textord_excess_blobsize; 00264 00265 #ifndef GRAPHICS_DISABLED 00266 if (textord_show_blobs && testing_on) { 00267 if (to_win == NULL) 00268 create_to_win(page_tr); 00269 block->plot_graded_blobs(to_win); 00270 } 00271 if (textord_show_boxes && testing_on) { 00272 if (to_win == NULL) 00273 create_to_win(page_tr); 00274 plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE); 00275 plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE); 00276 plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE); 00277 plot_box_list(to_win, &block->blobs, ScrollView::WHITE); 00278 } 00279 #endif // GRAPHICS_DISABLED 00280 } 00281 } 00282 00283 /********************************************************************** 00284 * filter_noise_blobs 00285 * 00286 * Move small blobs to a separate list. 00287 **********************************************************************/ 00288 00289 float Textord::filter_noise_blobs( 00290 BLOBNBOX_LIST *src_list, // original list 00291 BLOBNBOX_LIST *noise_list, // noise list 00292 BLOBNBOX_LIST *small_list, // small blobs 00293 BLOBNBOX_LIST *large_list) { // large blobs 00294 inT16 height; //height of blob 00295 inT16 width; //of blob 00296 BLOBNBOX *blob; //current blob 00297 float initial_x; //first guess 00298 BLOBNBOX_IT src_it = src_list; //iterators 00299 BLOBNBOX_IT noise_it = noise_list; 00300 BLOBNBOX_IT small_it = small_list; 00301 BLOBNBOX_IT large_it = large_list; 00302 STATS size_stats (0, MAX_NEAREST_DIST); 00303 //blob heights 00304 float min_y; //size limits 00305 float max_y; 00306 float max_x; 00307 float max_height; //of good blobs 00308 00309 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { 00310 blob = src_it.data(); 00311 if (blob->bounding_box().height() < textord_max_noise_size) 00312 noise_it.add_after_then_move(src_it.extract()); 00313 else if (blob->enclosed_area() >= blob->bounding_box().height() 00314 * blob->bounding_box().width() * textord_noise_area_ratio) 00315 small_it.add_after_then_move(src_it.extract()); 00316 } 00317 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { 00318 size_stats.add(src_it.data()->bounding_box().height(), 1); 00319 } 00320 initial_x = size_stats.ile(textord_initialx_ile); 00321 max_y = ceil(initial_x * 00322 (tesseract::CCStruct::kDescenderFraction + 00323 tesseract::CCStruct::kXHeightFraction + 00324 2 * tesseract::CCStruct::kAscenderFraction) / 00325 tesseract::CCStruct::kXHeightFraction); 00326 min_y = floor (initial_x / 2); 00327 max_x = ceil (initial_x * textord_width_limit); 00328 small_it.move_to_first (); 00329 for (small_it.mark_cycle_pt (); !small_it.cycled_list (); 00330 small_it.forward ()) { 00331 height = small_it.data()->bounding_box().height(); 00332 if (height > max_y) 00333 large_it.add_after_then_move(small_it.extract ()); 00334 else if (height >= min_y) 00335 src_it.add_after_then_move(small_it.extract ()); 00336 } 00337 size_stats.clear (); 00338 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00339 height = src_it.data ()->bounding_box ().height (); 00340 width = src_it.data ()->bounding_box ().width (); 00341 if (height < min_y) 00342 small_it.add_after_then_move (src_it.extract ()); 00343 else if (height > max_y || width > max_x) 00344 large_it.add_after_then_move (src_it.extract ()); 00345 else 00346 size_stats.add (height, 1); 00347 } 00348 max_height = size_stats.ile (textord_initialasc_ile); 00349 // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,", 00350 // max_y,min_y,initial_x,max_height); 00351 max_height *= tesseract::CCStruct::kXHeightCapRatio; 00352 if (max_height > initial_x) 00353 initial_x = max_height; 00354 // tprintf(" ret=%g\n",initial_x); 00355 return initial_x; 00356 } 00357 00358 // Fixes the block so it obeys all the rules: 00359 // Must have at least one ROW. 00360 // Must have at least one WERD. 00361 // WERDs contain a fake blob. 00362 void Textord::cleanup_nontext_block(BLOCK* block) { 00363 // Non-text blocks must contain at least one row. 00364 ROW_IT row_it(block->row_list()); 00365 if (row_it.empty()) { 00366 TBOX box = block->bounding_box(); 00367 float height = box.height(); 00368 inT32 xstarts[2] = {box.left(), box.right()}; 00369 double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())}; 00370 ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, 00371 height / 4.0f, 0, 1); 00372 row_it.add_after_then_move(row); 00373 } 00374 // Each row must contain at least one word. 00375 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00376 ROW* row = row_it.data(); 00377 WERD_IT w_it(row->word_list()); 00378 if (w_it.empty()) { 00379 // Make a fake blob to put in the word. 00380 TBOX box = block->row_list()->singleton() ? block->bounding_box() 00381 : row->bounding_box(); 00382 C_BLOB* blob = C_BLOB::FakeBlob(box); 00383 C_BLOB_LIST blobs; 00384 C_BLOB_IT blob_it(&blobs); 00385 blob_it.add_after_then_move(blob); 00386 WERD* word = new WERD(&blobs, 0, NULL); 00387 w_it.add_after_then_move(word); 00388 } 00389 // Each word must contain a fake blob. 00390 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00391 WERD* word = w_it.data(); 00392 // Just assert that this is true, as it would be useful to find 00393 // out why it isn't. 00394 ASSERT_HOST(!word->cblob_list()->empty()); 00395 } 00396 row->recalc_bounding_box(); 00397 } 00398 } 00399 00400 /********************************************************************** 00401 * cleanup_blocks 00402 * 00403 * Delete empty blocks, rows from the page. 00404 **********************************************************************/ 00405 00406 void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) { 00407 BLOCK_IT block_it = blocks; //iterator 00408 ROW_IT row_it; //row iterator 00409 00410 int num_rows = 0; 00411 int num_rows_all = 0; 00412 int num_blocks = 0; 00413 int num_blocks_all = 0; 00414 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00415 block_it.forward()) { 00416 BLOCK* block = block_it.data(); 00417 if (block->poly_block() != NULL && !block->poly_block()->IsText()) { 00418 cleanup_nontext_block(block); 00419 continue; 00420 } 00421 num_rows = 0; 00422 num_rows_all = 0; 00423 if (clean_noise) { 00424 row_it.set_to_list(block->row_list()); 00425 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00426 ROW* row = row_it.data(); 00427 ++num_rows_all; 00428 clean_small_noise_from_words(row); 00429 if ((textord_noise_rejrows && !row->word_list()->empty() && 00430 clean_noise_from_row(row)) || 00431 row->word_list()->empty()) { 00432 delete row_it.extract(); // lose empty row. 00433 } else { 00434 if (textord_noise_rejwords) 00435 clean_noise_from_words(row_it.data()); 00436 if (textord_blshift_maxshift >= 0) 00437 tweak_row_baseline(row, textord_blshift_maxshift, 00438 textord_blshift_xfraction); 00439 ++num_rows; 00440 } 00441 } 00442 } 00443 if (block->row_list()->empty()) { 00444 delete block_it.extract(); // Lose empty text blocks. 00445 } else { 00446 ++num_blocks; 00447 } 00448 ++num_blocks_all; 00449 if (textord_noise_debug) 00450 tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all); 00451 } 00452 if (textord_noise_debug) 00453 tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all); 00454 } 00455 00456 00457 /********************************************************************** 00458 * clean_noise_from_row 00459 * 00460 * Move blobs of words from rows of garbage into the reject blobs list. 00461 **********************************************************************/ 00462 00463 BOOL8 Textord::clean_noise_from_row( //remove empties 00464 ROW *row //row to clean 00465 ) { 00466 BOOL8 testing_on; 00467 TBOX blob_box; //bounding box 00468 C_BLOB *blob; //current blob 00469 C_OUTLINE *outline; //current outline 00470 WERD *word; //current word 00471 inT32 blob_size; //biggest size 00472 inT32 trans_count = 0; //no of transitions 00473 inT32 trans_threshold; //noise tolerance 00474 inT32 dot_count; //small objects 00475 inT32 norm_count; //normal objects 00476 inT32 super_norm_count; //real char-like 00477 //words of row 00478 WERD_IT word_it = row->word_list (); 00479 C_BLOB_IT blob_it; //blob iterator 00480 C_OUTLINE_IT out_it; //outline iterator 00481 00482 if (textord_test_y > row->base_line (textord_test_x) 00483 && textord_show_blobs 00484 && textord_test_y < row->base_line (textord_test_x) + row->x_height ()) 00485 testing_on = TRUE; 00486 else 00487 testing_on = FALSE; 00488 dot_count = 0; 00489 norm_count = 0; 00490 super_norm_count = 0; 00491 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00492 word = word_it.data (); //current word 00493 //blobs in word 00494 blob_it.set_to_list (word->cblob_list ()); 00495 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00496 blob_it.forward ()) { 00497 blob = blob_it.data (); 00498 if (!word->flag (W_DONT_CHOP)) { 00499 //get outlines 00500 out_it.set_to_list (blob->out_list ()); 00501 for (out_it.mark_cycle_pt (); !out_it.cycled_list (); 00502 out_it.forward ()) { 00503 outline = out_it.data (); 00504 blob_box = outline->bounding_box (); 00505 blob_size = 00506 blob_box.width () > 00507 blob_box.height ()? blob_box.width () : blob_box. 00508 height(); 00509 if (blob_size < textord_noise_sizelimit * row->x_height ()) 00510 dot_count++; //count smal outlines 00511 if (!outline->child ()->empty () 00512 && blob_box.height () < 00513 (1 + textord_noise_syfract) * row->x_height () 00514 && blob_box.height () > 00515 (1 - textord_noise_syfract) * row->x_height () 00516 && blob_box.width () < 00517 (1 + textord_noise_sxfract) * row->x_height () 00518 && blob_box.width () > 00519 (1 - textord_noise_sxfract) * row->x_height ()) 00520 super_norm_count++; //count smal outlines 00521 } 00522 } 00523 else 00524 super_norm_count++; 00525 blob_box = blob->bounding_box (); 00526 blob_size = 00527 blob_box.width () > 00528 blob_box.height ()? blob_box.width () : blob_box.height (); 00529 if (blob_size >= textord_noise_sizelimit * row->x_height () 00530 && blob_size < row->x_height () * 2) { 00531 trans_threshold = blob_size / textord_noise_sizefraction; 00532 trans_count = blob->count_transitions (trans_threshold); 00533 if (trans_count < textord_noise_translimit) 00534 norm_count++; 00535 } 00536 else if (blob_box.height () > row->x_height () * 2 00537 && (!word_it.at_first () || !blob_it.at_first ())) 00538 dot_count += 2; 00539 if (testing_on) { 00540 tprintf 00541 ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", 00542 blob_box.left (), blob_box.bottom (), blob_box.right (), 00543 blob_box.top (), blob->out_list ()->length (), trans_count, 00544 blob_box.bottom () - row->base_line (blob_box.left ())); 00545 } 00546 } 00547 } 00548 if (textord_noise_debug) { 00549 tprintf ("Row ending at (%d,%g):", 00550 blob_box.right (), row->base_line (blob_box.right ())); 00551 tprintf (" R=%g, dc=%d, nc=%d, %s\n", 00552 norm_count > 0 ? (float) dot_count / norm_count : 9999, 00553 dot_count, norm_count, 00554 dot_count > norm_count * textord_noise_normratio 00555 && dot_count > 2 ? "REJECTED" : "ACCEPTED"); 00556 } 00557 return super_norm_count < textord_noise_sncount 00558 && dot_count > norm_count * textord_noise_rowratio && dot_count > 2; 00559 } 00560 00561 /********************************************************************** 00562 * clean_noise_from_words 00563 * 00564 * Move blobs of words from rows of garbage into the reject blobs list. 00565 **********************************************************************/ 00566 00567 void Textord::clean_noise_from_words( //remove empties 00568 ROW *row //row to clean 00569 ) { 00570 TBOX blob_box; //bounding box 00571 inT8 *word_dud; //was it chucked 00572 C_BLOB *blob; //current blob 00573 C_OUTLINE *outline; //current outline 00574 WERD *word; //current word 00575 inT32 blob_size; //biggest size 00576 inT32 trans_count; //no of transitions 00577 inT32 trans_threshold; //noise tolerance 00578 inT32 dot_count; //small objects 00579 inT32 norm_count; //normal objects 00580 inT32 dud_words; //number discarded 00581 inT32 ok_words; //number remaining 00582 inT32 word_index; //current word 00583 //words of row 00584 WERD_IT word_it = row->word_list (); 00585 C_BLOB_IT blob_it; //blob iterator 00586 C_OUTLINE_IT out_it; //outline iterator 00587 00588 ok_words = word_it.length (); 00589 if (ok_words == 0 || textord_no_rejects) 00590 return; 00591 word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8)); 00592 dud_words = 0; 00593 ok_words = 0; 00594 word_index = 0; 00595 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00596 word = word_it.data (); //current word 00597 dot_count = 0; 00598 norm_count = 0; 00599 //blobs in word 00600 blob_it.set_to_list (word->cblob_list ()); 00601 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00602 blob_it.forward ()) { 00603 blob = blob_it.data (); 00604 if (!word->flag (W_DONT_CHOP)) { 00605 //get outlines 00606 out_it.set_to_list (blob->out_list ()); 00607 for (out_it.mark_cycle_pt (); !out_it.cycled_list (); 00608 out_it.forward ()) { 00609 outline = out_it.data (); 00610 blob_box = outline->bounding_box (); 00611 blob_size = 00612 blob_box.width () > 00613 blob_box.height ()? blob_box.width () : blob_box. 00614 height(); 00615 if (blob_size < textord_noise_sizelimit * row->x_height ()) 00616 dot_count++; //count smal outlines 00617 if (!outline->child ()->empty () 00618 && blob_box.height () < 00619 (1 + textord_noise_syfract) * row->x_height () 00620 && blob_box.height () > 00621 (1 - textord_noise_syfract) * row->x_height () 00622 && blob_box.width () < 00623 (1 + textord_noise_sxfract) * row->x_height () 00624 && blob_box.width () > 00625 (1 - textord_noise_sxfract) * row->x_height ()) 00626 norm_count++; //count smal outlines 00627 } 00628 } 00629 else 00630 norm_count++; 00631 blob_box = blob->bounding_box (); 00632 blob_size = 00633 blob_box.width () > 00634 blob_box.height ()? blob_box.width () : blob_box.height (); 00635 if (blob_size >= textord_noise_sizelimit * row->x_height () 00636 && blob_size < row->x_height () * 2) { 00637 trans_threshold = blob_size / textord_noise_sizefraction; 00638 trans_count = blob->count_transitions (trans_threshold); 00639 if (trans_count < textord_noise_translimit) 00640 norm_count++; 00641 } 00642 else if (blob_box.height () > row->x_height () * 2 00643 && (!word_it.at_first () || !blob_it.at_first ())) 00644 dot_count += 2; 00645 } 00646 if (dot_count > 2 && !word->flag(W_REP_CHAR)) { 00647 if (dot_count > norm_count * textord_noise_normratio * 2) 00648 word_dud[word_index] = 2; 00649 else if (dot_count > norm_count * textord_noise_normratio) 00650 word_dud[word_index] = 1; 00651 else 00652 word_dud[word_index] = 0; 00653 } else { 00654 word_dud[word_index] = 0; 00655 } 00656 if (word_dud[word_index] == 2) 00657 dud_words++; 00658 else 00659 ok_words++; 00660 word_index++; 00661 } 00662 00663 word_index = 0; 00664 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00665 if (word_dud[word_index] == 2 00666 || (word_dud[word_index] == 1 && dud_words > ok_words)) { 00667 word = word_it.data(); // Current word. 00668 // Previously we threw away the entire word. 00669 // Now just aggressively throw all small blobs into the reject list, where 00670 // the classifier can decide whether they are actually needed. 00671 word->CleanNoise(textord_noise_sizelimit * row->x_height()); 00672 } 00673 word_index++; 00674 } 00675 free_mem(word_dud); 00676 } 00677 00678 // Remove outlines that are a tiny fraction in either width or height 00679 // of the word height. 00680 void Textord::clean_small_noise_from_words(ROW *row) { 00681 WERD_IT word_it(row->word_list()); 00682 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00683 WERD* word = word_it.data(); 00684 int min_size = static_cast<int>( 00685 textord_noise_hfract * word->bounding_box().height() + 0.5); 00686 C_BLOB_IT blob_it(word->cblob_list()); 00687 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00688 C_BLOB* blob = blob_it.data(); 00689 C_OUTLINE_IT out_it(blob->out_list()); 00690 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { 00691 C_OUTLINE* outline = out_it.data(); 00692 outline->RemoveSmallRecursive(min_size, &out_it); 00693 } 00694 if (blob->out_list()->empty()) { 00695 delete blob_it.extract(); 00696 } 00697 } 00698 if (word->cblob_list()->empty()) { 00699 if (!word_it.at_last()) { 00700 // The next word is no longer a fuzzy non space if it was before, 00701 // since the word before is about to be deleted. 00702 WERD* next_word = word_it.data_relative(1); 00703 if (next_word->flag(W_FUZZY_NON)) { 00704 next_word->set_flag(W_FUZZY_NON, false); 00705 } 00706 } 00707 delete word_it.extract(); 00708 } 00709 } 00710 } 00711 00712 // Local struct to hold a group of blocks. 00713 struct BlockGroup { 00714 BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {} 00715 explicit BlockGroup(BLOCK* block) 00716 : bounding_box(block->bounding_box()), 00717 rotation(block->re_rotation()), 00718 angle(block->re_rotation().angle()), 00719 min_xheight(block->x_height()) { 00720 blocks.push_back(block); 00721 } 00722 // Union of block bounding boxes. 00723 TBOX bounding_box; 00724 // Common rotation of the blocks. 00725 FCOORD rotation; 00726 // Angle of rotation. 00727 float angle; 00728 // Min xheight of the blocks. 00729 float min_xheight; 00730 // Collection of borrowed pointers to the blocks in the group. 00731 GenericVector<BLOCK*> blocks; 00732 }; 00733 00734 // Groups blocks by rotation, then, for each group, makes a WordGrid and calls 00735 // TransferDiacriticsToWords to copy the diacritic blobs to the most 00736 // appropriate words in the group of blocks. Source blobs are not touched. 00737 void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, 00738 BLOCK_LIST* blocks) { 00739 // Angle difference larger than this is too much to consider equal. 00740 // They should only be in multiples of M_PI/2 anyway. 00741 const double kMaxAngleDiff = 0.01; // About 0.6 degrees. 00742 PointerVector<BlockGroup> groups; 00743 BLOCK_IT bk_it(blocks); 00744 for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) { 00745 BLOCK* block = bk_it.data(); 00746 if (block->poly_block() != NULL && !block->poly_block()->IsText()) { 00747 continue; 00748 } 00749 // Linear search of the groups to find a matching rotation. 00750 float block_angle = block->re_rotation().angle(); 00751 int best_g = 0; 00752 float best_angle_diff = MAX_FLOAT32; 00753 for (int g = 0; g < groups.size(); ++g) { 00754 double angle_diff = fabs(block_angle - groups[g]->angle); 00755 if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI); 00756 if (angle_diff < best_angle_diff) { 00757 best_angle_diff = angle_diff; 00758 best_g = g; 00759 } 00760 } 00761 if (best_angle_diff > kMaxAngleDiff) { 00762 groups.push_back(new BlockGroup(block)); 00763 } else { 00764 groups[best_g]->blocks.push_back(block); 00765 groups[best_g]->bounding_box += block->bounding_box(); 00766 float x_height = block->x_height(); 00767 if (x_height < groups[best_g]->min_xheight) 00768 groups[best_g]->min_xheight = x_height; 00769 } 00770 } 00771 // Now process each group of blocks. 00772 PointerVector<WordWithBox> word_ptrs; 00773 for (int g = 0; g < groups.size(); ++g) { 00774 const BlockGroup* group = groups[g]; 00775 WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), 00776 group->bounding_box.topright()); 00777 for (int b = 0; b < group->blocks.size(); ++b) { 00778 ROW_IT row_it(group->blocks[b]->row_list()); 00779 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00780 ROW* row = row_it.data(); 00781 // Put the words of the row into the grid. 00782 WERD_IT w_it(row->word_list()); 00783 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00784 WERD* word = w_it.data(); 00785 WordWithBox* box_word = new WordWithBox(word); 00786 word_grid.InsertBBox(true, true, box_word); 00787 // Save the pointer where it will be auto-deleted. 00788 word_ptrs.push_back(box_word); 00789 } 00790 } 00791 } 00792 FCOORD rotation = group->rotation; 00793 // Make it a forward rotation that will transform blob coords to block. 00794 rotation.set_y(-rotation.y()); 00795 TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid); 00796 } 00797 } 00798 00799 // Places a copy of blobs that are near a word (after applying rotation to the 00800 // blob) in the most appropriate word, unless there is doubt, in which case a 00801 // blob can end up in two words. Source blobs are not touched. 00802 void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs, 00803 const FCOORD& rotation, 00804 WordGrid* word_grid) { 00805 WordSearch ws(word_grid); 00806 BLOBNBOX_IT b_it(diacritic_blobs); 00807 // Apply rotation to each blob before finding the nearest words. The rotation 00808 // allows us to only consider above/below placement and not left/right on 00809 // vertical text, because all text is horizontal here. 00810 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00811 BLOBNBOX* blobnbox = b_it.data(); 00812 TBOX blob_box = blobnbox->bounding_box(); 00813 blob_box.rotate(rotation); 00814 ws.StartRectSearch(blob_box); 00815 // Above/below refer to word position relative to diacritic. Since some 00816 // scripts eg Kannada/Telugu habitually put diacritics below words, and 00817 // others eg Thai/Vietnamese/Latin put most diacritics above words, try 00818 // for both if there isn't much in it. 00819 WordWithBox* best_above_word = NULL; 00820 WordWithBox* best_below_word = NULL; 00821 int best_above_distance = 0; 00822 int best_below_distance = 0; 00823 for (WordWithBox* word = ws.NextRectSearch(); word != NULL; 00824 word = ws.NextRectSearch()) { 00825 if (word->word()->flag(W_REP_CHAR)) continue; 00826 TBOX word_box = word->true_bounding_box(); 00827 int x_distance = blob_box.x_gap(word_box); 00828 int y_distance = blob_box.y_gap(word_box); 00829 if (x_distance > 0) { 00830 // Arbitrarily divide x-distance by 2 if there is a major y overlap, 00831 // and the word is to the left of the diacritic. If the 00832 // diacritic is a dropped broken character between two words, this will 00833 // help send all the pieces to a single word, instead of splitting them 00834 // over the 2 words. 00835 if (word_box.major_y_overlap(blob_box) && 00836 blob_box.left() > word_box.right()) { 00837 x_distance /= 2; 00838 } 00839 y_distance += x_distance; 00840 } 00841 if (word_box.y_middle() > blob_box.y_middle() && 00842 (best_above_word == NULL || y_distance < best_above_distance)) { 00843 best_above_word = word; 00844 best_above_distance = y_distance; 00845 } 00846 if (word_box.y_middle() <= blob_box.y_middle() && 00847 (best_below_word == NULL || y_distance < best_below_distance)) { 00848 best_below_word = word; 00849 best_below_distance = y_distance; 00850 } 00851 } 00852 bool above_good = 00853 best_above_word != NULL && 00854 (best_below_word == NULL || 00855 best_above_distance < best_below_distance + blob_box.height()); 00856 bool below_good = 00857 best_below_word != NULL && best_below_word != best_above_word && 00858 (best_above_word == NULL || 00859 best_below_distance < best_above_distance + blob_box.height()); 00860 if (below_good) { 00861 C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); 00862 copied_blob->rotate(rotation); 00863 // Put the blob into the word's reject blobs list. 00864 C_BLOB_IT blob_it(best_below_word->RejBlobs()); 00865 blob_it.add_to_end(copied_blob); 00866 } 00867 if (above_good) { 00868 C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); 00869 copied_blob->rotate(rotation); 00870 // Put the blob into the word's reject blobs list. 00871 C_BLOB_IT blob_it(best_above_word->RejBlobs()); 00872 blob_it.add_to_end(copied_blob); 00873 } 00874 } 00875 } 00876 00877 } // tesseract 00878 00879 /********************************************************************** 00880 * tweak_row_baseline 00881 * 00882 * Shift baseline to fit the blobs more accurately where they are 00883 * close enough. 00884 **********************************************************************/ 00885 00886 void tweak_row_baseline(ROW *row, 00887 double blshift_maxshift, 00888 double blshift_xfraction) { 00889 TBOX blob_box; //bounding box 00890 C_BLOB *blob; //current blob 00891 WERD *word; //current word 00892 inT32 blob_count; //no of blobs 00893 inT32 src_index; //source segment 00894 inT32 dest_index; //destination segment 00895 inT32 *xstarts; //spline segments 00896 double *coeffs; //spline coeffs 00897 float ydiff; //baseline error 00898 float x_centre; //centre of blob 00899 //words of row 00900 WERD_IT word_it = row->word_list (); 00901 C_BLOB_IT blob_it; //blob iterator 00902 00903 blob_count = 0; 00904 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00905 word = word_it.data (); //current word 00906 //get total blobs 00907 blob_count += word->cblob_list ()->length (); 00908 } 00909 if (blob_count == 0) 00910 return; 00911 xstarts = 00912 (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) * 00913 sizeof (inT32)); 00914 coeffs = 00915 (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 * 00916 sizeof (double)); 00917 00918 src_index = 0; 00919 dest_index = 0; 00920 xstarts[0] = row->baseline.xcoords[0]; 00921 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00922 word = word_it.data (); //current word 00923 //blobs in word 00924 blob_it.set_to_list (word->cblob_list ()); 00925 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00926 blob_it.forward ()) { 00927 blob = blob_it.data (); 00928 blob_box = blob->bounding_box (); 00929 x_centre = (blob_box.left () + blob_box.right ()) / 2.0; 00930 ydiff = blob_box.bottom () - row->base_line (x_centre); 00931 if (ydiff < 0) 00932 ydiff = -ydiff / row->x_height (); 00933 else 00934 ydiff = ydiff / row->x_height (); 00935 if (ydiff < blshift_maxshift 00936 && blob_box.height () / row->x_height () > blshift_xfraction) { 00937 if (xstarts[dest_index] >= x_centre) 00938 xstarts[dest_index] = blob_box.left (); 00939 coeffs[dest_index * 3] = 0; 00940 coeffs[dest_index * 3 + 1] = 0; 00941 coeffs[dest_index * 3 + 2] = blob_box.bottom (); 00942 //shift it 00943 dest_index++; 00944 xstarts[dest_index] = blob_box.right () + 1; 00945 } 00946 else { 00947 if (xstarts[dest_index] <= x_centre) { 00948 while (row->baseline.xcoords[src_index + 1] <= x_centre 00949 && src_index < row->baseline.segments - 1) { 00950 if (row->baseline.xcoords[src_index + 1] > 00951 xstarts[dest_index]) { 00952 coeffs[dest_index * 3] = 00953 row->baseline.quadratics[src_index].a; 00954 coeffs[dest_index * 3 + 1] = 00955 row->baseline.quadratics[src_index].b; 00956 coeffs[dest_index * 3 + 2] = 00957 row->baseline.quadratics[src_index].c; 00958 dest_index++; 00959 xstarts[dest_index] = 00960 row->baseline.xcoords[src_index + 1]; 00961 } 00962 src_index++; 00963 } 00964 coeffs[dest_index * 3] = 00965 row->baseline.quadratics[src_index].a; 00966 coeffs[dest_index * 3 + 1] = 00967 row->baseline.quadratics[src_index].b; 00968 coeffs[dest_index * 3 + 2] = 00969 row->baseline.quadratics[src_index].c; 00970 dest_index++; 00971 xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; 00972 } 00973 } 00974 } 00975 } 00976 while (src_index < row->baseline.segments 00977 && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) 00978 src_index++; 00979 while (src_index < row->baseline.segments) { 00980 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; 00981 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; 00982 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; 00983 dest_index++; 00984 src_index++; 00985 xstarts[dest_index] = row->baseline.xcoords[src_index]; 00986 } 00987 //turn to spline 00988 row->baseline = QSPLINE (dest_index, xstarts, coeffs); 00989 free_mem(xstarts); 00990 free_mem(coeffs); 00991 }