|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: topitch.cpp (Formerly to_pitch.c) 00003 * Description: Code to determine fixed pitchness and the pitch if fixed. 00004 * Author: Ray Smith 00005 * Created: Tue Aug 24 16:57:29 BST 1993 00006 * 00007 * (C) Copyright 1993, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef __UNIX__ 00021 #include <assert.h> 00022 #endif 00023 #include "stderr.h" 00024 #include "blobbox.h" 00025 #include "statistc.h" 00026 #include "drawtord.h" 00027 #include "makerow.h" 00028 #include "pitsync1.h" 00029 #include "pithsync.h" 00030 #include "tovars.h" 00031 #include "wordseg.h" 00032 #include "topitch.h" 00033 #include "helpers.h" 00034 00035 // Include automatically generated configuration file if running autoconf. 
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif

// EXTERN is defined empty in this file, so every EXTERN-prefixed line below
// is a plain BOOL_VAR / double_VAR parameter definition.
#define EXTERN

// Parameters for the fixed-pitch detector. The third argument of each macro
// is the human-readable description string of the parameter.

// Checked in fixed_pitch_row: when set, the row takes the "definitely
// proportional" branch.
EXTERN BOOL_VAR (textord_all_prop, FALSE, "All doc is proportial text");
// Enables the per-block box printout in compute_block_pitch and is passed
// on (and-ed with testing_on) as the debug flag of compute_rows_pitch.
EXTERN BOOL_VAR (textord_debug_pitch_test, FALSE,
"Debug on fixed pitch test");
EXTERN BOOL_VAR (textord_disable_pitch_test, FALSE,
"Turn off dp fixed pitch algorithm");
EXTERN BOOL_VAR (textord_fast_pitch_test, FALSE,
"Do even faster pitch algorithm");
// Enables the ":key=value:" style metric lines printed by fix_row_pitch,
// try_doc_fixed and find_row_pitch.
EXTERN BOOL_VAR (textord_debug_pitch_metric, FALSE,
"Write full metric stuff");
EXTERN BOOL_VAR (textord_show_row_cuts, FALSE, "Draw row-level cuts");
// Enables plotting of the page projection and cells in try_doc_fixed.
EXTERN BOOL_VAR (textord_show_page_cuts, FALSE, "Draw page-level cuts");
EXTERN BOOL_VAR (textord_pitch_cheat, FALSE,
"Use correct answer for fixed/prop");
// try_doc_fixed returns FALSE immediately unless this is set.
EXTERN BOOL_VAR (textord_blockndoc_fixed, FALSE,
"Attempt whole doc/block fixed pitch");
EXTERN double_VAR (textord_projection_scale, 0.200, "Ding rate for mid-cuts");
EXTERN double_VAR (textord_balance_factor, 1.0,
"Ding rate for unbalanced char cells");

#define FIXED_WIDTH_MULTIPLE 5
// Maximum number of gap clusters requested in row_pitch_stats; also the
// size of its gaps[] array.
#define BLOCK_STATS_CLUSTERS 10
// Upper bound of the pitch histogram built in try_doc_fixed.
#define MAX_ALLOWED_PITCH 100    //max pixel pitch.

/**********************************************************************
 * compute_fixed_pitch
 *
 * Decide whether each row is fixed pitch individually.
 * Correlate definite and uncertain results to obtain an individual
 * result for each row in the TO_ROW class.
00071 **********************************************************************/ 00072 00073 void compute_fixed_pitch(ICOORD page_tr, // top right 00074 TO_BLOCK_LIST *port_blocks, // input list 00075 float gradient, // page skew 00076 FCOORD rotation, // for drawing 00077 BOOL8 testing_on) { // correct orientation 00078 TO_BLOCK_IT block_it; //iterator 00079 TO_BLOCK *block; //current block; 00080 TO_ROW_IT row_it; //row iterator 00081 TO_ROW *row; //current row 00082 int block_index; //block number 00083 int row_index; //row number 00084 00085 #ifndef GRAPHICS_DISABLED 00086 if (textord_show_initial_words && testing_on) { 00087 if (to_win == NULL) 00088 create_to_win(page_tr); 00089 } 00090 #endif 00091 00092 block_it.set_to_list (port_blocks); 00093 block_index = 1; 00094 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00095 block_it.forward ()) { 00096 block = block_it.data (); 00097 compute_block_pitch(block, rotation, block_index, testing_on); 00098 block_index++; 00099 } 00100 00101 if (!try_doc_fixed (page_tr, port_blocks, gradient)) { 00102 block_index = 1; 00103 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00104 block_it.forward ()) { 00105 block = block_it.data (); 00106 if (!try_block_fixed (block, block_index)) 00107 try_rows_fixed(block, block_index, testing_on); 00108 block_index++; 00109 } 00110 } 00111 00112 block_index = 1; 00113 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00114 block_it.forward()) { 00115 block = block_it.data (); 00116 POLY_BLOCK* pb = block->block->poly_block(); 00117 if (pb != NULL && !pb->IsText()) continue; // Non-text doesn't exist! 
00118 row_it.set_to_list (block->get_rows ()); 00119 row_index = 1; 00120 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00121 row = row_it.data (); 00122 fix_row_pitch(row, block, port_blocks, row_index, block_index); 00123 row_index++; 00124 } 00125 block_index++; 00126 } 00127 #ifndef GRAPHICS_DISABLED 00128 if (textord_show_initial_words && testing_on) { 00129 ScrollView::Update(); 00130 } 00131 #endif 00132 } 00133 00134 00135 /********************************************************************** 00136 * fix_row_pitch 00137 * 00138 * Get a pitch_decision for this row by voting among similar rows in the 00139 * block, then similar rows over all the page, or any other rows at all. 00140 **********************************************************************/ 00141 00142 void fix_row_pitch(TO_ROW *bad_row, // row to fix 00143 TO_BLOCK *bad_block, // block of bad_row 00144 TO_BLOCK_LIST *blocks, // blocks to scan 00145 inT32 row_target, // number of row 00146 inT32 block_target) { // number of block 00147 inT16 mid_cuts; 00148 int block_votes; //votes in block 00149 int like_votes; //votes over page 00150 int other_votes; //votes of unlike blocks 00151 int block_index; //number of block 00152 int row_index; //number of row 00153 int maxwidth; //max pitch 00154 TO_BLOCK_IT block_it = blocks; //block iterator 00155 TO_ROW_IT row_it; 00156 TO_BLOCK *block; //current block 00157 TO_ROW *row; //current row 00158 float sp_sd; //space deviation 00159 STATS block_stats; //pitches in block 00160 STATS like_stats; //pitches in page 00161 00162 block_votes = like_votes = other_votes = 0; 00163 maxwidth = (inT32) ceil (bad_row->xheight * textord_words_maxspace); 00164 if (bad_row->pitch_decision != PITCH_DEF_FIXED 00165 && bad_row->pitch_decision != PITCH_DEF_PROP) { 00166 block_stats.set_range (0, maxwidth); 00167 like_stats.set_range (0, maxwidth); 00168 block_index = 1; 00169 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00170 
block_it.forward()) { 00171 block = block_it.data(); 00172 POLY_BLOCK* pb = block->block->poly_block(); 00173 if (pb != NULL && !pb->IsText()) continue; // Non text doesn't exist! 00174 row_index = 1; 00175 row_it.set_to_list (block->get_rows ()); 00176 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); 00177 row_it.forward ()) { 00178 row = row_it.data (); 00179 if ((bad_row->all_caps 00180 && row->xheight + row->ascrise 00181 < 00182 (bad_row->xheight + bad_row->ascrise) * (1 + 00183 textord_pitch_rowsimilarity) 00184 && row->xheight + row->ascrise > 00185 (bad_row->xheight + bad_row->ascrise) * (1 - 00186 textord_pitch_rowsimilarity)) 00187 || (!bad_row->all_caps 00188 && row->xheight < 00189 bad_row->xheight * (1 + textord_pitch_rowsimilarity) 00190 && row->xheight > 00191 bad_row->xheight * (1 - textord_pitch_rowsimilarity))) { 00192 if (block_index == block_target) { 00193 if (row->pitch_decision == PITCH_DEF_FIXED) { 00194 block_votes += textord_words_veto_power; 00195 block_stats.add ((inT32) row->fixed_pitch, 00196 textord_words_veto_power); 00197 } 00198 else if (row->pitch_decision == PITCH_MAYBE_FIXED 00199 || row->pitch_decision == PITCH_CORR_FIXED) { 00200 block_votes++; 00201 block_stats.add ((inT32) row->fixed_pitch, 1); 00202 } 00203 else if (row->pitch_decision == PITCH_DEF_PROP) 00204 block_votes -= textord_words_veto_power; 00205 else if (row->pitch_decision == PITCH_MAYBE_PROP 00206 || row->pitch_decision == PITCH_CORR_PROP) 00207 block_votes--; 00208 } 00209 else { 00210 if (row->pitch_decision == PITCH_DEF_FIXED) { 00211 like_votes += textord_words_veto_power; 00212 like_stats.add ((inT32) row->fixed_pitch, 00213 textord_words_veto_power); 00214 } 00215 else if (row->pitch_decision == PITCH_MAYBE_FIXED 00216 || row->pitch_decision == PITCH_CORR_FIXED) { 00217 like_votes++; 00218 like_stats.add ((inT32) row->fixed_pitch, 1); 00219 } 00220 else if (row->pitch_decision == PITCH_DEF_PROP) 00221 like_votes -= textord_words_veto_power; 00222 
else if (row->pitch_decision == PITCH_MAYBE_PROP 00223 || row->pitch_decision == PITCH_CORR_PROP) 00224 like_votes--; 00225 } 00226 } 00227 else { 00228 if (row->pitch_decision == PITCH_DEF_FIXED) 00229 other_votes += textord_words_veto_power; 00230 else if (row->pitch_decision == PITCH_MAYBE_FIXED 00231 || row->pitch_decision == PITCH_CORR_FIXED) 00232 other_votes++; 00233 else if (row->pitch_decision == PITCH_DEF_PROP) 00234 other_votes -= textord_words_veto_power; 00235 else if (row->pitch_decision == PITCH_MAYBE_PROP 00236 || row->pitch_decision == PITCH_CORR_PROP) 00237 other_votes--; 00238 } 00239 row_index++; 00240 } 00241 block_index++; 00242 } 00243 if (block_votes > textord_words_veto_power) { 00244 bad_row->fixed_pitch = block_stats.ile (0.5); 00245 bad_row->pitch_decision = PITCH_CORR_FIXED; 00246 } 00247 else if (block_votes <= textord_words_veto_power && like_votes > 0) { 00248 bad_row->fixed_pitch = like_stats.ile (0.5); 00249 bad_row->pitch_decision = PITCH_CORR_FIXED; 00250 } 00251 else { 00252 bad_row->pitch_decision = PITCH_CORR_PROP; 00253 if (block_votes == 0 && like_votes == 0 && other_votes > 0 00254 && (textord_debug_pitch_test || textord_debug_pitch_metric)) 00255 tprintf 00256 ("Warning:row %d of block %d set prop with no like rows against trend\n", 00257 row_target, block_target); 00258 } 00259 } 00260 if (textord_debug_pitch_metric) { 00261 tprintf(":b_votes=%d:l_votes=%d:o_votes=%d", 00262 block_votes, like_votes, other_votes); 00263 tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise); 00264 } 00265 if (bad_row->pitch_decision == PITCH_CORR_FIXED) { 00266 if (bad_row->fixed_pitch < textord_min_xheight) { 00267 if (block_votes > 0) 00268 bad_row->fixed_pitch = block_stats.ile (0.5); 00269 else if (block_votes == 0 && like_votes > 0) 00270 bad_row->fixed_pitch = like_stats.ile (0.5); 00271 else { 00272 tprintf 00273 ("Warning:guessing pitch as xheight on row %d, block %d\n", 00274 row_target, block_target); 00275 
bad_row->fixed_pitch = bad_row->xheight; 00276 } 00277 } 00278 if (bad_row->fixed_pitch < textord_min_xheight) 00279 bad_row->fixed_pitch = (float) textord_min_xheight; 00280 bad_row->kern_size = bad_row->fixed_pitch / 4; 00281 bad_row->min_space = (inT32) (bad_row->fixed_pitch * 0.6); 00282 bad_row->max_nonspace = (inT32) (bad_row->fixed_pitch * 0.4); 00283 bad_row->space_threshold = 00284 (bad_row->min_space + bad_row->max_nonspace) / 2; 00285 bad_row->space_size = bad_row->fixed_pitch; 00286 if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) { 00287 tune_row_pitch (bad_row, &bad_row->projection, 00288 bad_row->projection_left, bad_row->projection_right, 00289 (bad_row->fixed_pitch + 00290 bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch, 00291 sp_sd, mid_cuts, &bad_row->char_cells, FALSE); 00292 } 00293 } 00294 else if (bad_row->pitch_decision == PITCH_CORR_PROP 00295 || bad_row->pitch_decision == PITCH_DEF_PROP) { 00296 bad_row->fixed_pitch = 0.0f; 00297 bad_row->char_cells.clear (); 00298 } 00299 } 00300 00301 00302 /********************************************************************** 00303 * compute_block_pitch 00304 * 00305 * Decide whether each block is fixed pitch individually. 
00306 **********************************************************************/ 00307 00308 void compute_block_pitch(TO_BLOCK *block, // input list 00309 FCOORD rotation, // for drawing 00310 inT32 block_index, // block number 00311 BOOL8 testing_on) { // correct orientation 00312 TBOX block_box; //bounding box 00313 00314 block_box = block->block->bounding_box (); 00315 if (testing_on && textord_debug_pitch_test) { 00316 tprintf ("Block %d at (%d,%d)->(%d,%d)\n", 00317 block_index, 00318 block_box.left (), block_box.bottom (), 00319 block_box.right (), block_box.top ()); 00320 } 00321 block->min_space = (inT32) floor (block->xheight 00322 * textord_words_default_minspace); 00323 block->max_nonspace = (inT32) ceil (block->xheight 00324 * textord_words_default_nonspace); 00325 block->fixed_pitch = 0.0f; 00326 block->space_size = (float) block->min_space; 00327 block->kern_size = (float) block->max_nonspace; 00328 block->pr_nonsp = block->xheight * words_default_prop_nonspace; 00329 block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop; 00330 if (!block->get_rows ()->empty ()) { 00331 ASSERT_HOST (block->xheight > 0); 00332 find_repeated_chars(block, textord_show_initial_words && testing_on); 00333 #ifndef GRAPHICS_DISABLED 00334 if (textord_show_initial_words && testing_on) 00335 //overlap_picture_ops(TRUE); 00336 ScrollView::Update(); 00337 #endif 00338 compute_rows_pitch(block, 00339 block_index, 00340 textord_debug_pitch_test &&testing_on); 00341 } 00342 } 00343 00344 00345 /********************************************************************** 00346 * compute_rows_pitch 00347 * 00348 * Decide whether each row is fixed pitch individually. 
00349 **********************************************************************/ 00350 00351 BOOL8 compute_rows_pitch( //find line stats 00352 TO_BLOCK *block, //block to do 00353 inT32 block_index, //block number 00354 BOOL8 testing_on //correct orientation 00355 ) { 00356 inT32 maxwidth; //of spaces 00357 TO_ROW *row; //current row 00358 inT32 row_index; //row number. 00359 float lower, upper; //cluster thresholds 00360 TO_ROW_IT row_it = block->get_rows (); 00361 00362 row_index = 1; 00363 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00364 row = row_it.data (); 00365 ASSERT_HOST (row->xheight > 0); 00366 row->compute_vertical_projection (); 00367 maxwidth = (inT32) ceil (row->xheight * textord_words_maxspace); 00368 if (row_pitch_stats (row, maxwidth, testing_on) 00369 && find_row_pitch (row, maxwidth, 00370 textord_dotmatrix_gap + 1, block, block_index, 00371 row_index, testing_on)) { 00372 if (row->fixed_pitch == 0) { 00373 lower = row->pr_nonsp; 00374 upper = row->pr_space; 00375 row->space_size = upper; 00376 row->kern_size = lower; 00377 } 00378 } 00379 else { 00380 row->fixed_pitch = 0.0f; //insufficient data 00381 row->pitch_decision = PITCH_DUNNO; 00382 } 00383 row_index++; 00384 } 00385 return FALSE; 00386 } 00387 00388 00389 /********************************************************************** 00390 * try_doc_fixed 00391 * 00392 * Attempt to call the entire document fixed pitch. 00393 **********************************************************************/ 00394 00395 BOOL8 try_doc_fixed( //determine pitch 00396 ICOORD page_tr, //top right 00397 TO_BLOCK_LIST *port_blocks, //input list 00398 float gradient //page skew 00399 ) { 00400 inT16 master_x; //uniform shifts 00401 inT16 pitch; //median pitch. 
00402 int x; //profile coord 00403 int prop_blocks; //correct counts 00404 int fixed_blocks; 00405 int total_row_count; //total in page 00406 //iterator 00407 TO_BLOCK_IT block_it = port_blocks; 00408 TO_BLOCK *block; //current block; 00409 TO_ROW_IT row_it; //row iterator 00410 TO_ROW *row; //current row 00411 inT16 projection_left; //edges 00412 inT16 projection_right; 00413 inT16 row_left; //edges of row 00414 inT16 row_right; 00415 ICOORDELT_LIST *master_cells; //cells for page 00416 float master_y; //uniform shifts 00417 float shift_factor; //page skew correction 00418 float row_shift; //shift for row 00419 float final_pitch; //output pitch 00420 float row_y; //baseline 00421 STATS projection; //entire page 00422 STATS pitches (0, MAX_ALLOWED_PITCH); 00423 //for median 00424 float sp_sd; //space sd 00425 inT16 mid_cuts; //no of cheap cuts 00426 float pitch_sd; //sync rating 00427 00428 if (block_it.empty () 00429 // || block_it.data()==block_it.data_relative(1) 00430 || !textord_blockndoc_fixed) 00431 return FALSE; 00432 shift_factor = gradient / (gradient * gradient + 1); 00433 row_it.set_to_list (block_it.data ()->get_rows ()); 00434 master_x = row_it.data ()->projection_left; 00435 master_y = row_it.data ()->baseline.y (master_x); 00436 projection_left = MAX_INT16; 00437 projection_right = -MAX_INT16; 00438 prop_blocks = 0; 00439 fixed_blocks = 0; 00440 total_row_count = 0; 00441 00442 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00443 block_it.forward ()) { 00444 block = block_it.data (); 00445 row_it.set_to_list (block->get_rows ()); 00446 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00447 row = row_it.data (); 00448 total_row_count++; 00449 if (row->fixed_pitch > 0) 00450 pitches.add ((inT32) (row->fixed_pitch), 1); 00451 //find median 00452 row_y = row->baseline.y (master_x); 00453 row_left = 00454 (inT16) (row->projection_left - 00455 shift_factor * (master_y - row_y)); 00456 row_right = 00457 (inT16) 
(row->projection_right - 00458 shift_factor * (master_y - row_y)); 00459 if (row_left < projection_left) 00460 projection_left = row_left; 00461 if (row_right > projection_right) 00462 projection_right = row_right; 00463 } 00464 } 00465 if (pitches.get_total () == 0) 00466 return FALSE; 00467 projection.set_range (projection_left, projection_right); 00468 00469 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00470 block_it.forward ()) { 00471 block = block_it.data (); 00472 row_it.set_to_list (block->get_rows ()); 00473 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00474 row = row_it.data (); 00475 row_y = row->baseline.y (master_x); 00476 row_left = 00477 (inT16) (row->projection_left - 00478 shift_factor * (master_y - row_y)); 00479 for (x = row->projection_left; x < row->projection_right; 00480 x++, row_left++) { 00481 projection.add (row_left, row->projection.pile_count (x)); 00482 } 00483 } 00484 } 00485 00486 row_it.set_to_list (block_it.data ()->get_rows ()); 00487 row = row_it.data (); 00488 #ifndef GRAPHICS_DISABLED 00489 if (textord_show_page_cuts && to_win != NULL) 00490 projection.plot (to_win, projection_left, 00491 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); 00492 #endif 00493 final_pitch = pitches.ile (0.5); 00494 pitch = (inT16) final_pitch; 00495 pitch_sd = 00496 tune_row_pitch (row, &projection, projection_left, projection_right, 00497 pitch * 0.75, final_pitch, sp_sd, mid_cuts, 00498 &row->char_cells, FALSE); 00499 00500 if (textord_debug_pitch_metric) 00501 tprintf 00502 ("try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n", 00503 prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, 00504 pitch_sd / total_row_count, pitch_sd / pitch, 00505 pitch_sd / total_row_count / pitch); 00506 00507 #ifndef GRAPHICS_DISABLED 00508 if (textord_show_page_cuts && to_win != NULL) { 00509 master_cells = &row->char_cells; 00510 for (block_it.mark_cycle_pt 
(); !block_it.cycled_list (); 00511 block_it.forward ()) { 00512 block = block_it.data (); 00513 row_it.set_to_list (block->get_rows ()); 00514 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); 00515 row_it.forward ()) { 00516 row = row_it.data (); 00517 row_y = row->baseline.y (master_x); 00518 row_shift = shift_factor * (master_y - row_y); 00519 plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells); 00520 } 00521 } 00522 } 00523 #endif 00524 row->char_cells.clear (); 00525 return FALSE; 00526 } 00527 00528 00529 /********************************************************************** 00530 * try_block_fixed 00531 * 00532 * Try to call the entire block fixed. 00533 **********************************************************************/ 00534 00535 BOOL8 try_block_fixed( //find line stats 00536 TO_BLOCK *block, //block to do 00537 inT32 block_index //block number 00538 ) { 00539 return FALSE; 00540 } 00541 00542 00543 /********************************************************************** 00544 * try_rows_fixed 00545 * 00546 * Decide whether each row is fixed pitch individually. 00547 **********************************************************************/ 00548 00549 BOOL8 try_rows_fixed( //find line stats 00550 TO_BLOCK *block, //block to do 00551 inT32 block_index, //block number 00552 BOOL8 testing_on //correct orientation 00553 ) { 00554 TO_ROW *row; //current row 00555 inT32 row_index; //row number. 
00556 inT32 def_fixed = 0; //counters 00557 inT32 def_prop = 0; 00558 inT32 maybe_fixed = 0; 00559 inT32 maybe_prop = 0; 00560 inT32 dunno = 0; 00561 inT32 corr_fixed = 0; 00562 inT32 corr_prop = 0; 00563 float lower, upper; //cluster thresholds 00564 TO_ROW_IT row_it = block->get_rows (); 00565 00566 row_index = 1; 00567 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00568 row = row_it.data (); 00569 ASSERT_HOST (row->xheight > 0); 00570 if (row->fixed_pitch > 0 && 00571 fixed_pitch_row(row, block->block, block_index)) { 00572 if (row->fixed_pitch == 0) { 00573 lower = row->pr_nonsp; 00574 upper = row->pr_space; 00575 row->space_size = upper; 00576 row->kern_size = lower; 00577 } 00578 } 00579 row_index++; 00580 } 00581 count_block_votes(block, 00582 def_fixed, 00583 def_prop, 00584 maybe_fixed, 00585 maybe_prop, 00586 corr_fixed, 00587 corr_prop, 00588 dunno); 00589 if (testing_on 00590 && (textord_debug_pitch_test 00591 || textord_blocksall_prop || textord_blocksall_fixed)) { 00592 tprintf ("Initially:"); 00593 print_block_counts(block, block_index); 00594 } 00595 if (def_fixed > def_prop * textord_words_veto_power) 00596 block->pitch_decision = PITCH_DEF_FIXED; 00597 else if (def_prop > def_fixed * textord_words_veto_power) 00598 block->pitch_decision = PITCH_DEF_PROP; 00599 else if (def_fixed > 0 || def_prop > 0) 00600 block->pitch_decision = PITCH_DUNNO; 00601 else if (maybe_fixed > maybe_prop * textord_words_veto_power) 00602 block->pitch_decision = PITCH_MAYBE_FIXED; 00603 else if (maybe_prop > maybe_fixed * textord_words_veto_power) 00604 block->pitch_decision = PITCH_MAYBE_PROP; 00605 else 00606 block->pitch_decision = PITCH_DUNNO; 00607 return FALSE; 00608 } 00609 00610 00611 /********************************************************************** 00612 * print_block_counts 00613 * 00614 * Count up how many rows have what decision and print the results. 
00615 **********************************************************************/ 00616 00617 void print_block_counts( //find line stats 00618 TO_BLOCK *block, //block to do 00619 inT32 block_index //block number 00620 ) { 00621 inT32 def_fixed = 0; //counters 00622 inT32 def_prop = 0; 00623 inT32 maybe_fixed = 0; 00624 inT32 maybe_prop = 0; 00625 inT32 dunno = 0; 00626 inT32 corr_fixed = 0; 00627 inT32 corr_prop = 0; 00628 00629 count_block_votes(block, 00630 def_fixed, 00631 def_prop, 00632 maybe_fixed, 00633 maybe_prop, 00634 corr_fixed, 00635 corr_prop, 00636 dunno); 00637 tprintf ("Block %d has (%d,%d,%d)", 00638 block_index, def_fixed, maybe_fixed, corr_fixed); 00639 if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed)) 00640 tprintf (" (Wrongly)"); 00641 tprintf (" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop); 00642 if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop)) 00643 tprintf (" (Wrongly)"); 00644 tprintf (" prop, %d dunno\n", dunno); 00645 } 00646 00647 00648 /********************************************************************** 00649 * count_block_votes 00650 * 00651 * Count the number of rows in the block with each kind of pitch_decision. 
00652 **********************************************************************/ 00653 00654 void count_block_votes( //find line stats 00655 TO_BLOCK *block, //block to do 00656 inT32 &def_fixed, //add to counts 00657 inT32 &def_prop, 00658 inT32 &maybe_fixed, 00659 inT32 &maybe_prop, 00660 inT32 &corr_fixed, 00661 inT32 &corr_prop, 00662 inT32 &dunno) { 00663 TO_ROW *row; //current row 00664 TO_ROW_IT row_it = block->get_rows (); 00665 00666 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00667 row = row_it.data (); 00668 switch (row->pitch_decision) { 00669 case PITCH_DUNNO: 00670 dunno++; 00671 break; 00672 case PITCH_DEF_PROP: 00673 def_prop++; 00674 break; 00675 case PITCH_MAYBE_PROP: 00676 maybe_prop++; 00677 break; 00678 case PITCH_DEF_FIXED: 00679 def_fixed++; 00680 break; 00681 case PITCH_MAYBE_FIXED: 00682 maybe_fixed++; 00683 break; 00684 case PITCH_CORR_PROP: 00685 corr_prop++; 00686 break; 00687 case PITCH_CORR_FIXED: 00688 corr_fixed++; 00689 break; 00690 } 00691 } 00692 } 00693 00694 00695 /********************************************************************** 00696 * row_pitch_stats 00697 * 00698 * Decide whether each row is fixed pitch individually. 
00699 **********************************************************************/ 00700 00701 BOOL8 row_pitch_stats( //find line stats 00702 TO_ROW *row, //current row 00703 inT32 maxwidth, //of spaces 00704 BOOL8 testing_on //correct orientation 00705 ) { 00706 BLOBNBOX *blob; //current blob 00707 int gap_index; //current gap 00708 inT32 prev_x; //end of prev blob 00709 inT32 cluster_count; //no of clusters 00710 inT32 prev_count; //of clusters 00711 inT32 smooth_factor; //for smoothing stats 00712 TBOX blob_box; //bounding box 00713 float lower, upper; //cluster thresholds 00714 //gap sizes 00715 float gaps[BLOCK_STATS_CLUSTERS]; 00716 //blobs 00717 BLOBNBOX_IT blob_it = row->blob_list (); 00718 STATS gap_stats (0, maxwidth); 00719 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; 00720 //clusters 00721 00722 smooth_factor = 00723 (inT32) (row->xheight * textord_wordstats_smooth_factor + 1.5); 00724 if (!blob_it.empty ()) { 00725 prev_x = blob_it.data ()->bounding_box ().right (); 00726 blob_it.forward (); 00727 while (!blob_it.at_first ()) { 00728 blob = blob_it.data (); 00729 if (!blob->joined_to_prev ()) { 00730 blob_box = blob->bounding_box (); 00731 if (blob_box.left () - prev_x < maxwidth) 00732 gap_stats.add (blob_box.left () - prev_x, 1); 00733 prev_x = blob_box.right (); 00734 } 00735 blob_it.forward (); 00736 } 00737 } 00738 if (gap_stats.get_total () == 0) { 00739 return FALSE; 00740 } 00741 cluster_count = 0; 00742 lower = row->xheight * words_initial_lower; 00743 upper = row->xheight * words_initial_upper; 00744 gap_stats.smooth (smooth_factor); 00745 do { 00746 prev_count = cluster_count; 00747 cluster_count = gap_stats.cluster (lower, upper, 00748 textord_spacesize_ratioprop, 00749 BLOCK_STATS_CLUSTERS, cluster_stats); 00750 } 00751 while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); 00752 if (cluster_count < 1) { 00753 return FALSE; 00754 } 00755 for (gap_index = 0; gap_index < cluster_count; gap_index++) 00756 gaps[gap_index] = 
cluster_stats[gap_index + 1].ile (0.5); 00757 //get medians 00758 if (testing_on) { 00759 tprintf ("cluster_count=%d:", cluster_count); 00760 for (gap_index = 0; gap_index < cluster_count; gap_index++) 00761 tprintf (" %g(%d)", gaps[gap_index], 00762 cluster_stats[gap_index + 1].get_total ()); 00763 tprintf ("\n"); 00764 } 00765 qsort (gaps, cluster_count, sizeof (float), sort_floats); 00766 00767 //Try to find proportional non-space and space for row. 00768 lower = row->xheight * words_default_prop_nonspace; 00769 upper = row->xheight * textord_words_min_minspace; 00770 for (gap_index = 0; gap_index < cluster_count 00771 && gaps[gap_index] < lower; gap_index++); 00772 if (gap_index == 0) { 00773 if (testing_on) 00774 tprintf ("No clusters below nonspace threshold!!\n"); 00775 if (cluster_count > 1) { 00776 row->pr_nonsp = gaps[0]; 00777 row->pr_space = gaps[1]; 00778 } 00779 else { 00780 row->pr_nonsp = lower; 00781 row->pr_space = gaps[0]; 00782 } 00783 } 00784 else { 00785 row->pr_nonsp = gaps[gap_index - 1]; 00786 while (gap_index < cluster_count && gaps[gap_index] < upper) 00787 gap_index++; 00788 if (gap_index == cluster_count) { 00789 if (testing_on) 00790 tprintf ("No clusters above nonspace threshold!!\n"); 00791 row->pr_space = lower * textord_spacesize_ratioprop; 00792 } 00793 else 00794 row->pr_space = gaps[gap_index]; 00795 } 00796 00797 //Now try to find the fixed pitch space and non-space. 
00798 upper = row->xheight * words_default_fixed_space; 00799 for (gap_index = 0; gap_index < cluster_count 00800 && gaps[gap_index] < upper; gap_index++); 00801 if (gap_index == 0) { 00802 if (testing_on) 00803 tprintf ("No clusters below space threshold!!\n"); 00804 row->fp_nonsp = upper; 00805 row->fp_space = gaps[0]; 00806 } 00807 else { 00808 row->fp_nonsp = gaps[gap_index - 1]; 00809 if (gap_index == cluster_count) { 00810 if (testing_on) 00811 tprintf ("No clusters above space threshold!!\n"); 00812 row->fp_space = row->xheight; 00813 } 00814 else 00815 row->fp_space = gaps[gap_index]; 00816 } 00817 if (testing_on) { 00818 tprintf 00819 ("Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, fp_space=%g\n", 00820 row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space); 00821 } 00822 return TRUE; //computed some stats 00823 } 00824 00825 00826 /********************************************************************** 00827 * find_row_pitch 00828 * 00829 * Check to see if this row could be fixed pitch using the given spacings. 00830 * Blobs with gaps smaller than the lower threshold are assumed to be one. 00831 * The larger threshold is the word gap threshold. 
00832 **********************************************************************/ 00833 00834 BOOL8 find_row_pitch( //find lines 00835 TO_ROW *row, //row to do 00836 inT32 maxwidth, //max permitted space 00837 inT32 dm_gap, //ignorable gaps 00838 TO_BLOCK *block, //block of row 00839 inT32 block_index, //block_number 00840 inT32 row_index, //number of row 00841 BOOL8 testing_on //correct orientation 00842 ) { 00843 BOOL8 used_dm_model; //looks lik dot matrix 00844 float min_space; //estimate threshold 00845 float non_space; //gap size 00846 float gap_iqr; //interquartile range 00847 float pitch_iqr; 00848 float dm_gap_iqr; //interquartile range 00849 float dm_pitch_iqr; 00850 float dm_pitch; //pitch with dm on 00851 float pitch; //revised estimate 00852 float initial_pitch; //guess at pitch 00853 STATS gap_stats (0, maxwidth); 00854 //centre-centre 00855 STATS pitch_stats (0, maxwidth); 00856 00857 row->fixed_pitch = 0.0f; 00858 initial_pitch = row->fp_space; 00859 if (initial_pitch > row->xheight * (1 + words_default_fixed_limit)) 00860 initial_pitch = row->xheight;//keep pitch decent 00861 non_space = row->fp_nonsp; 00862 if (non_space > initial_pitch) 00863 non_space = initial_pitch; 00864 min_space = (initial_pitch + non_space) / 2; 00865 00866 if (!count_pitch_stats (row, &gap_stats, &pitch_stats, 00867 initial_pitch, min_space, TRUE, FALSE, dm_gap)) { 00868 dm_gap_iqr = 0.0001; 00869 dm_pitch_iqr = maxwidth * 2.0f; 00870 dm_pitch = initial_pitch; 00871 } 00872 else { 00873 dm_gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00874 dm_pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00875 dm_pitch = pitch_stats.ile (0.5); 00876 } 00877 gap_stats.clear (); 00878 pitch_stats.clear (); 00879 if (!count_pitch_stats (row, &gap_stats, &pitch_stats, 00880 initial_pitch, min_space, TRUE, FALSE, 0)) { 00881 gap_iqr = 0.0001; 00882 pitch_iqr = maxwidth * 3.0f; 00883 } 00884 else { 00885 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00886 pitch_iqr 
= pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00887 if (testing_on) 00888 tprintf 00889 ("First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n", 00890 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5)); 00891 initial_pitch = pitch_stats.ile (0.5); 00892 if (min_space > initial_pitch 00893 && count_pitch_stats (row, &gap_stats, &pitch_stats, 00894 initial_pitch, initial_pitch, TRUE, FALSE, 0)) { 00895 min_space = initial_pitch; 00896 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00897 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00898 if (testing_on) 00899 tprintf 00900 ("Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n", 00901 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5)); 00902 initial_pitch = pitch_stats.ile (0.5); 00903 } 00904 } 00905 if (textord_debug_pitch_metric) 00906 tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:", 00907 block_index, row_index, 'X', 00908 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr, 00909 pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth ? 'D' : 00910 (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 
'S' : 'M')); 00911 if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) { 00912 row->pitch_decision = PITCH_DUNNO; 00913 if (textord_debug_pitch_metric) 00914 tprintf ("\n"); 00915 return FALSE; //insufficient data 00916 } 00917 if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) { 00918 if (testing_on) 00919 tprintf 00920 ("Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n", 00921 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr); 00922 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00923 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00924 pitch = pitch_stats.ile (0.5); 00925 used_dm_model = FALSE; 00926 } 00927 else { 00928 if (testing_on) 00929 tprintf 00930 ("Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n", 00931 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr); 00932 gap_iqr = dm_gap_iqr; 00933 pitch_iqr = dm_pitch_iqr; 00934 pitch = dm_pitch; 00935 used_dm_model = TRUE; 00936 } 00937 if (textord_debug_pitch_metric) { 00938 tprintf ("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:", 00939 pitch_iqr, gap_iqr, pitch); 00940 tprintf ("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:", 00941 pitch_iqr / gap_iqr, pitch_iqr / block->xheight, 00942 pitch_iqr < gap_iqr * textord_fpiqr_ratio 00943 && pitch_iqr < block->xheight * textord_max_pitch_iqr 00944 && pitch < block->xheight * textord_words_default_maxspace 00945 ? 
'F' : 'P'); 00946 } 00947 if (pitch_iqr < gap_iqr * textord_fpiqr_ratio 00948 && pitch_iqr < block->xheight * textord_max_pitch_iqr 00949 && pitch < block->xheight * textord_words_default_maxspace) 00950 row->pitch_decision = PITCH_MAYBE_FIXED; 00951 else 00952 row->pitch_decision = PITCH_MAYBE_PROP; 00953 row->fixed_pitch = pitch; 00954 row->kern_size = gap_stats.ile (0.5); 00955 row->min_space = (inT32) (row->fixed_pitch + non_space) / 2; 00956 if (row->min_space > row->fixed_pitch) 00957 row->min_space = (inT32) row->fixed_pitch; 00958 row->max_nonspace = row->min_space; 00959 row->space_size = row->fixed_pitch; 00960 row->space_threshold = (row->max_nonspace + row->min_space) / 2; 00961 row->used_dm_model = used_dm_model; 00962 return TRUE; 00963 } 00964 00965 00966 /********************************************************************** 00967 * fixed_pitch_row 00968 * 00969 * Check to see if this row could be fixed pitch using the given spacings. 00970 * Blobs with gaps smaller than the lower threshold are assumed to be one. 00971 * The larger threshold is the word gap threshold. 00972 **********************************************************************/ 00973 00974 BOOL8 fixed_pitch_row(TO_ROW *row, // row to do 00975 BLOCK* block, 00976 inT32 block_index // block_number 00977 ) { 00978 const char *res_string; // pitch result 00979 inT16 mid_cuts; // no of cheap cuts 00980 float non_space; // gap size 00981 float pitch_sd; // error on pitch 00982 float sp_sd = 0.0f; // space sd 00983 00984 non_space = row->fp_nonsp; 00985 if (non_space > row->fixed_pitch) 00986 non_space = row->fixed_pitch; 00987 POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL; 00988 if (textord_all_prop || (pb != NULL && !pb->IsText())) { 00989 // Set the decision to definitely proportional. 
00990 pitch_sd = textord_words_def_prop * row->fixed_pitch; 00991 row->pitch_decision = PITCH_DEF_PROP; 00992 } else { 00993 pitch_sd = tune_row_pitch (row, &row->projection, row->projection_left, 00994 row->projection_right, 00995 (row->fixed_pitch + non_space * 3) / 4, 00996 row->fixed_pitch, sp_sd, mid_cuts, 00997 &row->char_cells, 00998 block_index == textord_debug_block); 00999 if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch 01000 && ((pitsync_linear_version & 3) < 3 01001 || ((pitsync_linear_version & 3) >= 3 && (row->used_dm_model 01002 || sp_sd > 20 01003 || (pitch_sd == 0 && sp_sd > 10))))) { 01004 if (pitch_sd < textord_words_def_fixed * row->fixed_pitch 01005 && !row->all_caps 01006 && ((pitsync_linear_version & 3) < 3 || sp_sd > 20)) 01007 row->pitch_decision = PITCH_DEF_FIXED; 01008 else 01009 row->pitch_decision = PITCH_MAYBE_FIXED; 01010 } 01011 else if ((pitsync_linear_version & 3) < 3 01012 || sp_sd > 20 01013 || mid_cuts > 0 01014 || pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) { 01015 if (pitch_sd < textord_words_def_prop * row->fixed_pitch) 01016 row->pitch_decision = PITCH_MAYBE_PROP; 01017 else 01018 row->pitch_decision = PITCH_DEF_PROP; 01019 } 01020 else 01021 row->pitch_decision = PITCH_DUNNO; 01022 } 01023 01024 if (textord_debug_pitch_metric) { 01025 res_string = "??"; 01026 switch (row->pitch_decision) { 01027 case PITCH_DEF_PROP: 01028 res_string = "DP"; 01029 break; 01030 case PITCH_MAYBE_PROP: 01031 res_string = "MP"; 01032 break; 01033 case PITCH_DEF_FIXED: 01034 res_string = "DF"; 01035 break; 01036 case PITCH_MAYBE_FIXED: 01037 res_string = "MF"; 01038 break; 01039 default: 01040 res_string = "??"; 01041 } 01042 tprintf (":sd/p=%g:occ=%g:init_res=%s\n", 01043 pitch_sd / row->fixed_pitch, sp_sd, res_string); 01044 } 01045 return TRUE; 01046 } 01047 01048 01049 /********************************************************************** 01050 * count_pitch_stats 01051 * 01052 * Count up the gap and 
pitch stats on the block to see if it is fixed pitch. 01053 * Blobs with gaps smaller than the lower threshold are assumed to be one. 01054 * The larger threshold is the word gap threshold. 01055 * The return value indicates whether there were any decent values to use. 01056 **********************************************************************/ 01057 01058 BOOL8 count_pitch_stats( //find lines 01059 TO_ROW *row, //row to do 01060 STATS *gap_stats, //blob gaps 01061 STATS *pitch_stats, //centre-centre stats 01062 float initial_pitch, //guess at pitch 01063 float min_space, //estimate space size 01064 BOOL8 ignore_outsize, //discard big objects 01065 BOOL8 split_outsize, //split big objects 01066 inT32 dm_gap //ignorable gaps 01067 ) { 01068 BOOL8 prev_valid; //not word broken 01069 BLOBNBOX *blob; //current blob 01070 //blobs 01071 BLOBNBOX_IT blob_it = row->blob_list (); 01072 inT32 prev_right; //end of prev blob 01073 inT32 prev_centre; //centre of previous blob 01074 inT32 x_centre; //centre of this blob 01075 inT32 blob_width; //width of blob 01076 inT32 width_units; //no of widths in blob 01077 float width; //blob width 01078 TBOX blob_box; //bounding box 01079 TBOX joined_box; //of super blob 01080 01081 gap_stats->clear (); 01082 pitch_stats->clear (); 01083 if (blob_it.empty ()) 01084 return FALSE; 01085 prev_valid = FALSE; 01086 prev_centre = 0; 01087 prev_right = 0; //stop compiler warning 01088 joined_box = blob_it.data ()->bounding_box (); 01089 do { 01090 blob_it.forward (); 01091 blob = blob_it.data (); 01092 if (!blob->joined_to_prev ()) { 01093 blob_box = blob->bounding_box (); 01094 if ((blob_box.left () - joined_box.right () < dm_gap 01095 && !blob_it.at_first ()) 01096 || blob->cblob() == NULL) 01097 joined_box += blob_box; //merge blobs 01098 else { 01099 blob_width = joined_box.width (); 01100 if (split_outsize) { 01101 width_units = 01102 (inT32) floor ((float) blob_width / initial_pitch + 0.5); 01103 if (width_units < 1) 01104 width_units = 
1; 01105 width_units--; 01106 } 01107 else if (ignore_outsize) { 01108 width = (float) blob_width / initial_pitch; 01109 width_units = width < 1 + words_default_fixed_limit 01110 && width > 1 - words_default_fixed_limit ? 0 : -1; 01111 } 01112 else 01113 width_units = 0; //everything in 01114 x_centre = (inT32) (joined_box.left () 01115 + (blob_width - 01116 width_units * initial_pitch) / 2); 01117 if (prev_valid && width_units >= 0) { 01118 // if (width_units>0) 01119 // { 01120 // tprintf("wu=%d, width=%d, xc=%d, adding %d\n", 01121 // width_units,blob_width,x_centre,x_centre-prev_centre); 01122 // } 01123 gap_stats->add (joined_box.left () - prev_right, 1); 01124 pitch_stats->add (x_centre - prev_centre, 1); 01125 } 01126 prev_centre = (inT32) (x_centre + width_units * initial_pitch); 01127 prev_right = joined_box.right (); 01128 prev_valid = blob_box.left () - joined_box.right () < min_space; 01129 prev_valid = prev_valid && width_units >= 0; 01130 joined_box = blob_box; 01131 } 01132 } 01133 } 01134 while (!blob_it.at_first ()); 01135 return gap_stats->get_total () >= 3; 01136 } 01137 01138 01139 /********************************************************************** 01140 * tune_row_pitch 01141 * 01142 * Use a dp algorithm to fit the character cells and return the sd of 01143 * the cell size over the row. 
01144 **********************************************************************/ 01145 01146 float tune_row_pitch( //find fp cells 01147 TO_ROW *row, //row to do 01148 STATS *projection, //vertical projection 01149 inT16 projection_left, //edge of projection 01150 inT16 projection_right, //edge of projection 01151 float space_size, //size of blank 01152 float &initial_pitch, //guess at pitch 01153 float &best_sp_sd, //space sd 01154 inT16 &best_mid_cuts, //no of cheap cuts 01155 ICOORDELT_LIST *best_cells, //row cells 01156 BOOL8 testing_on //inidividual words 01157 ) { 01158 int pitch_delta; //offset pitch 01159 inT16 mid_cuts; //cheap cuts 01160 float pitch_sd; //current sd 01161 float best_sd; //best result 01162 float best_pitch; //pitch for best result 01163 float initial_sd; //starting error 01164 float sp_sd; //space sd 01165 ICOORDELT_LIST test_cells; //row cells 01166 ICOORDELT_IT best_it; //start of best list 01167 01168 if (textord_fast_pitch_test) 01169 return tune_row_pitch2 (row, projection, projection_left, 01170 projection_right, space_size, initial_pitch, 01171 best_sp_sd, 01172 //space sd 01173 best_mid_cuts, best_cells, testing_on); 01174 if (textord_disable_pitch_test) { 01175 best_sp_sd = initial_pitch; 01176 return initial_pitch; 01177 } 01178 initial_sd = 01179 compute_pitch_sd(row, 01180 projection, 01181 projection_left, 01182 projection_right, 01183 space_size, 01184 initial_pitch, 01185 best_sp_sd, 01186 best_mid_cuts, 01187 best_cells, 01188 testing_on); 01189 best_sd = initial_sd; 01190 best_pitch = initial_pitch; 01191 if (testing_on) 01192 tprintf ("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd); 01193 for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) { 01194 pitch_sd = 01195 compute_pitch_sd (row, projection, projection_left, projection_right, 01196 space_size, initial_pitch + pitch_delta, sp_sd, 01197 mid_cuts, &test_cells, testing_on); 01198 if (testing_on) 01199 tprintf ("testing pitch at %g, 
sd=%g\n", initial_pitch + pitch_delta, 01200 pitch_sd); 01201 if (pitch_sd < best_sd) { 01202 best_sd = pitch_sd; 01203 best_mid_cuts = mid_cuts; 01204 best_sp_sd = sp_sd; 01205 best_pitch = initial_pitch + pitch_delta; 01206 best_cells->clear (); 01207 best_it.set_to_list (best_cells); 01208 best_it.add_list_after (&test_cells); 01209 } 01210 else 01211 test_cells.clear (); 01212 if (pitch_sd > initial_sd) 01213 break; //getting worse 01214 } 01215 for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) { 01216 pitch_sd = 01217 compute_pitch_sd (row, projection, projection_left, projection_right, 01218 space_size, initial_pitch - pitch_delta, sp_sd, 01219 mid_cuts, &test_cells, testing_on); 01220 if (testing_on) 01221 tprintf ("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta, 01222 pitch_sd); 01223 if (pitch_sd < best_sd) { 01224 best_sd = pitch_sd; 01225 best_mid_cuts = mid_cuts; 01226 best_sp_sd = sp_sd; 01227 best_pitch = initial_pitch - pitch_delta; 01228 best_cells->clear (); 01229 best_it.set_to_list (best_cells); 01230 best_it.add_list_after (&test_cells); 01231 } 01232 else 01233 test_cells.clear (); 01234 if (pitch_sd > initial_sd) 01235 break; 01236 } 01237 initial_pitch = best_pitch; 01238 01239 if (textord_debug_pitch_metric) 01240 print_pitch_sd(row, 01241 projection, 01242 projection_left, 01243 projection_right, 01244 space_size, 01245 best_pitch); 01246 01247 return best_sd; 01248 } 01249 01250 01251 /********************************************************************** 01252 * tune_row_pitch 01253 * 01254 * Use a dp algorithm to fit the character cells and return the sd of 01255 * the cell size over the row. 
01256 **********************************************************************/ 01257 01258 float tune_row_pitch2( //find fp cells 01259 TO_ROW *row, //row to do 01260 STATS *projection, //vertical projection 01261 inT16 projection_left, //edge of projection 01262 inT16 projection_right, //edge of projection 01263 float space_size, //size of blank 01264 float &initial_pitch, //guess at pitch 01265 float &best_sp_sd, //space sd 01266 inT16 &best_mid_cuts, //no of cheap cuts 01267 ICOORDELT_LIST *best_cells, //row cells 01268 BOOL8 testing_on //inidividual words 01269 ) { 01270 int pitch_delta; //offset pitch 01271 inT16 pixel; //pixel coord 01272 inT16 best_pixel; //pixel coord 01273 inT16 best_delta; //best pitch 01274 inT16 best_pitch; //best pitch 01275 inT16 start; //of good range 01276 inT16 end; //of good range 01277 inT32 best_count; //lowest sum 01278 float best_sd; //best result 01279 STATS *sum_proj; //summed projection 01280 01281 best_sp_sd = initial_pitch; 01282 01283 best_pitch = static_cast<int>(initial_pitch); 01284 if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) { 01285 return initial_pitch; 01286 } 01287 sum_proj = new STATS[textord_pitch_range * 2 + 1]; 01288 if (sum_proj == NULL) 01289 return initial_pitch; 01290 01291 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; 01292 pitch_delta++) 01293 sum_proj[textord_pitch_range + pitch_delta].set_range (0, 01294 best_pitch + 01295 pitch_delta + 1); 01296 for (pixel = projection_left; pixel <= projection_right; pixel++) { 01297 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; 01298 pitch_delta++) { 01299 sum_proj[textord_pitch_range + pitch_delta].add( 01300 (pixel - projection_left) % (best_pitch + pitch_delta), 01301 projection->pile_count(pixel)); 01302 } 01303 } 01304 best_count = sum_proj[textord_pitch_range].pile_count (0); 01305 best_delta = 0; 01306 best_pixel = 0; 01307 for (pitch_delta = -textord_pitch_range; 
pitch_delta <= textord_pitch_range; 01308 pitch_delta++) { 01309 for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) { 01310 if (sum_proj[textord_pitch_range + pitch_delta].pile_count (pixel) 01311 < best_count) { 01312 best_count = 01313 sum_proj[textord_pitch_range + 01314 pitch_delta].pile_count (pixel); 01315 best_delta = pitch_delta; 01316 best_pixel = pixel; 01317 } 01318 } 01319 } 01320 if (testing_on) 01321 tprintf ("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n", 01322 initial_pitch, best_delta, best_count); 01323 best_pitch += best_delta; 01324 initial_pitch = best_pitch; 01325 best_count++; 01326 best_count += best_count; 01327 for (start = best_pixel - 2; start > best_pixel - best_pitch 01328 && sum_proj[textord_pitch_range + 01329 best_delta].pile_count (start % best_pitch) <= best_count; 01330 start--); 01331 for (end = best_pixel + 2; 01332 end < best_pixel + best_pitch 01333 && sum_proj[textord_pitch_range + 01334 best_delta].pile_count (end % best_pitch) <= best_count; 01335 end++); 01336 01337 best_sd = 01338 compute_pitch_sd(row, 01339 projection, 01340 projection_left, 01341 projection_right, 01342 space_size, 01343 initial_pitch, 01344 best_sp_sd, 01345 best_mid_cuts, 01346 best_cells, 01347 testing_on, 01348 start, 01349 end); 01350 if (testing_on) 01351 tprintf ("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch, 01352 best_sd); 01353 01354 if (textord_debug_pitch_metric) 01355 print_pitch_sd(row, 01356 projection, 01357 projection_left, 01358 projection_right, 01359 space_size, 01360 initial_pitch); 01361 01362 delete[]sum_proj; 01363 01364 return best_sd; 01365 } 01366 01367 01368 /********************************************************************** 01369 * compute_pitch_sd 01370 * 01371 * Use a dp algorithm to fit the character cells and return the sd of 01372 * the cell size over the row. 
01373 **********************************************************************/ 01374 01375 float compute_pitch_sd( //find fp cells 01376 TO_ROW *row, //row to do 01377 STATS *projection, //vertical projection 01378 inT16 projection_left, //edge 01379 inT16 projection_right, //edge 01380 float space_size, //size of blank 01381 float initial_pitch, //guess at pitch 01382 float &sp_sd, //space sd 01383 inT16 &mid_cuts, //no of free cuts 01384 ICOORDELT_LIST *row_cells, //list of chop pts 01385 BOOL8 testing_on, //inidividual words 01386 inT16 start, //start of good range 01387 inT16 end //end of good range 01388 ) { 01389 inT16 occupation; //no of cells in word. 01390 //blobs 01391 BLOBNBOX_IT blob_it = row->blob_list (); 01392 BLOBNBOX_IT start_it; //start of word 01393 BLOBNBOX_IT plot_it; //for plotting 01394 inT16 blob_count; //no of blobs 01395 TBOX blob_box; //bounding box 01396 TBOX prev_box; //of super blob 01397 inT32 prev_right; //of word sync 01398 int scale_factor; //on scores for big words 01399 inT32 sp_count; //spaces 01400 FPSEGPT_LIST seg_list; //char cells 01401 FPSEGPT_IT seg_it; //iterator 01402 inT16 segpos; //position of segment 01403 inT16 cellpos; //previous cell boundary 01404 //iterator 01405 ICOORDELT_IT cell_it = row_cells; 01406 ICOORDELT *cell; //new cell 01407 double sqsum; //sum of squares 01408 double spsum; //of spaces 01409 double sp_var; //space error 01410 double word_sync; //result for word 01411 inT32 total_count; //total blobs 01412 01413 if ((pitsync_linear_version & 3) > 1) { 01414 word_sync = compute_pitch_sd2 (row, projection, projection_left, 01415 projection_right, initial_pitch, 01416 occupation, mid_cuts, row_cells, 01417 testing_on, start, end); 01418 sp_sd = occupation; 01419 return word_sync; 01420 } 01421 mid_cuts = 0; 01422 cellpos = 0; 01423 total_count = 0; 01424 sqsum = 0; 01425 sp_count = 0; 01426 spsum = 0; 01427 prev_right = -1; 01428 if (blob_it.empty ()) 01429 return space_size * 10; 01430 #ifndef 
GRAPHICS_DISABLED 01431 if (testing_on && to_win != NULL) { 01432 blob_box = blob_it.data ()->bounding_box (); 01433 projection->plot (to_win, projection_left, 01434 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); 01435 } 01436 #endif 01437 start_it = blob_it; 01438 blob_count = 0; 01439 blob_box = box_next (&blob_it);//first blob 01440 blob_it.mark_cycle_pt (); 01441 do { 01442 for (; blob_count > 0; blob_count--) 01443 box_next(&start_it); 01444 do { 01445 prev_box = blob_box; 01446 blob_count++; 01447 blob_box = box_next (&blob_it); 01448 } 01449 while (!blob_it.cycled_list () 01450 && blob_box.left () - prev_box.right () < space_size); 01451 plot_it = start_it; 01452 if (pitsync_linear_version & 3) 01453 word_sync = 01454 check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2, 01455 projection, projection_left, projection_right, 01456 row->xheight * textord_projection_scale, 01457 occupation, &seg_list, start, end); 01458 else 01459 word_sync = 01460 check_pitch_sync (&start_it, blob_count, (inT16) initial_pitch, 2, 01461 projection, &seg_list); 01462 if (testing_on) { 01463 tprintf ("Word ending at (%d,%d), len=%d, sync rating=%g, ", 01464 prev_box.right (), prev_box.top (), 01465 seg_list.length () - 1, word_sync); 01466 seg_it.set_to_list (&seg_list); 01467 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); 01468 seg_it.forward ()) { 01469 if (seg_it.data ()->faked) 01470 tprintf ("(F)"); 01471 tprintf ("%d, ", seg_it.data ()->position ()); 01472 // tprintf("C=%g, s=%g, sq=%g\n", 01473 // seg_it.data()->cost_function(), 01474 // seg_it.data()->sum(), 01475 // seg_it.data()->squares()); 01476 } 01477 tprintf ("\n"); 01478 } 01479 #ifndef GRAPHICS_DISABLED 01480 if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL) 01481 plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); 01482 #endif 01483 seg_it.set_to_list (&seg_list); 01484 if (prev_right >= 0) { 01485 sp_var = seg_it.data ()->position () - prev_right; 01486 sp_var 
-= floor (sp_var / initial_pitch + 0.5) * initial_pitch; 01487 sp_var *= sp_var; 01488 spsum += sp_var; 01489 sp_count++; 01490 } 01491 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) { 01492 segpos = seg_it.data ()->position (); 01493 if (cell_it.empty () || segpos > cellpos + initial_pitch / 2) { 01494 //big gap 01495 while (!cell_it.empty () && segpos > cellpos + initial_pitch * 3 / 2) { 01496 cell = new ICOORDELT (cellpos + (inT16) initial_pitch, 0); 01497 cell_it.add_after_then_move (cell); 01498 cellpos += (inT16) initial_pitch; 01499 } 01500 //make new one 01501 cell = new ICOORDELT (segpos, 0); 01502 cell_it.add_after_then_move (cell); 01503 cellpos = segpos; 01504 } 01505 else if (segpos > cellpos - initial_pitch / 2) { 01506 cell = cell_it.data (); 01507 //average positions 01508 cell->set_x ((cellpos + segpos) / 2); 01509 cellpos = cell->x (); 01510 } 01511 } 01512 seg_it.move_to_last (); 01513 prev_right = seg_it.data ()->position (); 01514 if (textord_pitch_scalebigwords) { 01515 scale_factor = (seg_list.length () - 2) / 2; 01516 if (scale_factor < 1) 01517 scale_factor = 1; 01518 } 01519 else 01520 scale_factor = 1; 01521 sqsum += word_sync * scale_factor; 01522 total_count += (seg_list.length () - 1) * scale_factor; 01523 seg_list.clear (); 01524 } 01525 while (!blob_it.cycled_list ()); 01526 sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0; 01527 return total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10; 01528 } 01529 01530 01531 /********************************************************************** 01532 * compute_pitch_sd2 01533 * 01534 * Use a dp algorithm to fit the character cells and return the sd of 01535 * the cell size over the row. 
01536 **********************************************************************/ 01537 01538 float compute_pitch_sd2( //find fp cells 01539 TO_ROW *row, //row to do 01540 STATS *projection, //vertical projection 01541 inT16 projection_left, //edge 01542 inT16 projection_right, //edge 01543 float initial_pitch, //guess at pitch 01544 inT16 &occupation, //no of occupied cells 01545 inT16 &mid_cuts, //no of free cuts 01546 ICOORDELT_LIST *row_cells, //list of chop pts 01547 BOOL8 testing_on, //inidividual words 01548 inT16 start, //start of good range 01549 inT16 end //end of good range 01550 ) { 01551 //blobs 01552 BLOBNBOX_IT blob_it = row->blob_list (); 01553 BLOBNBOX_IT plot_it; 01554 inT16 blob_count; //no of blobs 01555 TBOX blob_box; //bounding box 01556 FPSEGPT_LIST seg_list; //char cells 01557 FPSEGPT_IT seg_it; //iterator 01558 inT16 segpos; //position of segment 01559 //iterator 01560 ICOORDELT_IT cell_it = row_cells; 01561 ICOORDELT *cell; //new cell 01562 double word_sync; //result for word 01563 01564 mid_cuts = 0; 01565 if (blob_it.empty ()) { 01566 occupation = 0; 01567 return initial_pitch * 10; 01568 } 01569 #ifndef GRAPHICS_DISABLED 01570 if (testing_on && to_win != NULL) { 01571 projection->plot (to_win, projection_left, 01572 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); 01573 } 01574 #endif 01575 blob_count = 0; 01576 blob_it.mark_cycle_pt (); 01577 do { 01578 //first blob 01579 blob_box = box_next (&blob_it); 01580 blob_count++; 01581 } 01582 while (!blob_it.cycled_list ()); 01583 plot_it = blob_it; 01584 word_sync = check_pitch_sync2 (&blob_it, blob_count, (inT16) initial_pitch, 01585 2, projection, projection_left, 01586 projection_right, 01587 row->xheight * textord_projection_scale, 01588 occupation, &seg_list, start, end); 01589 if (testing_on) { 01590 tprintf ("Row ending at (%d,%d), len=%d, sync rating=%g, ", 01591 blob_box.right (), blob_box.top (), 01592 seg_list.length () - 1, word_sync); 01593 seg_it.set_to_list (&seg_list); 01594 
for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) { 01595 if (seg_it.data ()->faked) 01596 tprintf ("(F)"); 01597 tprintf ("%d, ", seg_it.data ()->position ()); 01598 // tprintf("C=%g, s=%g, sq=%g\n", 01599 // seg_it.data()->cost_function(), 01600 // seg_it.data()->sum(), 01601 // seg_it.data()->squares()); 01602 } 01603 tprintf ("\n"); 01604 } 01605 #ifndef GRAPHICS_DISABLED 01606 if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL) 01607 plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); 01608 #endif 01609 seg_it.set_to_list (&seg_list); 01610 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) { 01611 segpos = seg_it.data ()->position (); 01612 //make new one 01613 cell = new ICOORDELT (segpos, 0); 01614 cell_it.add_after_then_move (cell); 01615 if (seg_it.at_last ()) 01616 mid_cuts = seg_it.data ()->cheap_cuts (); 01617 } 01618 seg_list.clear (); 01619 return occupation > 0 ? sqrt (word_sync / occupation) : initial_pitch * 10; 01620 } 01621 01622 01623 /********************************************************************** 01624 * print_pitch_sd 01625 * 01626 * Use a dp algorithm to fit the character cells and return the sd of 01627 * the cell size over the row. 
01628 **********************************************************************/ 01629 01630 void print_pitch_sd( //find fp cells 01631 TO_ROW *row, //row to do 01632 STATS *projection, //vertical projection 01633 inT16 projection_left, //edges //size of blank 01634 inT16 projection_right, 01635 float space_size, 01636 float initial_pitch //guess at pitch 01637 ) { 01638 const char *res2; //pitch result 01639 inT16 occupation; //used cells 01640 float sp_sd; //space sd 01641 //blobs 01642 BLOBNBOX_IT blob_it = row->blob_list (); 01643 BLOBNBOX_IT start_it; //start of word 01644 BLOBNBOX_IT row_start; //start of row 01645 inT16 blob_count; //no of blobs 01646 inT16 total_blob_count; //total blobs in line 01647 TBOX blob_box; //bounding box 01648 TBOX prev_box; //of super blob 01649 inT32 prev_right; //of word sync 01650 int scale_factor; //on scores for big words 01651 inT32 sp_count; //spaces 01652 FPSEGPT_LIST seg_list; //char cells 01653 FPSEGPT_IT seg_it; //iterator 01654 double sqsum; //sum of squares 01655 double spsum; //of spaces 01656 double sp_var; //space error 01657 double word_sync; //result for word 01658 double total_count; //total cuts 01659 01660 if (blob_it.empty ()) 01661 return; 01662 row_start = blob_it; 01663 total_blob_count = 0; 01664 01665 total_count = 0; 01666 sqsum = 0; 01667 sp_count = 0; 01668 spsum = 0; 01669 prev_right = -1; 01670 blob_it = row_start; 01671 start_it = blob_it; 01672 blob_count = 0; 01673 blob_box = box_next (&blob_it);//first blob 01674 blob_it.mark_cycle_pt (); 01675 do { 01676 for (; blob_count > 0; blob_count--) 01677 box_next(&start_it); 01678 do { 01679 prev_box = blob_box; 01680 blob_count++; 01681 blob_box = box_next (&blob_it); 01682 } 01683 while (!blob_it.cycled_list () 01684 && blob_box.left () - prev_box.right () < space_size); 01685 word_sync = 01686 check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2, 01687 projection, projection_left, projection_right, 01688 row->xheight * 
textord_projection_scale, 01689 occupation, &seg_list, 0, 0); 01690 total_blob_count += blob_count; 01691 seg_it.set_to_list (&seg_list); 01692 if (prev_right >= 0) { 01693 sp_var = seg_it.data ()->position () - prev_right; 01694 sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch; 01695 sp_var *= sp_var; 01696 spsum += sp_var; 01697 sp_count++; 01698 } 01699 seg_it.move_to_last (); 01700 prev_right = seg_it.data ()->position (); 01701 if (textord_pitch_scalebigwords) { 01702 scale_factor = (seg_list.length () - 2) / 2; 01703 if (scale_factor < 1) 01704 scale_factor = 1; 01705 } 01706 else 01707 scale_factor = 1; 01708 sqsum += word_sync * scale_factor; 01709 total_count += (seg_list.length () - 1) * scale_factor; 01710 seg_list.clear (); 01711 } 01712 while (!blob_it.cycled_list ()); 01713 sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0; 01714 word_sync = total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10; 01715 tprintf ("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:", 01716 word_sync, word_sync / initial_pitch, sp_sd, 01717 word_sync < textord_words_pitchsd_threshold * initial_pitch 01718 ? 
'F' : 'P'); 01719 01720 start_it = row_start; 01721 blob_it = row_start; 01722 word_sync = 01723 check_pitch_sync2 (&blob_it, total_blob_count, (inT16) initial_pitch, 2, 01724 projection, projection_left, projection_right, 01725 row->xheight * textord_projection_scale, occupation, 01726 &seg_list, 0, 0); 01727 if (occupation > 1) 01728 word_sync /= occupation; 01729 word_sync = sqrt (word_sync); 01730 01731 #ifndef GRAPHICS_DISABLED 01732 if (textord_show_row_cuts && to_win != NULL) 01733 plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list); 01734 #endif 01735 seg_list.clear (); 01736 if (word_sync < textord_words_pitchsd_threshold * initial_pitch) { 01737 if (word_sync < textord_words_def_fixed * initial_pitch 01738 && !row->all_caps) 01739 res2 = "DF"; 01740 else 01741 res2 = "MF"; 01742 } 01743 else 01744 res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP"; 01745 tprintf 01746 ("row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, all_caps=%d\n", 01747 word_sync, word_sync / initial_pitch, 01748 word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P', 01749 occupation, res2, initial_pitch, row->fixed_pitch, row->all_caps); 01750 } 01751 01752 /********************************************************************** 01753 * find_repeated_chars 01754 * 01755 * Extract marked leader blobs and put them 01756 * into words in advance of fixed pitch checking and word generation. 01757 **********************************************************************/ 01758 void find_repeated_chars(TO_BLOCK *block, // Block to search. 01759 BOOL8 testing_on) { // Debug mode. 01760 POLY_BLOCK* pb = block->block->poly_block(); 01761 if (pb != NULL && !pb->IsText()) 01762 return; // Don't find repeated chars in non-text blocks. 
01763 01764 TO_ROW *row; 01765 BLOBNBOX_IT box_it; 01766 BLOBNBOX_IT search_it; // forward search 01767 WERD_IT word_it; // new words 01768 WERD *word; // new word 01769 TBOX word_box; // for plotting 01770 int blobcount, repeated_set; 01771 01772 TO_ROW_IT row_it = block->get_rows(); 01773 if (row_it.empty()) return; // empty block 01774 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 01775 row = row_it.data(); 01776 box_it.set_to_list(row->blob_list()); 01777 if (box_it.empty()) continue; // no blobs in this row 01778 if (!row->rep_chars_marked()) { 01779 mark_repeated_chars(row); 01780 } 01781 if (row->num_repeated_sets() == 0) continue; // nothing to do for this row 01782 word_it.set_to_list(&row->rep_words); 01783 do { 01784 if (box_it.data()->repeated_set() != 0 && 01785 !box_it.data()->joined_to_prev()) { 01786 blobcount = 1; 01787 repeated_set = box_it.data()->repeated_set(); 01788 search_it = box_it; 01789 search_it.forward(); 01790 while (!search_it.at_first() && 01791 search_it.data()->repeated_set() == repeated_set) { 01792 blobcount++; 01793 search_it.forward(); 01794 } 01795 // After the call to make_real_word() all the blobs from this 01796 // repeated set will be removed from the blob list. box_it will be 01797 // set to point to the blob after the end of the extracted sequence. 
01798 word = make_real_word(&box_it, blobcount, box_it.at_first(), 1); 01799 if (!box_it.empty() && box_it.data()->joined_to_prev()) { 01800 tprintf("Bad box joined to prev at"); 01801 box_it.data()->bounding_box().print(); 01802 tprintf("After repeated word:"); 01803 word->bounding_box().print(); 01804 } 01805 ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev()); 01806 word->set_flag(W_REP_CHAR, true); 01807 word->set_flag(W_DONT_CHOP, true); 01808 word_it.add_after_then_move(word); 01809 } else { 01810 box_it.forward(); 01811 } 01812 } while (!box_it.at_first()); 01813 } 01814 } 01815 01816 01817 /********************************************************************** 01818 * plot_fp_word 01819 * 01820 * Plot a block of words as if fixed pitch. 01821 **********************************************************************/ 01822 01823 #ifndef GRAPHICS_DISABLED 01824 void plot_fp_word( //draw block of words 01825 TO_BLOCK *block, //block to draw 01826 float pitch, //pitch to draw with 01827 float nonspace //for space threshold 01828 ) { 01829 TO_ROW *row; //current row 01830 TO_ROW_IT row_it = block->get_rows (); 01831 01832 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 01833 row = row_it.data (); 01834 row->min_space = (inT32) ((pitch + nonspace) / 2); 01835 row->max_nonspace = row->min_space; 01836 row->space_threshold = row->min_space; 01837 plot_word_decisions (to_win, (inT16) pitch, row); 01838 } 01839 } 01840 #endif