tesseract 3.04.01

textord/topitch.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        topitch.cpp  (Formerly to_pitch.c)
00003  * Description: Code to determine fixed pitchness and the pitch if fixed.
00004  * Author:              Ray Smith
00005  * Created:             Tue Aug 24 16:57:29 BST 1993
00006  *
00007  * (C) Copyright 1993, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef __UNIX__
00021 #include          <assert.h>
00022 #endif
00023 #include          "stderr.h"
00024 #include          "blobbox.h"
00025 #include          "statistc.h"
00026 #include          "drawtord.h"
00027 #include          "makerow.h"
00028 #include          "pitsync1.h"
00029 #include          "pithsync.h"
00030 #include          "tovars.h"
00031 #include          "wordseg.h"
00032 #include          "topitch.h"
00033 #include          "helpers.h"
00034 
00035 // Include automatically generated configuration file if running autoconf.
00036 #ifdef HAVE_CONFIG_H
00037 #include "config_auto.h"
00038 #endif
00039 
00040 #define EXTERN
00041 
00042 EXTERN BOOL_VAR (textord_all_prop, FALSE, "All doc is proportial text");
00043 EXTERN BOOL_VAR (textord_debug_pitch_test, FALSE,
00044 "Debug on fixed pitch test");
00045 EXTERN BOOL_VAR (textord_disable_pitch_test, FALSE,
00046 "Turn off dp fixed pitch algorithm");
00047 EXTERN BOOL_VAR (textord_fast_pitch_test, FALSE,
00048 "Do even faster pitch algorithm");
00049 EXTERN BOOL_VAR (textord_debug_pitch_metric, FALSE,
00050 "Write full metric stuff");
00051 EXTERN BOOL_VAR (textord_show_row_cuts, FALSE, "Draw row-level cuts");
00052 EXTERN BOOL_VAR (textord_show_page_cuts, FALSE, "Draw page-level cuts");
00053 EXTERN BOOL_VAR (textord_pitch_cheat, FALSE,
00054 "Use correct answer for fixed/prop");
00055 EXTERN BOOL_VAR (textord_blockndoc_fixed, FALSE,
00056 "Attempt whole doc/block fixed pitch");
00057 EXTERN double_VAR (textord_projection_scale, 0.200, "Ding rate for mid-cuts");
00058 EXTERN double_VAR (textord_balance_factor, 1.0,
00059 "Ding rate for unbalanced char cells");
00060 
00061 #define FIXED_WIDTH_MULTIPLE  5
00062 #define BLOCK_STATS_CLUSTERS  10
00063 #define MAX_ALLOWED_PITCH 100    //max pixel pitch.
00064 
00065 /**********************************************************************
00066  * compute_fixed_pitch
00067  *
00068  * Decide whether each row is fixed pitch individually.
00069  * Correlate definite and uncertain results to obtain an individual
00070  * result for each row in the TO_ROW class.
00071  **********************************************************************/
00072 
00073 void compute_fixed_pitch(ICOORD page_tr,              // top right
00074                          TO_BLOCK_LIST *port_blocks,  // input list
00075                          float gradient,              // page skew
00076                          FCOORD rotation,             // for drawing
00077                          BOOL8 testing_on) {          // correct orientation
00078   TO_BLOCK_IT block_it;          //iterator
00079   TO_BLOCK *block;               //current block;
00080   TO_ROW_IT row_it;              //row iterator
00081   TO_ROW *row;                   //current row
00082   int block_index;               //block number
00083   int row_index;                 //row number
00084 
00085 #ifndef GRAPHICS_DISABLED
00086   if (textord_show_initial_words && testing_on) {
00087     if (to_win == NULL)
00088       create_to_win(page_tr);
00089   }
00090 #endif
00091 
00092   block_it.set_to_list (port_blocks);
00093   block_index = 1;
00094   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00095   block_it.forward ()) {
00096     block = block_it.data ();
00097     compute_block_pitch(block, rotation, block_index, testing_on);
00098     block_index++;
00099   }
00100 
00101   if (!try_doc_fixed (page_tr, port_blocks, gradient)) {
00102     block_index = 1;
00103     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00104     block_it.forward ()) {
00105       block = block_it.data ();
00106       if (!try_block_fixed (block, block_index))
00107         try_rows_fixed(block, block_index, testing_on);
00108       block_index++;
00109     }
00110   }
00111 
00112   block_index = 1;
00113   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00114        block_it.forward()) {
00115     block = block_it.data ();
00116     POLY_BLOCK* pb = block->block->poly_block();
00117     if (pb != NULL && !pb->IsText()) continue;  // Non-text doesn't exist!
00118     row_it.set_to_list (block->get_rows ());
00119     row_index = 1;
00120     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00121       row = row_it.data ();
00122       fix_row_pitch(row, block, port_blocks, row_index, block_index);
00123       row_index++;
00124     }
00125     block_index++;
00126   }
00127 #ifndef GRAPHICS_DISABLED
00128   if (textord_show_initial_words && testing_on) {
00129     ScrollView::Update();
00130   }
00131 #endif
00132 }
00133 
00134 
00135 /**********************************************************************
00136  * fix_row_pitch
00137  *
00138  * Get a pitch_decision for this row by voting among similar rows in the
00139  * block, then similar rows over all the page, or any other rows at all.
00140  **********************************************************************/
00141 
00142 void fix_row_pitch(TO_ROW *bad_row,        // row to fix
00143                    TO_BLOCK *bad_block,    // block of bad_row
00144                    TO_BLOCK_LIST *blocks,  // blocks to scan
00145                    inT32 row_target,       // number of row
00146                    inT32 block_target) {   // number of block
00147   inT16 mid_cuts;
00148   int block_votes;               //votes in block
00149   int like_votes;                //votes over page
00150   int other_votes;               //votes of unlike blocks
00151   int block_index;               //number of block
00152   int row_index;                 //number of row
00153   int maxwidth;                  //max pitch
00154   TO_BLOCK_IT block_it = blocks; //block iterator
00155   TO_ROW_IT row_it;
00156   TO_BLOCK *block;               //current block
00157   TO_ROW *row;                   //current row
00158   float sp_sd;                   //space deviation
00159   STATS block_stats;             //pitches in block
00160   STATS like_stats;              //pitches in page
00161 
00162   block_votes = like_votes = other_votes = 0;
00163   maxwidth = (inT32) ceil (bad_row->xheight * textord_words_maxspace);
00164   if (bad_row->pitch_decision != PITCH_DEF_FIXED
00165   && bad_row->pitch_decision != PITCH_DEF_PROP) {
00166     block_stats.set_range (0, maxwidth);
00167     like_stats.set_range (0, maxwidth);
00168     block_index = 1;
00169     for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00170          block_it.forward()) {
00171       block = block_it.data();
00172       POLY_BLOCK* pb = block->block->poly_block();
00173       if (pb != NULL && !pb->IsText()) continue;  // Non text doesn't exist!
00174       row_index = 1;
00175       row_it.set_to_list (block->get_rows ());
00176       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
00177       row_it.forward ()) {
00178         row = row_it.data ();
00179         if ((bad_row->all_caps
00180           && row->xheight + row->ascrise
00181           <
00182           (bad_row->xheight + bad_row->ascrise) * (1 +
00183           textord_pitch_rowsimilarity)
00184           && row->xheight + row->ascrise >
00185           (bad_row->xheight + bad_row->ascrise) * (1 -
00186           textord_pitch_rowsimilarity))
00187           || (!bad_row->all_caps
00188           && row->xheight <
00189           bad_row->xheight * (1 + textord_pitch_rowsimilarity)
00190           && row->xheight >
00191         bad_row->xheight * (1 - textord_pitch_rowsimilarity))) {
00192           if (block_index == block_target) {
00193             if (row->pitch_decision == PITCH_DEF_FIXED) {
00194               block_votes += textord_words_veto_power;
00195               block_stats.add ((inT32) row->fixed_pitch,
00196                 textord_words_veto_power);
00197             }
00198             else if (row->pitch_decision == PITCH_MAYBE_FIXED
00199             || row->pitch_decision == PITCH_CORR_FIXED) {
00200               block_votes++;
00201               block_stats.add ((inT32) row->fixed_pitch, 1);
00202             }
00203             else if (row->pitch_decision == PITCH_DEF_PROP)
00204               block_votes -= textord_words_veto_power;
00205             else if (row->pitch_decision == PITCH_MAYBE_PROP
00206               || row->pitch_decision == PITCH_CORR_PROP)
00207               block_votes--;
00208           }
00209           else {
00210             if (row->pitch_decision == PITCH_DEF_FIXED) {
00211               like_votes += textord_words_veto_power;
00212               like_stats.add ((inT32) row->fixed_pitch,
00213                 textord_words_veto_power);
00214             }
00215             else if (row->pitch_decision == PITCH_MAYBE_FIXED
00216             || row->pitch_decision == PITCH_CORR_FIXED) {
00217               like_votes++;
00218               like_stats.add ((inT32) row->fixed_pitch, 1);
00219             }
00220             else if (row->pitch_decision == PITCH_DEF_PROP)
00221               like_votes -= textord_words_veto_power;
00222             else if (row->pitch_decision == PITCH_MAYBE_PROP
00223               || row->pitch_decision == PITCH_CORR_PROP)
00224               like_votes--;
00225           }
00226         }
00227         else {
00228           if (row->pitch_decision == PITCH_DEF_FIXED)
00229             other_votes += textord_words_veto_power;
00230           else if (row->pitch_decision == PITCH_MAYBE_FIXED
00231             || row->pitch_decision == PITCH_CORR_FIXED)
00232             other_votes++;
00233           else if (row->pitch_decision == PITCH_DEF_PROP)
00234             other_votes -= textord_words_veto_power;
00235           else if (row->pitch_decision == PITCH_MAYBE_PROP
00236             || row->pitch_decision == PITCH_CORR_PROP)
00237             other_votes--;
00238         }
00239         row_index++;
00240       }
00241       block_index++;
00242     }
00243     if (block_votes > textord_words_veto_power) {
00244       bad_row->fixed_pitch = block_stats.ile (0.5);
00245       bad_row->pitch_decision = PITCH_CORR_FIXED;
00246     }
00247     else if (block_votes <= textord_words_veto_power && like_votes > 0) {
00248       bad_row->fixed_pitch = like_stats.ile (0.5);
00249       bad_row->pitch_decision = PITCH_CORR_FIXED;
00250     }
00251     else {
00252       bad_row->pitch_decision = PITCH_CORR_PROP;
00253       if (block_votes == 0 && like_votes == 0 && other_votes > 0
00254         && (textord_debug_pitch_test || textord_debug_pitch_metric))
00255         tprintf
00256           ("Warning:row %d of block %d set prop with no like rows against trend\n",
00257           row_target, block_target);
00258     }
00259   }
00260   if (textord_debug_pitch_metric) {
00261     tprintf(":b_votes=%d:l_votes=%d:o_votes=%d",
00262             block_votes, like_votes, other_votes);
00263     tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise);
00264   }
00265   if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
00266     if (bad_row->fixed_pitch < textord_min_xheight) {
00267       if (block_votes > 0)
00268         bad_row->fixed_pitch = block_stats.ile (0.5);
00269       else if (block_votes == 0 && like_votes > 0)
00270         bad_row->fixed_pitch = like_stats.ile (0.5);
00271       else {
00272         tprintf
00273           ("Warning:guessing pitch as xheight on row %d, block %d\n",
00274           row_target, block_target);
00275         bad_row->fixed_pitch = bad_row->xheight;
00276       }
00277     }
00278     if (bad_row->fixed_pitch < textord_min_xheight)
00279       bad_row->fixed_pitch = (float) textord_min_xheight;
00280     bad_row->kern_size = bad_row->fixed_pitch / 4;
00281     bad_row->min_space = (inT32) (bad_row->fixed_pitch * 0.6);
00282     bad_row->max_nonspace = (inT32) (bad_row->fixed_pitch * 0.4);
00283     bad_row->space_threshold =
00284       (bad_row->min_space + bad_row->max_nonspace) / 2;
00285     bad_row->space_size = bad_row->fixed_pitch;
00286     if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) {
00287       tune_row_pitch (bad_row, &bad_row->projection,
00288         bad_row->projection_left, bad_row->projection_right,
00289         (bad_row->fixed_pitch +
00290         bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
00291         sp_sd, mid_cuts, &bad_row->char_cells, FALSE);
00292     }
00293   }
00294   else if (bad_row->pitch_decision == PITCH_CORR_PROP
00295   || bad_row->pitch_decision == PITCH_DEF_PROP) {
00296     bad_row->fixed_pitch = 0.0f;
00297     bad_row->char_cells.clear ();
00298   }
00299 }
00300 
00301 
00302 /**********************************************************************
00303  * compute_block_pitch
00304  *
00305  * Decide whether each block is fixed pitch individually.
00306  **********************************************************************/
00307 
00308 void compute_block_pitch(TO_BLOCK *block,     // input list
00309                          FCOORD rotation,     // for drawing
00310                          inT32 block_index,   // block number
00311                          BOOL8 testing_on) {  // correct orientation
00312    TBOX block_box;                 //bounding box
00313 
00314   block_box = block->block->bounding_box ();
00315   if (testing_on && textord_debug_pitch_test) {
00316     tprintf ("Block %d at (%d,%d)->(%d,%d)\n",
00317       block_index,
00318       block_box.left (), block_box.bottom (),
00319       block_box.right (), block_box.top ());
00320   }
00321   block->min_space = (inT32) floor (block->xheight
00322     * textord_words_default_minspace);
00323   block->max_nonspace = (inT32) ceil (block->xheight
00324     * textord_words_default_nonspace);
00325   block->fixed_pitch = 0.0f;
00326   block->space_size = (float) block->min_space;
00327   block->kern_size = (float) block->max_nonspace;
00328   block->pr_nonsp = block->xheight * words_default_prop_nonspace;
00329   block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;
00330   if (!block->get_rows ()->empty ()) {
00331     ASSERT_HOST (block->xheight > 0);
00332     find_repeated_chars(block, textord_show_initial_words && testing_on);
00333 #ifndef GRAPHICS_DISABLED
00334     if (textord_show_initial_words && testing_on)
00335       //overlap_picture_ops(TRUE);
00336       ScrollView::Update();
00337 #endif
00338     compute_rows_pitch(block,
00339                        block_index,
00340                        textord_debug_pitch_test &&testing_on);
00341   }
00342 }
00343 
00344 
00345 /**********************************************************************
00346  * compute_rows_pitch
00347  *
00348  * Decide whether each row is fixed pitch individually.
00349  **********************************************************************/
00350 
00351 BOOL8 compute_rows_pitch(                    //find line stats
00352                          TO_BLOCK *block,    //block to do
00353                          inT32 block_index,  //block number
00354                          BOOL8 testing_on    //correct orientation
00355                         ) {
00356   inT32 maxwidth;                //of spaces
00357   TO_ROW *row;                   //current row
00358   inT32 row_index;               //row number.
00359   float lower, upper;            //cluster thresholds
00360   TO_ROW_IT row_it = block->get_rows ();
00361 
00362   row_index = 1;
00363   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00364     row = row_it.data ();
00365     ASSERT_HOST (row->xheight > 0);
00366     row->compute_vertical_projection ();
00367     maxwidth = (inT32) ceil (row->xheight * textord_words_maxspace);
00368     if (row_pitch_stats (row, maxwidth, testing_on)
00369       && find_row_pitch (row, maxwidth,
00370       textord_dotmatrix_gap + 1, block, block_index,
00371     row_index, testing_on)) {
00372       if (row->fixed_pitch == 0) {
00373         lower = row->pr_nonsp;
00374         upper = row->pr_space;
00375         row->space_size = upper;
00376         row->kern_size = lower;
00377       }
00378     }
00379     else {
00380       row->fixed_pitch = 0.0f;   //insufficient data
00381       row->pitch_decision = PITCH_DUNNO;
00382     }
00383     row_index++;
00384   }
00385   return FALSE;
00386 }
00387 
00388 
00389 /**********************************************************************
00390  * try_doc_fixed
00391  *
00392  * Attempt to call the entire document fixed pitch.
00393  **********************************************************************/
00394 
00395 BOOL8 try_doc_fixed(                             //determine pitch
00396                     ICOORD page_tr,              //top right
00397                     TO_BLOCK_LIST *port_blocks,  //input list
00398                     float gradient               //page skew
00399                    ) {
00400   inT16 master_x;                //uniform shifts
00401   inT16 pitch;                   //median pitch.
00402   int x;                         //profile coord
00403   int prop_blocks;               //correct counts
00404   int fixed_blocks;
00405   int total_row_count;           //total in page
00406                                  //iterator
00407   TO_BLOCK_IT block_it = port_blocks;
00408   TO_BLOCK *block;               //current block;
00409   TO_ROW_IT row_it;              //row iterator
00410   TO_ROW *row;                   //current row
00411   inT16 projection_left;         //edges
00412   inT16 projection_right;
00413   inT16 row_left;                //edges of row
00414   inT16 row_right;
00415   ICOORDELT_LIST *master_cells;  //cells for page
00416   float master_y;                //uniform shifts
00417   float shift_factor;            //page skew correction
00418   float row_shift;               //shift for row
00419   float final_pitch;             //output pitch
00420   float row_y;                   //baseline
00421   STATS projection;              //entire page
00422   STATS pitches (0, MAX_ALLOWED_PITCH);
00423   //for median
00424   float sp_sd;                   //space sd
00425   inT16 mid_cuts;                //no of cheap cuts
00426   float pitch_sd;                //sync rating
00427 
00428   if (block_it.empty ()
00429     //      || block_it.data()==block_it.data_relative(1)
00430     || !textord_blockndoc_fixed)
00431     return FALSE;
00432   shift_factor = gradient / (gradient * gradient + 1);
00433   row_it.set_to_list (block_it.data ()->get_rows ());
00434   master_x = row_it.data ()->projection_left;
00435   master_y = row_it.data ()->baseline.y (master_x);
00436   projection_left = MAX_INT16;
00437   projection_right = -MAX_INT16;
00438   prop_blocks = 0;
00439   fixed_blocks = 0;
00440   total_row_count = 0;
00441 
00442   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00443   block_it.forward ()) {
00444     block = block_it.data ();
00445     row_it.set_to_list (block->get_rows ());
00446     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00447       row = row_it.data ();
00448       total_row_count++;
00449       if (row->fixed_pitch > 0)
00450         pitches.add ((inT32) (row->fixed_pitch), 1);
00451       //find median
00452       row_y = row->baseline.y (master_x);
00453       row_left =
00454         (inT16) (row->projection_left -
00455         shift_factor * (master_y - row_y));
00456       row_right =
00457         (inT16) (row->projection_right -
00458         shift_factor * (master_y - row_y));
00459       if (row_left < projection_left)
00460         projection_left = row_left;
00461       if (row_right > projection_right)
00462         projection_right = row_right;
00463     }
00464   }
00465   if (pitches.get_total () == 0)
00466     return FALSE;
00467   projection.set_range (projection_left, projection_right);
00468 
00469   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00470   block_it.forward ()) {
00471     block = block_it.data ();
00472     row_it.set_to_list (block->get_rows ());
00473     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00474       row = row_it.data ();
00475       row_y = row->baseline.y (master_x);
00476       row_left =
00477         (inT16) (row->projection_left -
00478         shift_factor * (master_y - row_y));
00479       for (x = row->projection_left; x < row->projection_right;
00480       x++, row_left++) {
00481         projection.add (row_left, row->projection.pile_count (x));
00482       }
00483     }
00484   }
00485 
00486   row_it.set_to_list (block_it.data ()->get_rows ());
00487   row = row_it.data ();
00488 #ifndef GRAPHICS_DISABLED
00489   if (textord_show_page_cuts && to_win != NULL)
00490     projection.plot (to_win, projection_left,
00491       row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
00492 #endif
00493   final_pitch = pitches.ile (0.5);
00494   pitch = (inT16) final_pitch;
00495   pitch_sd =
00496     tune_row_pitch (row, &projection, projection_left, projection_right,
00497     pitch * 0.75, final_pitch, sp_sd, mid_cuts,
00498     &row->char_cells, FALSE);
00499 
00500   if (textord_debug_pitch_metric)
00501     tprintf
00502       ("try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
00503       prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd,
00504       pitch_sd / total_row_count, pitch_sd / pitch,
00505       pitch_sd / total_row_count / pitch);
00506 
00507 #ifndef GRAPHICS_DISABLED
00508   if (textord_show_page_cuts && to_win != NULL) {
00509     master_cells = &row->char_cells;
00510     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00511     block_it.forward ()) {
00512       block = block_it.data ();
00513       row_it.set_to_list (block->get_rows ());
00514       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
00515       row_it.forward ()) {
00516         row = row_it.data ();
00517         row_y = row->baseline.y (master_x);
00518         row_shift = shift_factor * (master_y - row_y);
00519         plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);
00520       }
00521     }
00522   }
00523 #endif
00524   row->char_cells.clear ();
00525   return FALSE;
00526 }
00527 
00528 
00529 /**********************************************************************
00530  * try_block_fixed
00531  *
00532  * Try to call the entire block fixed.
00533  **********************************************************************/
00534 
00535 BOOL8 try_block_fixed(                   //find line stats
00536                       TO_BLOCK *block,   //block to do
00537                       inT32 block_index  //block number
00538                      ) {
00539   return FALSE;
00540 }
00541 
00542 
00543 /**********************************************************************
00544  * try_rows_fixed
00545  *
00546  * Decide whether each row is fixed pitch individually.
00547  **********************************************************************/
00548 
00549 BOOL8 try_rows_fixed(                    //find line stats
00550                      TO_BLOCK *block,    //block to do
00551                      inT32 block_index,  //block number
00552                      BOOL8 testing_on    //correct orientation
00553                     ) {
00554   TO_ROW *row;                   //current row
00555   inT32 row_index;               //row number.
00556   inT32 def_fixed = 0;           //counters
00557   inT32 def_prop = 0;
00558   inT32 maybe_fixed = 0;
00559   inT32 maybe_prop = 0;
00560   inT32 dunno = 0;
00561   inT32 corr_fixed = 0;
00562   inT32 corr_prop = 0;
00563   float lower, upper;            //cluster thresholds
00564   TO_ROW_IT row_it = block->get_rows ();
00565 
00566   row_index = 1;
00567   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00568     row = row_it.data ();
00569     ASSERT_HOST (row->xheight > 0);
00570     if (row->fixed_pitch > 0 &&
00571         fixed_pitch_row(row, block->block, block_index)) {
00572       if (row->fixed_pitch == 0) {
00573         lower = row->pr_nonsp;
00574         upper = row->pr_space;
00575         row->space_size = upper;
00576         row->kern_size = lower;
00577       }
00578     }
00579     row_index++;
00580   }
00581   count_block_votes(block,
00582                     def_fixed,
00583                     def_prop,
00584                     maybe_fixed,
00585                     maybe_prop,
00586                     corr_fixed,
00587                     corr_prop,
00588                     dunno);
00589   if (testing_on
00590     && (textord_debug_pitch_test
00591   || textord_blocksall_prop || textord_blocksall_fixed)) {
00592     tprintf ("Initially:");
00593     print_block_counts(block, block_index);
00594   }
00595   if (def_fixed > def_prop * textord_words_veto_power)
00596     block->pitch_decision = PITCH_DEF_FIXED;
00597   else if (def_prop > def_fixed * textord_words_veto_power)
00598     block->pitch_decision = PITCH_DEF_PROP;
00599   else if (def_fixed > 0 || def_prop > 0)
00600     block->pitch_decision = PITCH_DUNNO;
00601   else if (maybe_fixed > maybe_prop * textord_words_veto_power)
00602     block->pitch_decision = PITCH_MAYBE_FIXED;
00603   else if (maybe_prop > maybe_fixed * textord_words_veto_power)
00604     block->pitch_decision = PITCH_MAYBE_PROP;
00605   else
00606     block->pitch_decision = PITCH_DUNNO;
00607   return FALSE;
00608 }
00609 
00610 
00611 /**********************************************************************
00612  * print_block_counts
00613  *
00614  * Count up how many rows have what decision and print the results.
00615  **********************************************************************/
00616 
00617 void print_block_counts(                   //find line stats
00618                         TO_BLOCK *block,   //block to do
00619                         inT32 block_index  //block number
00620                        ) {
00621   inT32 def_fixed = 0;           //counters
00622   inT32 def_prop = 0;
00623   inT32 maybe_fixed = 0;
00624   inT32 maybe_prop = 0;
00625   inT32 dunno = 0;
00626   inT32 corr_fixed = 0;
00627   inT32 corr_prop = 0;
00628 
00629   count_block_votes(block,
00630                     def_fixed,
00631                     def_prop,
00632                     maybe_fixed,
00633                     maybe_prop,
00634                     corr_fixed,
00635                     corr_prop,
00636                     dunno);
00637   tprintf ("Block %d has (%d,%d,%d)",
00638     block_index, def_fixed, maybe_fixed, corr_fixed);
00639   if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed))
00640     tprintf (" (Wrongly)");
00641   tprintf (" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
00642   if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop))
00643     tprintf (" (Wrongly)");
00644   tprintf (" prop, %d dunno\n", dunno);
00645 }
00646 
00647 
00648 /**********************************************************************
00649  * count_block_votes
00650  *
00651  * Count the number of rows in the block with each kind of pitch_decision.
00652  **********************************************************************/
00653 
00654 void count_block_votes(                   //find line stats
00655                        TO_BLOCK *block,   //block to do
00656                        inT32 &def_fixed,  //add to counts
00657                        inT32 &def_prop,
00658                        inT32 &maybe_fixed,
00659                        inT32 &maybe_prop,
00660                        inT32 &corr_fixed,
00661                        inT32 &corr_prop,
00662                        inT32 &dunno) {
00663   TO_ROW *row;                   //current row
00664   TO_ROW_IT row_it = block->get_rows ();
00665 
00666   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00667     row = row_it.data ();
00668     switch (row->pitch_decision) {
00669       case PITCH_DUNNO:
00670         dunno++;
00671         break;
00672       case PITCH_DEF_PROP:
00673         def_prop++;
00674         break;
00675       case PITCH_MAYBE_PROP:
00676         maybe_prop++;
00677         break;
00678       case PITCH_DEF_FIXED:
00679         def_fixed++;
00680         break;
00681       case PITCH_MAYBE_FIXED:
00682         maybe_fixed++;
00683         break;
00684       case PITCH_CORR_PROP:
00685         corr_prop++;
00686         break;
00687       case PITCH_CORR_FIXED:
00688         corr_fixed++;
00689         break;
00690     }
00691   }
00692 }
00693 
00694 
00695 /**********************************************************************
00696  * row_pitch_stats
00697  *
00698  * Decide whether each row is fixed pitch individually.
00699  **********************************************************************/
00700 
00701 BOOL8 row_pitch_stats(                  //find line stats
00702                       TO_ROW *row,      //current row
00703                       inT32 maxwidth,   //of spaces
00704                       BOOL8 testing_on  //correct orientation
00705                      ) {
00706   BLOBNBOX *blob;                //current blob
00707   int gap_index;                 //current gap
00708   inT32 prev_x;                  //end of prev blob
00709   inT32 cluster_count;           //no of clusters
00710   inT32 prev_count;              //of clusters
00711   inT32 smooth_factor;           //for smoothing stats
00712   TBOX blob_box;                  //bounding box
00713   float lower, upper;            //cluster thresholds
00714                                  //gap sizes
00715   float gaps[BLOCK_STATS_CLUSTERS];
00716                                  //blobs
00717   BLOBNBOX_IT blob_it = row->blob_list ();
00718   STATS gap_stats (0, maxwidth);
00719   STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
00720   //clusters
00721 
00722   smooth_factor =
00723     (inT32) (row->xheight * textord_wordstats_smooth_factor + 1.5);
00724   if (!blob_it.empty ()) {
00725     prev_x = blob_it.data ()->bounding_box ().right ();
00726     blob_it.forward ();
00727     while (!blob_it.at_first ()) {
00728       blob = blob_it.data ();
00729       if (!blob->joined_to_prev ()) {
00730         blob_box = blob->bounding_box ();
00731         if (blob_box.left () - prev_x < maxwidth)
00732           gap_stats.add (blob_box.left () - prev_x, 1);
00733         prev_x = blob_box.right ();
00734       }
00735       blob_it.forward ();
00736     }
00737   }
00738   if (gap_stats.get_total () == 0) {
00739     return FALSE;
00740   }
00741   cluster_count = 0;
00742   lower = row->xheight * words_initial_lower;
00743   upper = row->xheight * words_initial_upper;
00744   gap_stats.smooth (smooth_factor);
00745   do {
00746     prev_count = cluster_count;
00747     cluster_count = gap_stats.cluster (lower, upper,
00748       textord_spacesize_ratioprop,
00749       BLOCK_STATS_CLUSTERS, cluster_stats);
00750   }
00751   while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
00752   if (cluster_count < 1) {
00753     return FALSE;
00754   }
00755   for (gap_index = 0; gap_index < cluster_count; gap_index++)
00756     gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
00757   //get medians
00758   if (testing_on) {
00759     tprintf ("cluster_count=%d:", cluster_count);
00760     for (gap_index = 0; gap_index < cluster_count; gap_index++)
00761       tprintf (" %g(%d)", gaps[gap_index],
00762         cluster_stats[gap_index + 1].get_total ());
00763     tprintf ("\n");
00764   }
00765   qsort (gaps, cluster_count, sizeof (float), sort_floats);
00766 
00767   //Try to find proportional non-space and space for row.
00768   lower = row->xheight * words_default_prop_nonspace;
00769   upper = row->xheight * textord_words_min_minspace;
00770   for (gap_index = 0; gap_index < cluster_count
00771     && gaps[gap_index] < lower; gap_index++);
00772   if (gap_index == 0) {
00773     if (testing_on)
00774       tprintf ("No clusters below nonspace threshold!!\n");
00775     if (cluster_count > 1) {
00776       row->pr_nonsp = gaps[0];
00777       row->pr_space = gaps[1];
00778     }
00779     else {
00780       row->pr_nonsp = lower;
00781       row->pr_space = gaps[0];
00782     }
00783   }
00784   else {
00785     row->pr_nonsp = gaps[gap_index - 1];
00786     while (gap_index < cluster_count && gaps[gap_index] < upper)
00787       gap_index++;
00788     if (gap_index == cluster_count) {
00789       if (testing_on)
00790         tprintf ("No clusters above nonspace threshold!!\n");
00791       row->pr_space = lower * textord_spacesize_ratioprop;
00792     }
00793     else
00794       row->pr_space = gaps[gap_index];
00795   }
00796 
00797   //Now try to find the fixed pitch space and non-space.
00798   upper = row->xheight * words_default_fixed_space;
00799   for (gap_index = 0; gap_index < cluster_count
00800     && gaps[gap_index] < upper; gap_index++);
00801   if (gap_index == 0) {
00802     if (testing_on)
00803       tprintf ("No clusters below space threshold!!\n");
00804     row->fp_nonsp = upper;
00805     row->fp_space = gaps[0];
00806   }
00807   else {
00808     row->fp_nonsp = gaps[gap_index - 1];
00809     if (gap_index == cluster_count) {
00810       if (testing_on)
00811         tprintf ("No clusters above space threshold!!\n");
00812       row->fp_space = row->xheight;
00813     }
00814     else
00815       row->fp_space = gaps[gap_index];
00816   }
00817   if (testing_on) {
00818     tprintf
00819       ("Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, fp_space=%g\n",
00820       row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
00821   }
00822   return TRUE;                   //computed some stats
00823 }
00824 
00825 
00826 /**********************************************************************
00827  * find_row_pitch
00828  *
00829  * Check to see if this row could be fixed pitch using the given spacings.
00830  * Blobs with gaps smaller than the lower threshold are assumed to be one.
00831  * The larger threshold is the word gap threshold.
00832  **********************************************************************/
00833 
00834 BOOL8 find_row_pitch(                    //find lines
00835                      TO_ROW *row,        //row to do
00836                      inT32 maxwidth,     //max permitted space
00837                      inT32 dm_gap,       //ignorable gaps
00838                      TO_BLOCK *block,    //block of row
00839                      inT32 block_index,  //block_number
00840                      inT32 row_index,    //number of row
00841                      BOOL8 testing_on    //correct orientation
00842                     ) {
00843   BOOL8 used_dm_model;           //looks lik dot matrix
00844   float min_space;               //estimate threshold
00845   float non_space;               //gap size
00846   float gap_iqr;                 //interquartile range
00847   float pitch_iqr;
00848   float dm_gap_iqr;              //interquartile range
00849   float dm_pitch_iqr;
00850   float dm_pitch;                //pitch with dm on
00851   float pitch;                   //revised estimate
00852   float initial_pitch;           //guess at pitch
00853   STATS gap_stats (0, maxwidth);
00854                                  //centre-centre
00855   STATS pitch_stats (0, maxwidth);
00856 
00857   row->fixed_pitch = 0.0f;
00858   initial_pitch = row->fp_space;
00859   if (initial_pitch > row->xheight * (1 + words_default_fixed_limit))
00860     initial_pitch = row->xheight;//keep pitch decent
00861   non_space = row->fp_nonsp;
00862   if (non_space > initial_pitch)
00863     non_space = initial_pitch;
00864   min_space = (initial_pitch + non_space) / 2;
00865 
00866   if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
00867   initial_pitch, min_space, TRUE, FALSE, dm_gap)) {
00868     dm_gap_iqr = 0.0001;
00869     dm_pitch_iqr = maxwidth * 2.0f;
00870     dm_pitch = initial_pitch;
00871   }
00872   else {
00873     dm_gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00874     dm_pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00875     dm_pitch = pitch_stats.ile (0.5);
00876   }
00877   gap_stats.clear ();
00878   pitch_stats.clear ();
00879   if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
00880   initial_pitch, min_space, TRUE, FALSE, 0)) {
00881     gap_iqr = 0.0001;
00882     pitch_iqr = maxwidth * 3.0f;
00883   }
00884   else {
00885     gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00886     pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00887     if (testing_on)
00888       tprintf
00889         ("First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
00890         initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
00891     initial_pitch = pitch_stats.ile (0.5);
00892     if (min_space > initial_pitch
00893       && count_pitch_stats (row, &gap_stats, &pitch_stats,
00894     initial_pitch, initial_pitch, TRUE, FALSE, 0)) {
00895       min_space = initial_pitch;
00896       gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00897       pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00898       if (testing_on)
00899         tprintf
00900           ("Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
00901           initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
00902       initial_pitch = pitch_stats.ile (0.5);
00903     }
00904   }
00905   if (textord_debug_pitch_metric)
00906     tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:",
00907             block_index, row_index, 'X',
00908             pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
00909             pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth ? 'D' :
00910               (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
00911   if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
00912     row->pitch_decision = PITCH_DUNNO;
00913     if (textord_debug_pitch_metric)
00914       tprintf ("\n");
00915     return FALSE;                //insufficient data
00916   }
00917   if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
00918     if (testing_on)
00919       tprintf
00920         ("Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
00921         pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
00922     gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00923     pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00924     pitch = pitch_stats.ile (0.5);
00925     used_dm_model = FALSE;
00926   }
00927   else {
00928     if (testing_on)
00929       tprintf
00930         ("Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
00931         pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
00932     gap_iqr = dm_gap_iqr;
00933     pitch_iqr = dm_pitch_iqr;
00934     pitch = dm_pitch;
00935     used_dm_model = TRUE;
00936   }
00937   if (textord_debug_pitch_metric) {
00938     tprintf ("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:",
00939       pitch_iqr, gap_iqr, pitch);
00940     tprintf ("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:",
00941       pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
00942       pitch_iqr < gap_iqr * textord_fpiqr_ratio
00943       && pitch_iqr < block->xheight * textord_max_pitch_iqr
00944       && pitch < block->xheight * textord_words_default_maxspace
00945       ? 'F' : 'P');
00946   }
00947   if (pitch_iqr < gap_iqr * textord_fpiqr_ratio
00948     && pitch_iqr < block->xheight * textord_max_pitch_iqr
00949     && pitch < block->xheight * textord_words_default_maxspace)
00950     row->pitch_decision = PITCH_MAYBE_FIXED;
00951   else
00952     row->pitch_decision = PITCH_MAYBE_PROP;
00953   row->fixed_pitch = pitch;
00954   row->kern_size = gap_stats.ile (0.5);
00955   row->min_space = (inT32) (row->fixed_pitch + non_space) / 2;
00956   if (row->min_space > row->fixed_pitch)
00957     row->min_space = (inT32) row->fixed_pitch;
00958   row->max_nonspace = row->min_space;
00959   row->space_size = row->fixed_pitch;
00960   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
00961   row->used_dm_model = used_dm_model;
00962   return TRUE;
00963 }
00964 
00965 
00966 /**********************************************************************
00967  * fixed_pitch_row
00968  *
00969  * Check to see if this row could be fixed pitch using the given spacings.
00970  * Blobs with gaps smaller than the lower threshold are assumed to be one.
00971  * The larger threshold is the word gap threshold.
00972  **********************************************************************/
00973 
00974 BOOL8 fixed_pitch_row(TO_ROW *row,       // row to do
00975                       BLOCK* block,
00976                       inT32 block_index  // block_number
00977                      ) {
00978   const char *res_string;        // pitch result
00979   inT16 mid_cuts;                // no of cheap cuts
00980   float non_space;               // gap size
00981   float pitch_sd;                // error on pitch
00982   float sp_sd = 0.0f;            // space sd
00983 
00984   non_space = row->fp_nonsp;
00985   if (non_space > row->fixed_pitch)
00986     non_space = row->fixed_pitch;
00987   POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
00988   if (textord_all_prop || (pb != NULL && !pb->IsText())) {
00989     // Set the decision to definitely proportional.
00990     pitch_sd = textord_words_def_prop * row->fixed_pitch;
00991     row->pitch_decision = PITCH_DEF_PROP;
00992   } else {
00993     pitch_sd = tune_row_pitch (row, &row->projection, row->projection_left,
00994                                row->projection_right,
00995                                (row->fixed_pitch + non_space * 3) / 4,
00996                                row->fixed_pitch, sp_sd, mid_cuts,
00997                                &row->char_cells,
00998                                block_index == textord_debug_block);
00999     if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch
01000       && ((pitsync_linear_version & 3) < 3
01001       || ((pitsync_linear_version & 3) >= 3 && (row->used_dm_model
01002       || sp_sd > 20
01003     || (pitch_sd == 0 && sp_sd > 10))))) {
01004       if (pitch_sd < textord_words_def_fixed * row->fixed_pitch
01005         && !row->all_caps
01006         && ((pitsync_linear_version & 3) < 3 || sp_sd > 20))
01007         row->pitch_decision = PITCH_DEF_FIXED;
01008       else
01009         row->pitch_decision = PITCH_MAYBE_FIXED;
01010     }
01011     else if ((pitsync_linear_version & 3) < 3
01012       || sp_sd > 20
01013       || mid_cuts > 0
01014       || pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
01015       if (pitch_sd < textord_words_def_prop * row->fixed_pitch)
01016         row->pitch_decision = PITCH_MAYBE_PROP;
01017       else
01018         row->pitch_decision = PITCH_DEF_PROP;
01019     }
01020     else
01021       row->pitch_decision = PITCH_DUNNO;
01022   }
01023 
01024   if (textord_debug_pitch_metric) {
01025     res_string = "??";
01026     switch (row->pitch_decision) {
01027       case PITCH_DEF_PROP:
01028         res_string = "DP";
01029         break;
01030       case PITCH_MAYBE_PROP:
01031         res_string = "MP";
01032         break;
01033       case PITCH_DEF_FIXED:
01034         res_string = "DF";
01035         break;
01036       case PITCH_MAYBE_FIXED:
01037         res_string = "MF";
01038         break;
01039       default:
01040         res_string = "??";
01041     }
01042     tprintf (":sd/p=%g:occ=%g:init_res=%s\n",
01043       pitch_sd / row->fixed_pitch, sp_sd, res_string);
01044   }
01045   return TRUE;
01046 }
01047 
01048 
01049 /**********************************************************************
01050  * count_pitch_stats
01051  *
01052  * Count up the gap and pitch stats on the block to see if it is fixed pitch.
01053  * Blobs with gaps smaller than the lower threshold are assumed to be one.
01054  * The larger threshold is the word gap threshold.
01055  * The return value indicates whether there were any decent values to use.
01056  **********************************************************************/
01057 
01058 BOOL8 count_pitch_stats(                       //find lines
01059                         TO_ROW *row,           //row to do
01060                         STATS *gap_stats,      //blob gaps
01061                         STATS *pitch_stats,    //centre-centre stats
01062                         float initial_pitch,   //guess at pitch
01063                         float min_space,       //estimate space size
01064                         BOOL8 ignore_outsize,  //discard big objects
01065                         BOOL8 split_outsize,   //split big objects
01066                         inT32 dm_gap           //ignorable gaps
01067                        ) {
01068   BOOL8 prev_valid;              //not word broken
01069   BLOBNBOX *blob;                //current blob
01070                                  //blobs
01071   BLOBNBOX_IT blob_it = row->blob_list ();
01072   inT32 prev_right;              //end of prev blob
01073   inT32 prev_centre;             //centre of previous blob
01074   inT32 x_centre;                //centre of this blob
01075   inT32 blob_width;              //width of blob
01076   inT32 width_units;             //no of widths in blob
01077   float width;                   //blob width
01078   TBOX blob_box;                  //bounding box
01079   TBOX joined_box;                //of super blob
01080 
01081   gap_stats->clear ();
01082   pitch_stats->clear ();
01083   if (blob_it.empty ())
01084     return FALSE;
01085   prev_valid = FALSE;
01086   prev_centre = 0;
01087   prev_right = 0;                //stop compiler warning
01088   joined_box = blob_it.data ()->bounding_box ();
01089   do {
01090     blob_it.forward ();
01091     blob = blob_it.data ();
01092     if (!blob->joined_to_prev ()) {
01093       blob_box = blob->bounding_box ();
01094       if ((blob_box.left () - joined_box.right () < dm_gap
01095         && !blob_it.at_first ())
01096         || blob->cblob() == NULL)
01097         joined_box += blob_box;  //merge blobs
01098       else {
01099         blob_width = joined_box.width ();
01100         if (split_outsize) {
01101           width_units =
01102             (inT32) floor ((float) blob_width / initial_pitch + 0.5);
01103           if (width_units < 1)
01104             width_units = 1;
01105           width_units--;
01106         }
01107         else if (ignore_outsize) {
01108           width = (float) blob_width / initial_pitch;
01109           width_units = width < 1 + words_default_fixed_limit
01110             && width > 1 - words_default_fixed_limit ? 0 : -1;
01111         }
01112         else
01113           width_units = 0;       //everything in
01114         x_centre = (inT32) (joined_box.left ()
01115           + (blob_width -
01116           width_units * initial_pitch) / 2);
01117         if (prev_valid && width_units >= 0) {
01118           //                                              if (width_units>0)
01119           //                                              {
01120           //                                                      tprintf("wu=%d, width=%d, xc=%d, adding %d\n",
01121           //                                                              width_units,blob_width,x_centre,x_centre-prev_centre);
01122           //                                              }
01123           gap_stats->add (joined_box.left () - prev_right, 1);
01124           pitch_stats->add (x_centre - prev_centre, 1);
01125         }
01126         prev_centre = (inT32) (x_centre + width_units * initial_pitch);
01127         prev_right = joined_box.right ();
01128         prev_valid = blob_box.left () - joined_box.right () < min_space;
01129         prev_valid = prev_valid && width_units >= 0;
01130         joined_box = blob_box;
01131       }
01132     }
01133   }
01134   while (!blob_it.at_first ());
01135   return gap_stats->get_total () >= 3;
01136 }
01137 
01138 
01139 /**********************************************************************
01140  * tune_row_pitch
01141  *
01142  * Use a dp algorithm to fit the character cells and return the sd of
01143  * the cell size over the row.
01144  **********************************************************************/
01145 
01146 float tune_row_pitch(                             //find fp cells
01147                      TO_ROW *row,                 //row to do
01148                      STATS *projection,           //vertical projection
01149                      inT16 projection_left,       //edge of projection
01150                      inT16 projection_right,      //edge of projection
01151                      float space_size,            //size of blank
01152                      float &initial_pitch,        //guess at pitch
01153                      float &best_sp_sd,           //space sd
01154                      inT16 &best_mid_cuts,        //no of cheap cuts
01155                      ICOORDELT_LIST *best_cells,  //row cells
01156                      BOOL8 testing_on             //inidividual words
01157                     ) {
01158   int pitch_delta;               //offset pitch
01159   inT16 mid_cuts;                //cheap cuts
01160   float pitch_sd;                //current sd
01161   float best_sd;                 //best result
01162   float best_pitch;              //pitch for best result
01163   float initial_sd;              //starting error
01164   float sp_sd;                   //space sd
01165   ICOORDELT_LIST test_cells;     //row cells
01166   ICOORDELT_IT best_it;          //start of best list
01167 
01168   if (textord_fast_pitch_test)
01169     return tune_row_pitch2 (row, projection, projection_left,
01170       projection_right, space_size, initial_pitch,
01171       best_sp_sd,
01172     //space sd
01173       best_mid_cuts, best_cells, testing_on);
01174   if (textord_disable_pitch_test) {
01175     best_sp_sd = initial_pitch;
01176     return initial_pitch;
01177   }
01178   initial_sd =
01179     compute_pitch_sd(row,
01180                      projection,
01181                      projection_left,
01182                      projection_right,
01183                      space_size,
01184                      initial_pitch,
01185                      best_sp_sd,
01186                      best_mid_cuts,
01187                      best_cells,
01188                      testing_on);
01189   best_sd = initial_sd;
01190   best_pitch = initial_pitch;
01191   if (testing_on)
01192     tprintf ("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
01193   for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
01194     pitch_sd =
01195       compute_pitch_sd (row, projection, projection_left, projection_right,
01196       space_size, initial_pitch + pitch_delta, sp_sd,
01197       mid_cuts, &test_cells, testing_on);
01198     if (testing_on)
01199       tprintf ("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta,
01200         pitch_sd);
01201     if (pitch_sd < best_sd) {
01202       best_sd = pitch_sd;
01203       best_mid_cuts = mid_cuts;
01204       best_sp_sd = sp_sd;
01205       best_pitch = initial_pitch + pitch_delta;
01206       best_cells->clear ();
01207       best_it.set_to_list (best_cells);
01208       best_it.add_list_after (&test_cells);
01209     }
01210     else
01211       test_cells.clear ();
01212     if (pitch_sd > initial_sd)
01213       break;                     //getting worse
01214   }
01215   for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
01216     pitch_sd =
01217       compute_pitch_sd (row, projection, projection_left, projection_right,
01218       space_size, initial_pitch - pitch_delta, sp_sd,
01219       mid_cuts, &test_cells, testing_on);
01220     if (testing_on)
01221       tprintf ("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta,
01222         pitch_sd);
01223     if (pitch_sd < best_sd) {
01224       best_sd = pitch_sd;
01225       best_mid_cuts = mid_cuts;
01226       best_sp_sd = sp_sd;
01227       best_pitch = initial_pitch - pitch_delta;
01228       best_cells->clear ();
01229       best_it.set_to_list (best_cells);
01230       best_it.add_list_after (&test_cells);
01231     }
01232     else
01233       test_cells.clear ();
01234     if (pitch_sd > initial_sd)
01235       break;
01236   }
01237   initial_pitch = best_pitch;
01238 
01239   if (textord_debug_pitch_metric)
01240     print_pitch_sd(row,
01241                    projection,
01242                    projection_left,
01243                    projection_right,
01244                    space_size,
01245                    best_pitch);
01246 
01247   return best_sd;
01248 }
01249 
01250 
01251 /**********************************************************************
01252  * tune_row_pitch
01253  *
01254  * Use a dp algorithm to fit the character cells and return the sd of
01255  * the cell size over the row.
01256  **********************************************************************/
01257 
01258 float tune_row_pitch2(                             //find fp cells
01259                       TO_ROW *row,                 //row to do
01260                       STATS *projection,           //vertical projection
01261                       inT16 projection_left,       //edge of projection
01262                       inT16 projection_right,      //edge of projection
01263                       float space_size,            //size of blank
01264                       float &initial_pitch,        //guess at pitch
01265                       float &best_sp_sd,           //space sd
01266                       inT16 &best_mid_cuts,        //no of cheap cuts
01267                       ICOORDELT_LIST *best_cells,  //row cells
01268                       BOOL8 testing_on             //inidividual words
01269                      ) {
01270   int pitch_delta;               //offset pitch
01271   inT16 pixel;                   //pixel coord
01272   inT16 best_pixel;              //pixel coord
01273   inT16 best_delta;              //best pitch
01274   inT16 best_pitch;              //best pitch
01275   inT16 start;                   //of good range
01276   inT16 end;                     //of good range
01277   inT32 best_count;              //lowest sum
01278   float best_sd;                 //best result
01279   STATS *sum_proj;               //summed projection
01280 
01281   best_sp_sd = initial_pitch;
01282 
01283   best_pitch = static_cast<int>(initial_pitch);
01284   if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) {
01285     return initial_pitch;
01286   }
01287   sum_proj = new STATS[textord_pitch_range * 2 + 1];
01288   if (sum_proj == NULL)
01289     return initial_pitch;
01290 
01291   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
01292     pitch_delta++)
01293   sum_proj[textord_pitch_range + pitch_delta].set_range (0,
01294       best_pitch +
01295       pitch_delta + 1);
01296   for (pixel = projection_left; pixel <= projection_right; pixel++) {
01297     for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
01298          pitch_delta++) {
01299       sum_proj[textord_pitch_range + pitch_delta].add(
01300           (pixel - projection_left) % (best_pitch + pitch_delta),
01301           projection->pile_count(pixel));
01302     }
01303   }
01304   best_count = sum_proj[textord_pitch_range].pile_count (0);
01305   best_delta = 0;
01306   best_pixel = 0;
01307   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
01308   pitch_delta++) {
01309     for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
01310       if (sum_proj[textord_pitch_range + pitch_delta].pile_count (pixel)
01311       < best_count) {
01312         best_count =
01313           sum_proj[textord_pitch_range +
01314           pitch_delta].pile_count (pixel);
01315         best_delta = pitch_delta;
01316         best_pixel = pixel;
01317       }
01318     }
01319   }
01320   if (testing_on)
01321     tprintf ("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n",
01322       initial_pitch, best_delta, best_count);
01323   best_pitch += best_delta;
01324   initial_pitch = best_pitch;
01325   best_count++;
01326   best_count += best_count;
01327   for (start = best_pixel - 2; start > best_pixel - best_pitch
01328     && sum_proj[textord_pitch_range +
01329     best_delta].pile_count (start % best_pitch) <= best_count;
01330     start--);
01331   for (end = best_pixel + 2;
01332     end < best_pixel + best_pitch
01333     && sum_proj[textord_pitch_range +
01334     best_delta].pile_count (end % best_pitch) <= best_count;
01335     end++);
01336 
01337   best_sd =
01338     compute_pitch_sd(row,
01339                      projection,
01340                      projection_left,
01341                      projection_right,
01342                      space_size,
01343                      initial_pitch,
01344                      best_sp_sd,
01345                      best_mid_cuts,
01346                      best_cells,
01347                      testing_on,
01348                      start,
01349                      end);
01350   if (testing_on)
01351     tprintf ("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch,
01352       best_sd);
01353 
01354   if (textord_debug_pitch_metric)
01355     print_pitch_sd(row,
01356                    projection,
01357                    projection_left,
01358                    projection_right,
01359                    space_size,
01360                    initial_pitch);
01361 
01362   delete[]sum_proj;
01363 
01364   return best_sd;
01365 }
01366 
01367 
01368 /**********************************************************************
01369  * compute_pitch_sd
01370  *
01371  * Use a dp algorithm to fit the character cells and return the sd of
01372  * the cell size over the row.
01373  **********************************************************************/
01374 
01375 float compute_pitch_sd(                            //find fp cells
01376                        TO_ROW *row,                //row to do
01377                        STATS *projection,          //vertical projection
01378                        inT16 projection_left,      //edge
01379                        inT16 projection_right,     //edge
01380                        float space_size,           //size of blank
01381                        float initial_pitch,        //guess at pitch
01382                        float &sp_sd,               //space sd
01383                        inT16 &mid_cuts,            //no of free cuts
01384                        ICOORDELT_LIST *row_cells,  //list of chop pts
01385                        BOOL8 testing_on,           //inidividual words
01386                        inT16 start,                //start of good range
01387                        inT16 end                   //end of good range
01388                       ) {
01389   inT16 occupation;              //no of cells in word.
01390                                  //blobs
01391   BLOBNBOX_IT blob_it = row->blob_list ();
01392   BLOBNBOX_IT start_it;          //start of word
01393   BLOBNBOX_IT plot_it;           //for plotting
01394   inT16 blob_count;              //no of blobs
01395   TBOX blob_box;                  //bounding box
01396   TBOX prev_box;                  //of super blob
01397   inT32 prev_right;              //of word sync
01398   int scale_factor;              //on scores for big words
01399   inT32 sp_count;                //spaces
01400   FPSEGPT_LIST seg_list;         //char cells
01401   FPSEGPT_IT seg_it;             //iterator
01402   inT16 segpos;                  //position of segment
01403   inT16 cellpos;                 //previous cell boundary
01404                                  //iterator
01405   ICOORDELT_IT cell_it = row_cells;
01406   ICOORDELT *cell;               //new cell
01407   double sqsum;                  //sum of squares
01408   double spsum;                  //of spaces
01409   double sp_var;                 //space error
01410   double word_sync;              //result for word
01411   inT32 total_count;             //total blobs
01412 
01413   if ((pitsync_linear_version & 3) > 1) {
01414     word_sync = compute_pitch_sd2 (row, projection, projection_left,
01415       projection_right, initial_pitch,
01416       occupation, mid_cuts, row_cells,
01417       testing_on, start, end);
01418     sp_sd = occupation;
01419     return word_sync;
01420   }
01421   mid_cuts = 0;
01422   cellpos = 0;
01423   total_count = 0;
01424   sqsum = 0;
01425   sp_count = 0;
01426   spsum = 0;
01427   prev_right = -1;
01428   if (blob_it.empty ())
01429     return space_size * 10;
01430 #ifndef GRAPHICS_DISABLED
01431   if (testing_on && to_win != NULL) {
01432     blob_box = blob_it.data ()->bounding_box ();
01433     projection->plot (to_win, projection_left,
01434       row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
01435   }
01436 #endif
01437   start_it = blob_it;
01438   blob_count = 0;
01439   blob_box = box_next (&blob_it);//first blob
01440   blob_it.mark_cycle_pt ();
01441   do {
01442     for (; blob_count > 0; blob_count--)
01443       box_next(&start_it);
01444     do {
01445       prev_box = blob_box;
01446       blob_count++;
01447       blob_box = box_next (&blob_it);
01448     }
01449     while (!blob_it.cycled_list ()
01450       && blob_box.left () - prev_box.right () < space_size);
01451     plot_it = start_it;
01452     if (pitsync_linear_version & 3)
01453       word_sync =
01454         check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2,
01455         projection, projection_left, projection_right,
01456         row->xheight * textord_projection_scale,
01457         occupation, &seg_list, start, end);
01458     else
01459       word_sync =
01460         check_pitch_sync (&start_it, blob_count, (inT16) initial_pitch, 2,
01461         projection, &seg_list);
01462     if (testing_on) {
01463       tprintf ("Word ending at (%d,%d), len=%d, sync rating=%g, ",
01464         prev_box.right (), prev_box.top (),
01465         seg_list.length () - 1, word_sync);
01466       seg_it.set_to_list (&seg_list);
01467       for (seg_it.mark_cycle_pt (); !seg_it.cycled_list ();
01468       seg_it.forward ()) {
01469         if (seg_it.data ()->faked)
01470           tprintf ("(F)");
01471         tprintf ("%d, ", seg_it.data ()->position ());
01472         //                              tprintf("C=%g, s=%g, sq=%g\n",
01473         //                                      seg_it.data()->cost_function(),
01474         //                                      seg_it.data()->sum(),
01475         //                                      seg_it.data()->squares());
01476       }
01477       tprintf ("\n");
01478     }
01479 #ifndef GRAPHICS_DISABLED
01480     if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL)
01481       plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
01482 #endif
01483     seg_it.set_to_list (&seg_list);
01484     if (prev_right >= 0) {
01485       sp_var = seg_it.data ()->position () - prev_right;
01486       sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
01487       sp_var *= sp_var;
01488       spsum += sp_var;
01489       sp_count++;
01490     }
01491     for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
01492       segpos = seg_it.data ()->position ();
01493       if (cell_it.empty () || segpos > cellpos + initial_pitch / 2) {
01494                                  //big gap
01495         while (!cell_it.empty () && segpos > cellpos + initial_pitch * 3 / 2) {
01496           cell = new ICOORDELT (cellpos + (inT16) initial_pitch, 0);
01497           cell_it.add_after_then_move (cell);
01498           cellpos += (inT16) initial_pitch;
01499         }
01500                                  //make new one
01501         cell = new ICOORDELT (segpos, 0);
01502         cell_it.add_after_then_move (cell);
01503         cellpos = segpos;
01504       }
01505       else if (segpos > cellpos - initial_pitch / 2) {
01506         cell = cell_it.data ();
01507                                  //average positions
01508         cell->set_x ((cellpos + segpos) / 2);
01509         cellpos = cell->x ();
01510       }
01511     }
01512     seg_it.move_to_last ();
01513     prev_right = seg_it.data ()->position ();
01514     if (textord_pitch_scalebigwords) {
01515       scale_factor = (seg_list.length () - 2) / 2;
01516       if (scale_factor < 1)
01517         scale_factor = 1;
01518     }
01519     else
01520       scale_factor = 1;
01521     sqsum += word_sync * scale_factor;
01522     total_count += (seg_list.length () - 1) * scale_factor;
01523     seg_list.clear ();
01524   }
01525   while (!blob_it.cycled_list ());
01526   sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
01527   return total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
01528 }
01529 
01530 
01531 /**********************************************************************
01532  * compute_pitch_sd2
01533  *
01534  * Use a dp algorithm to fit the character cells and return the sd of
01535  * the cell size over the row.
01536  **********************************************************************/
01537 
01538 float compute_pitch_sd2(                            //find fp cells
01539                         TO_ROW *row,                //row to do
01540                         STATS *projection,          //vertical projection
01541                         inT16 projection_left,      //edge
01542                         inT16 projection_right,     //edge
01543                         float initial_pitch,        //guess at pitch
01544                         inT16 &occupation,          //no of occupied cells
01545                         inT16 &mid_cuts,            //no of free cuts
01546                         ICOORDELT_LIST *row_cells,  //list of chop pts
01547                         BOOL8 testing_on,           //inidividual words
01548                         inT16 start,                //start of good range
01549                         inT16 end                   //end of good range
01550                        ) {
01551                                  //blobs
01552   BLOBNBOX_IT blob_it = row->blob_list ();
01553   BLOBNBOX_IT plot_it;
01554   inT16 blob_count;              //no of blobs
01555   TBOX blob_box;                  //bounding box
01556   FPSEGPT_LIST seg_list;         //char cells
01557   FPSEGPT_IT seg_it;             //iterator
01558   inT16 segpos;                  //position of segment
01559                                  //iterator
01560   ICOORDELT_IT cell_it = row_cells;
01561   ICOORDELT *cell;               //new cell
01562   double word_sync;              //result for word
01563 
01564   mid_cuts = 0;
01565   if (blob_it.empty ()) {
01566     occupation = 0;
01567     return initial_pitch * 10;
01568   }
01569 #ifndef GRAPHICS_DISABLED
01570   if (testing_on && to_win != NULL) {
01571     projection->plot (to_win, projection_left,
01572       row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
01573   }
01574 #endif
01575   blob_count = 0;
01576   blob_it.mark_cycle_pt ();
01577   do {
01578                                  //first blob
01579     blob_box = box_next (&blob_it);
01580     blob_count++;
01581   }
01582   while (!blob_it.cycled_list ());
01583   plot_it = blob_it;
01584   word_sync = check_pitch_sync2 (&blob_it, blob_count, (inT16) initial_pitch,
01585     2, projection, projection_left,
01586     projection_right,
01587     row->xheight * textord_projection_scale,
01588     occupation, &seg_list, start, end);
01589   if (testing_on) {
01590     tprintf ("Row ending at (%d,%d), len=%d, sync rating=%g, ",
01591       blob_box.right (), blob_box.top (),
01592       seg_list.length () - 1, word_sync);
01593     seg_it.set_to_list (&seg_list);
01594     for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
01595       if (seg_it.data ()->faked)
01596         tprintf ("(F)");
01597       tprintf ("%d, ", seg_it.data ()->position ());
01598       //                              tprintf("C=%g, s=%g, sq=%g\n",
01599       //                                      seg_it.data()->cost_function(),
01600       //                                      seg_it.data()->sum(),
01601       //                                      seg_it.data()->squares());
01602     }
01603     tprintf ("\n");
01604   }
01605 #ifndef GRAPHICS_DISABLED
01606   if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL)
01607     plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
01608 #endif
01609   seg_it.set_to_list (&seg_list);
01610   for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
01611     segpos = seg_it.data ()->position ();
01612                                  //make new one
01613     cell = new ICOORDELT (segpos, 0);
01614     cell_it.add_after_then_move (cell);
01615     if (seg_it.at_last ())
01616       mid_cuts = seg_it.data ()->cheap_cuts ();
01617   }
01618   seg_list.clear ();
01619   return occupation > 0 ? sqrt (word_sync / occupation) : initial_pitch * 10;
01620 }
01621 
01622 
01623 /**********************************************************************
01624  * print_pitch_sd
01625  *
01626  * Use a dp algorithm to fit the character cells and return the sd of
01627  * the cell size over the row.
01628  **********************************************************************/
01629 
01630 void print_pitch_sd(                        //find fp cells
01631                     TO_ROW *row,            //row to do
01632                     STATS *projection,      //vertical projection
01633                     inT16 projection_left,  //edges //size of blank
01634                     inT16 projection_right,
01635                     float space_size,
01636                     float initial_pitch     //guess at pitch
01637                    ) {
01638   const char *res2;              //pitch result
01639   inT16 occupation;              //used cells
01640   float sp_sd;                   //space sd
01641                                  //blobs
01642   BLOBNBOX_IT blob_it = row->blob_list ();
01643   BLOBNBOX_IT start_it;          //start of word
01644   BLOBNBOX_IT row_start;         //start of row
01645   inT16 blob_count;              //no of blobs
01646   inT16 total_blob_count;        //total blobs in line
01647   TBOX blob_box;                  //bounding box
01648   TBOX prev_box;                  //of super blob
01649   inT32 prev_right;              //of word sync
01650   int scale_factor;              //on scores for big words
01651   inT32 sp_count;                //spaces
01652   FPSEGPT_LIST seg_list;         //char cells
01653   FPSEGPT_IT seg_it;             //iterator
01654   double sqsum;                  //sum of squares
01655   double spsum;                  //of spaces
01656   double sp_var;                 //space error
01657   double word_sync;              //result for word
01658   double total_count;            //total cuts
01659 
01660   if (blob_it.empty ())
01661     return;
01662   row_start = blob_it;
01663   total_blob_count = 0;
01664 
01665   total_count = 0;
01666   sqsum = 0;
01667   sp_count = 0;
01668   spsum = 0;
01669   prev_right = -1;
01670   blob_it = row_start;
01671   start_it = blob_it;
01672   blob_count = 0;
01673   blob_box = box_next (&blob_it);//first blob
01674   blob_it.mark_cycle_pt ();
01675   do {
01676     for (; blob_count > 0; blob_count--)
01677       box_next(&start_it);
01678     do {
01679       prev_box = blob_box;
01680       blob_count++;
01681       blob_box = box_next (&blob_it);
01682     }
01683     while (!blob_it.cycled_list ()
01684       && blob_box.left () - prev_box.right () < space_size);
01685     word_sync =
01686       check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2,
01687       projection, projection_left, projection_right,
01688       row->xheight * textord_projection_scale,
01689       occupation, &seg_list, 0, 0);
01690     total_blob_count += blob_count;
01691     seg_it.set_to_list (&seg_list);
01692     if (prev_right >= 0) {
01693       sp_var = seg_it.data ()->position () - prev_right;
01694       sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
01695       sp_var *= sp_var;
01696       spsum += sp_var;
01697       sp_count++;
01698     }
01699     seg_it.move_to_last ();
01700     prev_right = seg_it.data ()->position ();
01701     if (textord_pitch_scalebigwords) {
01702       scale_factor = (seg_list.length () - 2) / 2;
01703       if (scale_factor < 1)
01704         scale_factor = 1;
01705     }
01706     else
01707       scale_factor = 1;
01708     sqsum += word_sync * scale_factor;
01709     total_count += (seg_list.length () - 1) * scale_factor;
01710     seg_list.clear ();
01711   }
01712   while (!blob_it.cycled_list ());
01713   sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
01714   word_sync = total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
01715   tprintf ("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:",
01716     word_sync, word_sync / initial_pitch, sp_sd,
01717     word_sync < textord_words_pitchsd_threshold * initial_pitch
01718     ? 'F' : 'P');
01719 
01720   start_it = row_start;
01721   blob_it = row_start;
01722   word_sync =
01723     check_pitch_sync2 (&blob_it, total_blob_count, (inT16) initial_pitch, 2,
01724     projection, projection_left, projection_right,
01725     row->xheight * textord_projection_scale, occupation,
01726     &seg_list, 0, 0);
01727   if (occupation > 1)
01728     word_sync /= occupation;
01729   word_sync = sqrt (word_sync);
01730 
01731 #ifndef GRAPHICS_DISABLED
01732   if (textord_show_row_cuts && to_win != NULL)
01733     plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list);
01734 #endif
01735   seg_list.clear ();
01736   if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {
01737     if (word_sync < textord_words_def_fixed * initial_pitch
01738       && !row->all_caps)
01739       res2 = "DF";
01740     else
01741       res2 = "MF";
01742   }
01743   else
01744     res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
01745   tprintf
01746     ("row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, all_caps=%d\n",
01747     word_sync, word_sync / initial_pitch,
01748     word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P',
01749     occupation, res2, initial_pitch, row->fixed_pitch, row->all_caps);
01750 }
01751 
01752 /**********************************************************************
01753  * find_repeated_chars
01754  *
01755  * Extract marked leader blobs and put them
01756  * into words in advance of fixed pitch checking and word generation.
01757  **********************************************************************/
01758 void find_repeated_chars(TO_BLOCK *block,       // Block to search.
01759                          BOOL8 testing_on) {    // Debug mode.
01760   POLY_BLOCK* pb = block->block->poly_block();
01761   if (pb != NULL && !pb->IsText())
01762     return;  // Don't find repeated chars in non-text blocks.
01763 
01764   TO_ROW *row;
01765   BLOBNBOX_IT box_it;
01766   BLOBNBOX_IT search_it;         // forward search
01767   WERD_IT word_it;               // new words
01768   WERD *word;                    // new word
01769   TBOX word_box;                 // for plotting
01770   int blobcount, repeated_set;
01771 
01772   TO_ROW_IT row_it = block->get_rows();
01773   if (row_it.empty()) return;  // empty block
01774   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01775     row = row_it.data();
01776     box_it.set_to_list(row->blob_list());
01777     if (box_it.empty())  continue; // no blobs in this row
01778     if (!row->rep_chars_marked()) {
01779       mark_repeated_chars(row);
01780     }
01781     if (row->num_repeated_sets() == 0) continue;  // nothing to do for this row
01782     word_it.set_to_list(&row->rep_words);
01783     do {
01784       if (box_it.data()->repeated_set() != 0 &&
01785           !box_it.data()->joined_to_prev()) {
01786         blobcount = 1;
01787         repeated_set = box_it.data()->repeated_set();
01788         search_it = box_it;
01789         search_it.forward();
01790         while (!search_it.at_first() &&
01791                search_it.data()->repeated_set() == repeated_set) {
01792           blobcount++;
01793           search_it.forward();
01794         }
01795         // After the call to make_real_word() all the blobs from this
01796         // repeated set will be removed from the blob list. box_it will be
01797         // set to point to the blob after the end of the extracted sequence.
01798         word = make_real_word(&box_it, blobcount, box_it.at_first(), 1);
01799         if (!box_it.empty() && box_it.data()->joined_to_prev()) {
01800           tprintf("Bad box joined to prev at");
01801           box_it.data()->bounding_box().print();
01802           tprintf("After repeated word:");
01803           word->bounding_box().print();
01804         }
01805         ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev());
01806         word->set_flag(W_REP_CHAR, true);
01807         word->set_flag(W_DONT_CHOP, true);
01808         word_it.add_after_then_move(word);
01809       } else {
01810         box_it.forward();
01811       }
01812     } while (!box_it.at_first());
01813   }
01814 }
01815 
01816 
01817 /**********************************************************************
01818  * plot_fp_word
01819  *
01820  * Plot a block of words as if fixed pitch.
01821  **********************************************************************/
01822 
01823 #ifndef GRAPHICS_DISABLED
01824 void plot_fp_word(                  //draw block of words
01825                   TO_BLOCK *block,  //block to draw
01826                   float pitch,      //pitch to draw with
01827                   float nonspace    //for space threshold
01828                  ) {
01829   TO_ROW *row;                   //current row
01830   TO_ROW_IT row_it = block->get_rows ();
01831 
01832   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01833     row = row_it.data ();
01834     row->min_space = (inT32) ((pitch + nonspace) / 2);
01835     row->max_nonspace = row->min_space;
01836     row->space_threshold = row->min_space;
01837     plot_word_decisions (to_win, (inT16) pitch, row);
01838   }
01839 }
01840 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines