tesseract 3.04.01

textord/makerow.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        makerow.cpp  (Formerly makerows.c)
00003  * Description: Code to arrange blobs into rows of text.
00004  * Author:              Ray Smith
00005  * Created:             Mon Sep 21 14:34:48 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef __UNIX__
00021 #include          <assert.h>
00022 #endif
00023 #include          "stderr.h"
00024 #include          "blobbox.h"
00025 #include          "ccstruct.h"
00026 #include          "detlinefit.h"
00027 #include          "statistc.h"
00028 #include          "drawtord.h"
00029 #include          "blkocc.h"
00030 #include          "sortflts.h"
00031 #include          "oldbasel.h"
00032 #include          "textord.h"
00033 #include          "tordmain.h"
00034 #include          "underlin.h"
00035 #include          "makerow.h"
00036 #include          "tprintf.h"
00037 #include          "tovars.h"
00038 
00039 // Include automatically generated configuration file if running autoconf.
00040 #ifdef HAVE_CONFIG_H
00041 #include "config_auto.h"
00042 #endif
00043 
00044 BOOL_VAR(textord_heavy_nr, FALSE, "Vigorously remove noise");
00045 BOOL_VAR(textord_show_initial_rows, FALSE, "Display row accumulation");
00046 BOOL_VAR(textord_show_parallel_rows, FALSE, "Display page correlated rows");
00047 BOOL_VAR(textord_show_expanded_rows, FALSE, "Display rows after expanding");
00048 BOOL_VAR(textord_show_final_rows, FALSE, "Display rows after final fitting");
00049 BOOL_VAR(textord_show_final_blobs, FALSE, "Display blob bounds after pre-ass");
00050 BOOL_VAR(textord_test_landscape, FALSE, "Tests refer to land/port");
00051 BOOL_VAR(textord_parallel_baselines, TRUE, "Force parallel baselines");
00052 BOOL_VAR(textord_straight_baselines, FALSE, "Force straight baselines");
00053 BOOL_VAR(textord_old_baselines, TRUE, "Use old baseline algorithm");
00054 BOOL_VAR(textord_old_xheight, FALSE, "Use old xheight algorithm");
00055 BOOL_VAR(textord_fix_xheight_bug, TRUE, "Use spline baseline");
00056 BOOL_VAR(textord_fix_makerow_bug, TRUE, "Prevent multiple baselines");
00057 BOOL_VAR(textord_debug_xheights, FALSE, "Test xheight algorithms");
00058 BOOL_VAR(textord_biased_skewcalc, TRUE, "Bias skew estimates with line length");
00059 BOOL_VAR(textord_interpolating_skew, TRUE, "Interpolate across gaps");
00060 INT_VAR(textord_skewsmooth_offset, 4, "For smooth factor");
00061 INT_VAR(textord_skewsmooth_offset2, 1, "For smooth factor");
00062 INT_VAR(textord_test_x, -MAX_INT32, "coord of test pt");
00063 INT_VAR(textord_test_y, -MAX_INT32, "coord of test pt");
00064 INT_VAR(textord_min_blobs_in_row, 4, "Min blobs before gradient counted");
00065 INT_VAR(textord_spline_minblobs, 8, "Min blobs in each spline segment");
00066 INT_VAR(textord_spline_medianwin, 6, "Size of window for spline segmentation");
00067 INT_VAR(textord_max_blob_overlaps, 4,
00068         "Max number of blobs a big blob can overlap");
00069 INT_VAR(textord_min_xheight, 10, "Min credible pixel xheight");
00070 double_VAR(textord_spline_shift_fraction, 0.02,
00071            "Fraction of line spacing for quad");
00072 double_VAR(textord_spline_outlier_fraction, 0.1,
00073            "Fraction of line spacing for outlier");
00074 double_VAR(textord_skew_ile, 0.5, "Ile of gradients for page skew");
00075 double_VAR(textord_skew_lag, 0.02, "Lag for skew on row accumulation");
00076 double_VAR(textord_linespace_iqrlimit, 0.2, "Max iqr/median for linespace");
00077 double_VAR(textord_width_limit, 8, "Max width of blobs to make rows");
00078 double_VAR(textord_chop_width, 1.5, "Max width before chopping");
00079 double_VAR(textord_expansion_factor, 1.0,
00080            "Factor to expand rows by in expand_rows");
00081 double_VAR(textord_overlap_x, 0.375, "Fraction of linespace for good overlap");
00082 double_VAR(textord_minxh, 0.25, "fraction of linesize for min xheight");
00083 double_VAR(textord_min_linesize, 1.25, "* blob height for initial linesize");
00084 double_VAR(textord_excess_blobsize, 1.3,
00085            "New row made if blob makes row this big");
00086 double_VAR(textord_occupancy_threshold, 0.4, "Fraction of neighbourhood");
00087 double_VAR(textord_underline_width, 2.0, "Multiple of line_size for underline");
00088 double_VAR(textord_min_blob_height_fraction, 0.75,
00089            "Min blob height/top to include blob top into xheight stats");
00090 double_VAR(textord_xheight_mode_fraction, 0.4,
00091            "Min pile height to make xheight");
00092 double_VAR(textord_ascheight_mode_fraction, 0.08,
00093            "Min pile height to make ascheight");
00094 double_VAR(textord_descheight_mode_fraction, 0.08,
00095            "Min pile height to make descheight");
00096 double_VAR(textord_ascx_ratio_min, 1.25, "Min cap/xheight");
00097 double_VAR(textord_ascx_ratio_max, 1.8, "Max cap/xheight");
00098 double_VAR(textord_descx_ratio_min, 0.25, "Min desc/xheight");
00099 double_VAR(textord_descx_ratio_max, 0.6, "Max desc/xheight");
00100 double_VAR(textord_xheight_error_margin, 0.1, "Accepted variation");
00101 INT_VAR(textord_lms_line_trials, 12, "Number of linew fits to do");
00102 BOOL_VAR(textord_new_initial_xheight, TRUE, "Use test xheight mechanism");
00103 BOOL_VAR(textord_debug_blob, FALSE, "Print test blob information");
00104 
00105 #define MAX_HEIGHT_MODES  12
00106 
00107 const int kMinLeaderCount = 5;
00108 
00109 // Factored-out helper to build a single row from a list of blobs.
00110 // Returns the mean blob size.
00111 static float MakeRowFromBlobs(float line_size,
00112                               BLOBNBOX_IT* blob_it, TO_ROW_IT* row_it) {
00113   blob_it->sort(blob_x_order);
00114   blob_it->move_to_first();
00115   TO_ROW* row = NULL;
00116   float total_size = 0.0f;
00117   int blob_count = 0;
00118   // Add all the blobs to a single TO_ROW.
00119   for (; !blob_it->empty(); blob_it->forward()) {
00120     BLOBNBOX* blob = blob_it->extract();
00121     int top = blob->bounding_box().top();
00122     int bottom = blob->bounding_box().bottom();
00123     if (row == NULL) {
00124       row = new TO_ROW(blob, top, bottom, line_size);
00125       row_it->add_before_then_move(row);
00126     } else {
00127       row->add_blob(blob, top, bottom, line_size);
00128     }
00129     total_size += top - bottom;
00130     ++blob_count;
00131   }
00132   return blob_count > 0 ? total_size / blob_count : total_size;
00133 }
00134 
00135 // Helper to make a row using the children of a single blob.
00136 // Returns the mean size of the blobs created.
00137 float MakeRowFromSubBlobs(TO_BLOCK* block, C_BLOB* blob, TO_ROW_IT* row_it) {
00138   // The blobs made from the children will go in the small_blobs list.
00139   BLOBNBOX_IT bb_it(&block->small_blobs);
00140   C_OUTLINE_IT ol_it(blob->out_list());
00141   // Get the children.
00142   ol_it.set_to_list(ol_it.data()->child());
00143   if (ol_it.empty())
00144     return 0.0f;
00145   for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
00146     // Deep copy the child outline and use that to make a blob.
00147     C_BLOB* blob = new C_BLOB(C_OUTLINE::deep_copy(ol_it.data()));
00148     // Correct direction as needed.
00149     blob->CheckInverseFlagAndDirection();
00150     BLOBNBOX* bbox = new BLOBNBOX(blob);
00151     bb_it.add_after_then_move(bbox);
00152   }
00153   // Now we can make a row from the blobs.
00154   return MakeRowFromBlobs(block->line_size, &bb_it, row_it);
00155 }
00156 
00164 float make_single_row(ICOORD page_tr, bool allow_sub_blobs,
00165                       TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
00166   BLOBNBOX_IT blob_it = &block->blobs;
00167   TO_ROW_IT row_it = block->get_rows();
00168 
00169   // Include all the small blobs and large blobs.
00170   blob_it.add_list_after(&block->small_blobs);
00171   blob_it.add_list_after(&block->noise_blobs);
00172   blob_it.add_list_after(&block->large_blobs);
00173   if (block->blobs.singleton() && allow_sub_blobs) {
00174     blob_it.move_to_first();
00175     float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
00176     if (size > block->line_size)
00177       block->line_size = size;
00178   } else if (block->blobs.empty()) {
00179     // Make a fake blob.
00180     C_BLOB* blob = C_BLOB::FakeBlob(block->block->bounding_box());
00181     // The blobnbox owns the blob.
00182     BLOBNBOX* bblob = new BLOBNBOX(blob);
00183     blob_it.add_after_then_move(bblob);
00184   }
00185   MakeRowFromBlobs(block->line_size, &blob_it, &row_it);
00186   // Fit an LMS line to the rows.
00187   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward())
00188     fit_lms_line(row_it.data());
00189   float gradient;
00190   float fit_error;
00191   // Compute the skew based on the fitted line.
00192   compute_page_skew(blocks, gradient, fit_error);
00193   return gradient;
00194 }
00195 
00201 float make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks) {
00202   float port_m;                  // global skew
00203   float port_err;                // global noise
00204   TO_BLOCK_IT block_it;          // iterator
00205 
00206   block_it.set_to_list(port_blocks);
00207   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00208        block_it.forward())
00209   make_initial_textrows(page_tr, block_it.data(), FCOORD(1.0f, 0.0f),
00210       !(BOOL8) textord_test_landscape);
00211                                  // compute globally
00212   compute_page_skew(port_blocks, port_m, port_err);
00213   block_it.set_to_list(port_blocks);
00214   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
00215     cleanup_rows_making(page_tr, block_it.data(), port_m, FCOORD(1.0f, 0.0f),
00216                  block_it.data()->block->bounding_box().left(),
00217                  !(BOOL8)textord_test_landscape);
00218   }
00219   return port_m;                 // global skew
00220 }
00221 
00227 void make_initial_textrows(                  //find lines
00228                            ICOORD page_tr,
00229                            TO_BLOCK *block,  //block to do
00230                            FCOORD rotation,  //for drawing
00231                            BOOL8 testing_on  //correct orientation
00232                           ) {
00233   TO_ROW_IT row_it = block->get_rows ();
00234 
00235 #ifndef GRAPHICS_DISABLED
00236   ScrollView::Color colour;                 //of row
00237 
00238   if (textord_show_initial_rows && testing_on) {
00239     if (to_win == NULL)
00240       create_to_win(page_tr);
00241   }
00242 #endif
00243                                  //guess skew
00244   assign_blobs_to_rows (block, NULL, 0, TRUE, TRUE, textord_show_initial_rows && testing_on);
00245   row_it.move_to_first ();
00246   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00247     fit_lms_line (row_it.data ());
00248 #ifndef GRAPHICS_DISABLED
00249   if (textord_show_initial_rows && testing_on) {
00250     colour = ScrollView::RED;
00251     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00252       plot_to_row (row_it.data (), colour, rotation);
00253       colour = (ScrollView::Color) (colour + 1);
00254       if (colour > ScrollView::MAGENTA)
00255         colour = ScrollView::RED;
00256     }
00257   }
00258 #endif
00259 }
00260 
00261 
00267 void fit_lms_line(TO_ROW *row) {
00268   float m, c;                    // fitted line
00269   tesseract::DetLineFit lms;
00270   BLOBNBOX_IT blob_it = row->blob_list();
00271 
00272   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00273     const TBOX& box = blob_it.data()->bounding_box();
00274     lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
00275   }
00276   double error = lms.Fit(&m, &c);
00277   row->set_line(m, c, error);
00278 }
00279 
00280 
00287 void compute_page_skew(                        //get average gradient
00288                        TO_BLOCK_LIST *blocks,  //list of blocks
00289                        float &page_m,          //average gradient
00290                        float &page_err         //average error
00291                       ) {
00292   inT32 row_count;               //total rows
00293   inT32 blob_count;              //total_blobs
00294   inT32 row_err;                 //integer error
00295   float *gradients;              //of rows
00296   float *errors;                 //of rows
00297   inT32 row_index;               //of total
00298   TO_ROW *row;                   //current row
00299   TO_BLOCK_IT block_it = blocks; //iterator
00300   TO_ROW_IT row_it;
00301 
00302   row_count = 0;
00303   blob_count = 0;
00304   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00305        block_it.forward ()) {
00306     POLY_BLOCK* pb = block_it.data()->block->poly_block();
00307     if (pb != NULL && !pb->IsText())
00308       continue;  // Pretend non-text blocks don't exist.
00309     row_count += block_it.data ()->get_rows ()->length ();
00310     //count up rows
00311     row_it.set_to_list (block_it.data ()->get_rows ());
00312     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00313       blob_count += row_it.data ()->blob_list ()->length ();
00314   }
00315   if (row_count == 0) {
00316     page_m = 0.0f;
00317     page_err = 0.0f;
00318     return;
00319   }
00320   gradients = (float *) alloc_mem (blob_count * sizeof (float));
00321   //get mem
00322   errors = (float *) alloc_mem (blob_count * sizeof (float));
00323   if (gradients == NULL || errors == NULL)
00324     MEMORY_OUT.error ("compute_page_skew", ABORT, NULL);
00325 
00326   row_index = 0;
00327   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00328        block_it.forward ()) {
00329     POLY_BLOCK* pb = block_it.data()->block->poly_block();
00330     if (pb != NULL && !pb->IsText())
00331       continue;  // Pretend non-text blocks don't exist.
00332     row_it.set_to_list (block_it.data ()->get_rows ());
00333     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00334       row = row_it.data ();
00335       blob_count = row->blob_list ()->length ();
00336       row_err = (inT32) ceil (row->line_error ());
00337       if (row_err <= 0)
00338         row_err = 1;
00339       if (textord_biased_skewcalc) {
00340         blob_count /= row_err;
00341         for (blob_count /= row_err; blob_count > 0; blob_count--) {
00342           gradients[row_index] = row->line_m ();
00343           errors[row_index] = row->line_error ();
00344           row_index++;
00345         }
00346       }
00347       else if (blob_count >= textord_min_blobs_in_row) {
00348                                  //get gradient
00349         gradients[row_index] = row->line_m ();
00350         errors[row_index] = row->line_error ();
00351         row_index++;
00352       }
00353     }
00354   }
00355   if (row_index == 0) {
00356                                  //desperate
00357     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00358          block_it.forward ()) {
00359       POLY_BLOCK* pb = block_it.data()->block->poly_block();
00360       if (pb != NULL && !pb->IsText())
00361         continue;  // Pretend non-text blocks don't exist.
00362       row_it.set_to_list (block_it.data ()->get_rows ());
00363       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
00364            row_it.forward ()) {
00365         row = row_it.data ();
00366         gradients[row_index] = row->line_m ();
00367         errors[row_index] = row->line_error ();
00368         row_index++;
00369       }
00370     }
00371   }
00372   row_count = row_index;
00373   row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
00374     gradients, row_count);
00375   page_m = gradients[row_index];
00376   row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
00377     errors, row_count);
00378   page_err = errors[row_index];
00379   free_mem(gradients);
00380   free_mem(errors);
00381 }
00382 
00383 const double kNoiseSize = 0.5;  // Fraction of xheight.
00384 const int kMinSize = 8;  // Min pixels to be xheight.
00385 
00390 static bool dot_of_i(BLOBNBOX* dot, BLOBNBOX* i, TO_ROW* row) {
00391   const TBOX& ibox = i->bounding_box();
00392   const TBOX& dotbox = dot->bounding_box();
00393 
00394   // Must overlap horizontally by enough and be high enough.
00395   int overlap = MIN(dotbox.right(), ibox.right()) -
00396                 MAX(dotbox.left(), ibox.left());
00397   if (ibox.height() <= 2 * dotbox.height() ||
00398       (overlap * 2 < ibox.width() && overlap < dotbox.width()))
00399     return false;
00400 
00401   // If the i is tall and thin then it is good.
00402   if (ibox.height() > ibox.width() * 2)
00403     return true;  // The i or ! must be tall and thin.
00404 
00405   // It might still be tall and thin, but it might be joined to something.
00406   // So search the outline for a piece of large height close to the edges
00407   // of the dot.
00408   const double kHeightFraction = 0.6;
00409   double target_height = MIN(dotbox.bottom(), ibox.top());
00410   target_height -= row->line_m()*dotbox.left() + row->line_c();
00411   target_height *= kHeightFraction;
00412   int left_min = dotbox.left() - dotbox.width();
00413   int middle = (dotbox.left() + dotbox.right())/2;
00414   int right_max = dotbox.right() + dotbox.width();
00415   int left_miny = 0;
00416   int left_maxy = 0;
00417   int right_miny = 0;
00418   int right_maxy = 0;
00419   bool found_left = false;
00420   bool found_right = false;
00421   bool in_left = false;
00422   bool in_right = false;
00423   C_BLOB* blob = i->cblob();
00424   C_OUTLINE_IT o_it = blob->out_list();
00425   for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
00426     C_OUTLINE* outline = o_it.data();
00427     int length = outline->pathlength();
00428     ICOORD pos = outline->start_pos();
00429     for (int step = 0; step < length; pos += outline->step(step++)) {
00430       int x = pos.x();
00431       int y = pos.y();
00432       if (x >= left_min && x < middle && !found_left) {
00433         // We are in the left part so find min and max y.
00434         if (in_left) {
00435           if (y > left_maxy) left_maxy = y;
00436           if (y < left_miny) left_miny = y;
00437         } else {
00438           left_maxy = left_miny = y;
00439           in_left = true;
00440         }
00441       } else if (in_left) {
00442         // We just left the left so look for size.
00443         if (left_maxy - left_miny > target_height) {
00444           if (found_right)
00445             return true;
00446           found_left = true;
00447         }
00448         in_left = false;
00449       }
00450       if (x <= right_max && x > middle && !found_right) {
00451         // We are in the right part so find min and max y.
00452         if (in_right) {
00453           if (y > right_maxy) right_maxy = y;
00454           if (y < right_miny) right_miny = y;
00455         } else {
00456           right_maxy = right_miny = y;
00457           in_right = true;
00458         }
00459       } else if (in_right) {
00460         // We just left the right so look for size.
00461         if (right_maxy - right_miny > target_height) {
00462           if (found_left)
00463             return true;
00464           found_right = true;
00465         }
00466         in_right = false;
00467       }
00468     }
00469   }
00470   return false;
00471 }
00472 
00473 void vigorous_noise_removal(TO_BLOCK* block) {
00474   TO_ROW_IT row_it = block->get_rows ();
00475   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00476     TO_ROW* row = row_it.data();
00477     BLOBNBOX_IT b_it = row->blob_list();
00478     // Estimate the xheight on the row.
00479     int max_height = 0;
00480     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00481       BLOBNBOX* blob = b_it.data();
00482       if (blob->bounding_box().height() > max_height)
00483         max_height = blob->bounding_box().height();
00484     }
00485     STATS hstats(0, max_height + 1);
00486     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00487       BLOBNBOX* blob = b_it.data();
00488       int height = blob->bounding_box().height();
00489       if (height >= kMinSize)
00490         hstats.add(blob->bounding_box().height(), 1);
00491     }
00492     float xheight = hstats.median();
00493     // Delete small objects.
00494     BLOBNBOX* prev = NULL;
00495     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00496       BLOBNBOX* blob = b_it.data();
00497       const TBOX& box = blob->bounding_box();
00498       if (box.height() < kNoiseSize * xheight) {
00499         // Small so delete unless it looks like an i dot.
00500         if (prev != NULL) {
00501           if (dot_of_i(blob, prev, row))
00502             continue;  // Looks OK.
00503         }
00504         if (!b_it.at_last()) {
00505           BLOBNBOX* next = b_it.data_relative(1);
00506           if (dot_of_i(blob, next, row))
00507             continue;  // Looks OK.
00508         }
00509         // It might be noise so get rid of it.
00510         if (blob->cblob() != NULL)
00511           delete blob->cblob();
00512         delete b_it.extract();
00513       } else {
00514         prev = blob;
00515       }
00516     }
00517   }
00518 }
00519 
00525 void cleanup_rows_making(                   //find lines
00526                   ICOORD page_tr,    //top right
00527                   TO_BLOCK *block,   //block to do
00528                   float gradient,    //gradient to fit
00529                   FCOORD rotation,   //for drawing
00530                   inT32 block_edge,  //edge of block
00531                   BOOL8 testing_on  //correct orientation
00532                  ) {
00533                                  //iterators
00534   BLOBNBOX_IT blob_it = &block->blobs;
00535   TO_ROW_IT row_it = block->get_rows ();
00536 
00537 #ifndef GRAPHICS_DISABLED
00538   if (textord_show_parallel_rows && testing_on) {
00539     if (to_win == NULL)
00540       create_to_win(page_tr);
00541   }
00542 #endif
00543                                  //get row coords
00544   fit_parallel_rows(block,
00545                     gradient,
00546                     rotation,
00547                     block_edge,
00548                     textord_show_parallel_rows &&testing_on);
00549   delete_non_dropout_rows(block,
00550                           gradient,
00551                           rotation,
00552                           block_edge,
00553                           textord_show_parallel_rows &&testing_on);
00554   expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
00555   blob_it.set_to_list (&block->blobs);
00556   row_it.set_to_list (block->get_rows ());
00557   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00558     blob_it.add_list_after (row_it.data ()->blob_list ());
00559   //give blobs back
00560   assign_blobs_to_rows (block, &gradient, 1, FALSE, FALSE, FALSE);
00561   //now new rows must be genuine
00562   blob_it.set_to_list (&block->blobs);
00563   blob_it.add_list_after (&block->large_blobs);
00564   assign_blobs_to_rows (block, &gradient, 2, TRUE, TRUE, FALSE);
00565   //safe to use big ones now
00566   blob_it.set_to_list (&block->blobs);
00567                                  //throw all blobs in
00568   blob_it.add_list_after (&block->noise_blobs);
00569   blob_it.add_list_after (&block->small_blobs);
00570   assign_blobs_to_rows (block, &gradient, 3, FALSE, FALSE, FALSE);
00571 }
00572 
00578 void delete_non_dropout_rows(                   //find lines
00579                              TO_BLOCK *block,   //block to do
00580                              float gradient,    //global skew
00581                              FCOORD rotation,   //deskew vector
00582                              inT32 block_edge,  //left edge
00583                              BOOL8 testing_on   //correct orientation
00584                             ) {
00585   TBOX block_box;                 //deskewed block
00586   inT32 *deltas;                 //change in occupation
00587   inT32 *occupation;             //of pixel coords
00588   inT32 max_y;                   //in block
00589   inT32 min_y;
00590   inT32 line_index;              //of scan line
00591   inT32 line_count;              //no of scan lines
00592   inT32 distance;                //to drop-out
00593   inT32 xleft;                   //of block
00594   inT32 ybottom;                 //of block
00595   TO_ROW *row;                   //current row
00596   TO_ROW_IT row_it = block->get_rows ();
00597   BLOBNBOX_IT blob_it = &block->blobs;
00598 
00599   if (row_it.length () == 0)
00600     return;                      //empty block
00601   block_box = deskew_block_coords (block, gradient);
00602   xleft = block->block->bounding_box ().left ();
00603   ybottom = block->block->bounding_box ().bottom ();
00604   min_y = block_box.bottom () - 1;
00605   max_y = block_box.top () + 1;
00606   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00607     line_index = (inT32) floor (row_it.data ()->intercept ());
00608     if (line_index <= min_y)
00609       min_y = line_index - 1;
00610     if (line_index >= max_y)
00611       max_y = line_index + 1;
00612   }
00613   line_count = max_y - min_y + 1;
00614   if (line_count <= 0)
00615     return;                      //empty block
00616   deltas = (inT32 *) alloc_mem (line_count * sizeof (inT32));
00617   occupation = (inT32 *) alloc_mem (line_count * sizeof (inT32));
00618   if (deltas == NULL || occupation == NULL)
00619     MEMORY_OUT.error ("compute_line_spacing", ABORT, NULL);
00620 
00621   compute_line_occupation(block, gradient, min_y, max_y, occupation, deltas);
00622   compute_occupation_threshold ((inT32)
00623     ceil (block->line_spacing *
00624     (tesseract::CCStruct::kDescenderFraction +
00625     tesseract::CCStruct::kAscenderFraction)),
00626     (inT32) ceil (block->line_spacing *
00627     (tesseract::CCStruct::kXHeightFraction +
00628     tesseract::CCStruct::kAscenderFraction)),
00629     max_y - min_y + 1, occupation, deltas);
00630 #ifndef GRAPHICS_DISABLED
00631   if (testing_on) {
00632     draw_occupation(xleft, ybottom, min_y, max_y, occupation, deltas);
00633   }
00634 #endif
00635   compute_dropout_distances(occupation, deltas, line_count);
00636   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00637     row = row_it.data ();
00638     line_index = (inT32) floor (row->intercept ());
00639     distance = deltas[line_index - min_y];
00640     if (find_best_dropout_row (row, distance, block->line_spacing / 2,
00641     line_index, &row_it, testing_on)) {
00642 #ifndef GRAPHICS_DISABLED
00643       if (testing_on)
00644         plot_parallel_row(row, gradient, block_edge,
00645                           ScrollView::WHITE, rotation);
00646 #endif
00647       blob_it.add_list_after (row_it.data ()->blob_list ());
00648       delete row_it.extract ();  //too far away
00649     }
00650   }
00651   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00652     blob_it.add_list_after (row_it.data ()->blob_list ());
00653   }
00654 
00655   free_mem(deltas);
00656   free_mem(occupation);
00657 }
00658 
00659 
00666 BOOL8 find_best_dropout_row(                    //find neighbours
00667                             TO_ROW *row,        //row to test
00668                             inT32 distance,     //dropout dist
00669                             float dist_limit,   //threshold distance
00670                             inT32 line_index,   //index of row
00671                             TO_ROW_IT *row_it,  //current position
00672                             BOOL8 testing_on    //correct orientation
00673                            ) {
00674   inT32 next_index;              //of neighbouring row
00675   inT32 row_offset;              //from current row
00676   inT32 abs_dist;                //absolute distance
00677   inT8 row_inc;                  //increment to row_index
00678   TO_ROW *next_row;              //nextious row
00679 
00680   if (testing_on)
00681     tprintf ("Row at %g(%g), dropout dist=%d,",
00682       row->intercept (), row->parallel_c (), distance);
00683   if (distance < 0) {
00684     row_inc = 1;
00685     abs_dist = -distance;
00686   }
00687   else {
00688     row_inc = -1;
00689     abs_dist = distance;
00690   }
00691   if (abs_dist > dist_limit) {
00692     if (testing_on) {
00693       tprintf (" too far - deleting\n");
00694     }
00695     return TRUE;
00696   }
00697   if ((distance < 0 && !row_it->at_last ())
00698   || (distance >= 0 && !row_it->at_first ())) {
00699     row_offset = row_inc;
00700     do {
00701       next_row = row_it->data_relative (row_offset);
00702       next_index = (inT32) floor (next_row->intercept ());
00703       if ((distance < 0
00704         && next_index < line_index
00705         && next_index > line_index + distance + distance)
00706         || (distance >= 0
00707         && next_index > line_index
00708       && next_index < line_index + distance + distance)) {
00709         if (testing_on) {
00710           tprintf (" nearer neighbour (%d) at %g\n",
00711             line_index + distance - next_index,
00712             next_row->intercept ());
00713         }
00714         return TRUE;             //other is nearer
00715       }
00716       else if (next_index == line_index
00717       || next_index == line_index + distance + distance) {
00718         if (row->believability () <= next_row->believability ()) {
00719           if (testing_on) {
00720             tprintf (" equal but more believable at %g (%g/%g)\n",
00721               next_row->intercept (),
00722               row->believability (),
00723               next_row->believability ());
00724           }
00725           return TRUE;           //other is more believable
00726         }
00727       }
00728       row_offset += row_inc;
00729     }
00730     while ((next_index == line_index
00731       || next_index == line_index + distance + distance)
00732       && row_offset < row_it->length ());
00733     if (testing_on)
00734       tprintf (" keeping\n");
00735   }
00736   return FALSE;
00737 }
00738 
00739 
00746 TBOX deskew_block_coords(                  //block box
00747                         TO_BLOCK *block,  //block to do
00748                         float gradient    //global skew
00749                        ) {
00750   TBOX result;                    //block bounds
00751   TBOX blob_box;                  //of block
00752   FCOORD rotation;               //deskew vector
00753   float length;                  //of gradient vector
00754   TO_ROW_IT row_it = block->get_rows ();
00755   TO_ROW *row;                   //current row
00756   BLOBNBOX *blob;                //current blob
00757   BLOBNBOX_IT blob_it;           //iterator
00758 
00759   length = sqrt (gradient * gradient + 1);
00760   rotation = FCOORD (1 / length, -gradient / length);
00761   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00762     row = row_it.data ();
00763     blob_it.set_to_list (row->blob_list ());
00764     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00765     blob_it.forward ()) {
00766       blob = blob_it.data ();
00767       blob_box = blob->bounding_box ();
00768       blob_box.rotate (rotation);//de-skew it
00769       result += blob_box;
00770     }
00771   }
00772   return result;
00773 }
00774 
00775 
00782 void compute_line_occupation(                    //project blobs
00783                              TO_BLOCK *block,    //block to do
00784                              float gradient,     //global skew
00785                              inT32 min_y,        //min coord in block
00786                              inT32 max_y,        //in block
00787                              inT32 *occupation,  //output projection
00788                              inT32 *deltas       //derivative
00789                             ) {
00790   inT32 line_count;              //maxy-miny+1
00791   inT32 line_index;              //of scan line
00792   int index;                     //array index for daft compilers
00793   float top, bottom;             //coords of blob
00794   inT32 width;                   //of blob
00795   TO_ROW *row;                   //current row
00796   TO_ROW_IT row_it = block->get_rows ();
00797   BLOBNBOX *blob;                //current blob
00798   BLOBNBOX_IT blob_it;           //iterator
00799   float length;                  //of skew vector
00800   TBOX blob_box;                  //bounding box
00801   FCOORD rotation;               //inverse of skew
00802 
00803   line_count = max_y - min_y + 1;
00804   length = sqrt (gradient * gradient + 1);
00805   rotation = FCOORD (1 / length, -gradient / length);
00806   for (line_index = 0; line_index < line_count; line_index++)
00807     deltas[line_index] = 0;
00808   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00809     row = row_it.data ();
00810     blob_it.set_to_list (row->blob_list ());
00811     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00812     blob_it.forward ()) {
00813       blob = blob_it.data ();
00814       blob_box = blob->bounding_box ();
00815       blob_box.rotate (rotation);//de-skew it
00816       top = blob_box.top ();
00817       bottom = blob_box.bottom ();
00818       width =
00819         (inT32) floor ((FLOAT32) (blob_box.right () - blob_box.left ()));
00820       if ((inT32) floor (bottom) < min_y
00821         || (inT32) floor (bottom) - min_y >= line_count)
00822         fprintf (stderr,
00823           "Bad y coord of bottom, " INT32FORMAT "(" INT32FORMAT ","
00824           INT32FORMAT ")\n", (inT32) floor (bottom), min_y, max_y);
00825                                  //count transitions
00826       index = (inT32) floor (bottom) - min_y;
00827       deltas[index] += width;
00828       if ((inT32) floor (top) < min_y
00829         || (inT32) floor (top) - min_y >= line_count)
00830         fprintf (stderr,
00831           "Bad y coord of top, " INT32FORMAT "(" INT32FORMAT ","
00832           INT32FORMAT ")\n", (inT32) floor (top), min_y, max_y);
00833       index = (inT32) floor (top) - min_y;
00834       deltas[index] -= width;
00835     }
00836   }
00837   occupation[0] = deltas[0];
00838   for (line_index = 1; line_index < line_count; line_index++)
00839     occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
00840 }
00841 
00842 
00848 void compute_occupation_threshold(                    //project blobs
00849                                   inT32 low_window,   //below result point
00850                                   inT32 high_window,  //above result point
00851                                   inT32 line_count,   //array sizes
00852                                   inT32 *occupation,  //input projection
00853                                   inT32 *thresholds   //output thresholds
00854                                  ) {
00855   inT32 line_index;              //of thresholds line
00856   inT32 low_index;               //in occupation
00857   inT32 high_index;              //in occupation
00858   inT32 sum;                     //current average
00859   inT32 divisor;                 //to get thresholds
00860   inT32 min_index;               //of min occ
00861   inT32 min_occ;                 //min in locality
00862   inT32 test_index;              //for finding min
00863 
00864   divisor =
00865     (inT32) ceil ((low_window + high_window) / textord_occupancy_threshold);
00866   if (low_window + high_window < line_count) {
00867     for (sum = 0, high_index = 0; high_index < low_window; high_index++)
00868       sum += occupation[high_index];
00869     for (low_index = 0; low_index < high_window; low_index++, high_index++)
00870       sum += occupation[high_index];
00871     min_occ = occupation[0];
00872     min_index = 0;
00873     for (test_index = 1; test_index < high_index; test_index++) {
00874       if (occupation[test_index] <= min_occ) {
00875         min_occ = occupation[test_index];
00876         min_index = test_index;  //find min in region
00877       }
00878     }
00879     for (line_index = 0; line_index < low_window; line_index++)
00880       thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
00881     //same out to end
00882     for (low_index = 0; high_index < line_count; low_index++, high_index++) {
00883       sum -= occupation[low_index];
00884       sum += occupation[high_index];
00885       if (occupation[high_index] <= min_occ) {
00886                                  //find min in region
00887         min_occ = occupation[high_index];
00888         min_index = high_index;
00889       }
00890                                  //lost min from region
00891       if (min_index <= low_index) {
00892         min_occ = occupation[low_index + 1];
00893         min_index = low_index + 1;
00894         for (test_index = low_index + 2; test_index <= high_index;
00895         test_index++) {
00896           if (occupation[test_index] <= min_occ) {
00897             min_occ = occupation[test_index];
00898                                  //find min in region
00899             min_index = test_index;
00900           }
00901         }
00902       }
00903       thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
00904     }
00905   }
00906   else {
00907     min_occ = occupation[0];
00908     min_index = 0;
00909     for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
00910       if (occupation[low_index] < min_occ) {
00911         min_occ = occupation[low_index];
00912         min_index = low_index;
00913       }
00914       sum += occupation[low_index];
00915     }
00916     line_index = 0;
00917   }
00918   for (; line_index < line_count; line_index++)
00919     thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
00920   //same out to end
00921 }
00922 
00923 
00929 void compute_dropout_distances(                    //project blobs
00930                                inT32 *occupation,  //input projection
00931                                inT32 *thresholds,  //output thresholds
00932                                inT32 line_count    //array sizes
00933                               ) {
00934   inT32 line_index;              //of thresholds line
00935   inT32 distance;                //from prev dropout
00936   inT32 next_dist;               //to next dropout
00937   inT32 back_index;              //for back filling
00938   inT32 prev_threshold;          //before overwrite
00939 
00940   distance = -line_count;
00941   line_index = 0;
00942   do {
00943     do {
00944       distance--;
00945       prev_threshold = thresholds[line_index];
00946                                  //distance from prev
00947       thresholds[line_index] = distance;
00948       line_index++;
00949     }
00950     while (line_index < line_count
00951       && (occupation[line_index] < thresholds[line_index]
00952       || occupation[line_index - 1] >= prev_threshold));
00953     if (line_index < line_count) {
00954       back_index = line_index - 1;
00955       next_dist = 1;
00956       while (next_dist < -distance && back_index >= 0) {
00957         thresholds[back_index] = next_dist;
00958         back_index--;
00959         next_dist++;
00960         distance++;
00961       }
00962       distance = 1;
00963     }
00964   }
00965   while (line_index < line_count);
00966 }
00967 
00968 
00976 void expand_rows(                   //find lines
00977                  ICOORD page_tr,    //top right
00978                  TO_BLOCK *block,   //block to do
00979                  float gradient,    //gradient to fit
00980                  FCOORD rotation,   //for drawing
00981                  inT32 block_edge,  //edge of block
00982                  BOOL8 testing_on   //correct orientation
00983                 ) {
00984   BOOL8 swallowed_row;           //eaten a neighbour
00985   float y_max, y_min;            //new row limits
00986   float y_bottom, y_top;         //allowed limits
00987   TO_ROW *test_row;              //next row
00988   TO_ROW *row;                   //current row
00989                                  //iterators
00990   BLOBNBOX_IT blob_it = &block->blobs;
00991   TO_ROW_IT row_it = block->get_rows ();
00992 
00993 #ifndef GRAPHICS_DISABLED
00994   if (textord_show_expanded_rows && testing_on) {
00995     if (to_win == NULL)
00996       create_to_win(page_tr);
00997   }
00998 #endif
00999 
01000   adjust_row_limits(block);  //shift min,max.
01001   if (textord_new_initial_xheight) {
01002     if (block->get_rows ()->length () == 0)
01003       return;
01004     compute_row_stats(block, textord_show_expanded_rows &&testing_on);
01005   }
01006   assign_blobs_to_rows (block, &gradient, 4, TRUE, FALSE, FALSE);
01007   //get real membership
01008   if (block->get_rows ()->length () == 0)
01009     return;
01010   fit_parallel_rows(block,
01011                     gradient,
01012                     rotation,
01013                     block_edge,
01014                     textord_show_expanded_rows &&testing_on);
01015   if (!textord_new_initial_xheight)
01016     compute_row_stats(block, textord_show_expanded_rows &&testing_on);
01017   row_it.move_to_last ();
01018   do {
01019     row = row_it.data ();
01020     y_max = row->max_y ();       //get current limits
01021     y_min = row->min_y ();
01022     y_bottom = row->intercept () - block->line_size * textord_expansion_factor *
01023       tesseract::CCStruct::kDescenderFraction;
01024     y_top = row->intercept () + block->line_size * textord_expansion_factor *
01025         (tesseract::CCStruct::kXHeightFraction +
01026          tesseract::CCStruct::kAscenderFraction);
01027     if (y_min > y_bottom) {      //expansion allowed
01028       if (textord_show_expanded_rows && testing_on)
01029         tprintf("Expanding bottom of row at %f from %f to %f\n",
01030                 row->intercept(), y_min, y_bottom);
01031                                  //expandable
01032       swallowed_row = TRUE;
01033       while (swallowed_row && !row_it.at_last ()) {
01034         swallowed_row = FALSE;
01035                                  //get next one
01036         test_row = row_it.data_relative (1);
01037                                  //overlaps space
01038         if (test_row->max_y () > y_bottom) {
01039           if (test_row->min_y () > y_bottom) {
01040             if (textord_show_expanded_rows && testing_on)
01041               tprintf("Eating row below at %f\n", test_row->intercept());
01042             row_it.forward ();
01043 #ifndef GRAPHICS_DISABLED
01044             if (textord_show_expanded_rows && testing_on)
01045               plot_parallel_row(test_row,
01046                                 gradient,
01047                                 block_edge,
01048                                 ScrollView::WHITE,
01049                                 rotation);
01050 #endif
01051             blob_it.set_to_list (row->blob_list ());
01052             blob_it.add_list_after (test_row->blob_list ());
01053                                  //swallow complete row
01054             delete row_it.extract ();
01055             row_it.backward ();
01056             swallowed_row = TRUE;
01057           }
01058           else if (test_row->max_y () < y_min) {
01059                                  //shorter limit
01060             y_bottom = test_row->max_y ();
01061             if (textord_show_expanded_rows && testing_on)
01062               tprintf("Truncating limit to %f due to touching row at %f\n",
01063                       y_bottom, test_row->intercept());
01064           }
01065           else {
01066             y_bottom = y_min;    //can't expand it
01067             if (textord_show_expanded_rows && testing_on)
01068               tprintf("Not expanding limit beyond %f due to touching row at %f\n",
01069                       y_bottom, test_row->intercept());
01070           }
01071         }
01072       }
01073       y_min = y_bottom;          //expand it
01074     }
01075     if (y_max < y_top) {         //expansion allowed
01076       if (textord_show_expanded_rows && testing_on)
01077         tprintf("Expanding top of row at %f from %f to %f\n",
01078                 row->intercept(), y_max, y_top);
01079       swallowed_row = TRUE;
01080       while (swallowed_row && !row_it.at_first ()) {
01081         swallowed_row = FALSE;
01082                                  //get one above
01083         test_row = row_it.data_relative (-1);
01084         if (test_row->min_y () < y_top) {
01085           if (test_row->max_y () < y_top) {
01086             if (textord_show_expanded_rows && testing_on)
01087               tprintf("Eating row above at %f\n", test_row->intercept());
01088             row_it.backward ();
01089             blob_it.set_to_list (row->blob_list ());
01090 #ifndef GRAPHICS_DISABLED
01091             if (textord_show_expanded_rows && testing_on)
01092               plot_parallel_row(test_row,
01093                                 gradient,
01094                                 block_edge,
01095                                 ScrollView::WHITE,
01096                                 rotation);
01097 #endif
01098             blob_it.add_list_after (test_row->blob_list ());
01099                                  //swallow complete row
01100             delete row_it.extract ();
01101             row_it.forward ();
01102             swallowed_row = TRUE;
01103           }
01104           else if (test_row->min_y () < y_max) {
01105                                  //shorter limit
01106             y_top = test_row->min_y ();
01107             if (textord_show_expanded_rows && testing_on)
01108               tprintf("Truncating limit to %f due to touching row at %f\n",
01109                       y_top, test_row->intercept());
01110           }
01111           else {
01112             y_top = y_max;       //can't expand it
01113             if (textord_show_expanded_rows && testing_on)
01114               tprintf("Not expanding limit beyond %f due to touching row at %f\n",
01115                       y_top, test_row->intercept());
01116           }
01117         }
01118       }
01119       y_max = y_top;
01120     }
01121                                  //new limits
01122     row->set_limits (y_min, y_max);
01123     row_it.backward ();
01124   }
01125   while (!row_it.at_last ());
01126 }
01127 
01128 
01134 void adjust_row_limits(                 //tidy limits
01135                        TO_BLOCK *block  //block to do
01136                       ) {
01137   TO_ROW *row;                   //current row
01138   float size;                    //size of row
01139   float ymax;                    //top of row
01140   float ymin;                    //bottom of row
01141   TO_ROW_IT row_it = block->get_rows ();
01142 
01143   if (textord_show_expanded_rows)
01144     tprintf("Adjusting row limits for block(%d,%d)\n",
01145             block->block->bounding_box().left(),
01146             block->block->bounding_box().top());
01147   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01148     row = row_it.data ();
01149     size = row->max_y () - row->min_y ();
01150     if (textord_show_expanded_rows)
01151       tprintf("Row at %f has min %f, max %f, size %f\n",
01152               row->intercept(), row->min_y(), row->max_y(), size);
01153     size /= tesseract::CCStruct::kXHeightFraction +
01154         tesseract::CCStruct::kAscenderFraction +
01155         tesseract::CCStruct::kDescenderFraction;
01156     ymax = size * (tesseract::CCStruct::kXHeightFraction +
01157                    tesseract::CCStruct::kAscenderFraction);
01158     ymin = -size * tesseract::CCStruct::kDescenderFraction;
01159     row->set_limits (row->intercept () + ymin, row->intercept () + ymax);
01160     row->merged = FALSE;
01161   }
01162 }
01163 
01164 
01170 void compute_row_stats(                  //find lines
01171                        TO_BLOCK *block,  //block to do
01172                        BOOL8 testing_on  //correct orientation
01173                       ) {
01174   inT32 row_index;               //of median
01175   TO_ROW *row;                   //current row
01176   TO_ROW *prev_row;              //previous row
01177   float iqr;                     //inter quartile range
01178   TO_ROW_IT row_it = block->get_rows ();
01179                                  //number of rows
01180   inT16 rowcount = row_it.length ();
01181   TO_ROW **rows;                 //for choose nth
01182 
01183   rows = (TO_ROW **) alloc_mem (rowcount * sizeof (TO_ROW *));
01184   if (rows == NULL)
01185     MEMORY_OUT.error ("compute_row_stats", ABORT, NULL);
01186   rowcount = 0;
01187   prev_row = NULL;
01188   row_it.move_to_last ();        //start at bottom
01189   do {
01190     row = row_it.data ();
01191     if (prev_row != NULL) {
01192       rows[rowcount++] = prev_row;
01193       prev_row->spacing = row->intercept () - prev_row->intercept ();
01194       if (testing_on)
01195         tprintf ("Row at %g yields spacing of %g\n",
01196           row->intercept (), prev_row->spacing);
01197     }
01198     prev_row = row;
01199     row_it.backward ();
01200   }
01201   while (!row_it.at_last ());
01202   block->key_row = prev_row;
01203   block->baseline_offset =
01204     fmod (prev_row->parallel_c (), block->line_spacing);
01205   if (testing_on)
01206     tprintf ("Blob based spacing=(%g,%g), offset=%g",
01207       block->line_size, block->line_spacing, block->baseline_offset);
01208   if (rowcount > 0) {
01209     row_index = choose_nth_item (rowcount * 3 / 4, rows, rowcount,
01210       sizeof (TO_ROW *), row_spacing_order);
01211     iqr = rows[row_index]->spacing;
01212     row_index = choose_nth_item (rowcount / 4, rows, rowcount,
01213       sizeof (TO_ROW *), row_spacing_order);
01214     iqr -= rows[row_index]->spacing;
01215     row_index = choose_nth_item (rowcount / 2, rows, rowcount,
01216       sizeof (TO_ROW *), row_spacing_order);
01217     block->key_row = rows[row_index];
01218     if (testing_on)
01219       tprintf (" row based=%g(%g)", rows[row_index]->spacing, iqr);
01220     if (rowcount > 2
01221     && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
01222       if (!textord_new_initial_xheight) {
01223         if (rows[row_index]->spacing < block->line_spacing
01224           && rows[row_index]->spacing > block->line_size)
01225           //within range
01226           block->line_size = rows[row_index]->spacing;
01227         //spacing=size
01228         else if (rows[row_index]->spacing > block->line_spacing)
01229           block->line_size = block->line_spacing;
01230         //too big so use max
01231       }
01232       else {
01233         if (rows[row_index]->spacing < block->line_spacing)
01234           block->line_size = rows[row_index]->spacing;
01235         else
01236           block->line_size = block->line_spacing;
01237         //too big so use max
01238       }
01239       if (block->line_size < textord_min_xheight)
01240         block->line_size = (float) textord_min_xheight;
01241       block->line_spacing = rows[row_index]->spacing;
01242       block->max_blob_size =
01243         block->line_spacing * textord_excess_blobsize;
01244     }
01245     block->baseline_offset = fmod (rows[row_index]->intercept (),
01246       block->line_spacing);
01247   }
01248   if (testing_on)
01249     tprintf ("\nEstimate line size=%g, spacing=%g, offset=%g\n",
01250       block->line_size, block->line_spacing, block->baseline_offset);
01251   free_mem(rows);
01252 }
01253 
01254 
01284 namespace tesseract {
01285 void Textord::compute_block_xheight(TO_BLOCK *block, float gradient) {
01286   TO_ROW *row;                          // current row
01287   float asc_frac_xheight = CCStruct::kAscenderFraction /
01288       CCStruct::kXHeightFraction;
01289   float desc_frac_xheight = CCStruct::kDescenderFraction /
01290       CCStruct::kXHeightFraction;
01291   inT32 min_height, max_height;         // limits on xheight
01292   TO_ROW_IT row_it = block->get_rows();
01293   if (row_it.empty()) return;  // no rows
01294 
01295   // Compute the best guess of xheight of each row individually.
01296   // Use xheight and ascrise values of the rows where ascenders were found.
01297   get_min_max_xheight(block->line_size, &min_height, &max_height);
01298   STATS row_asc_xheights(min_height, max_height + 1);
01299   STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight),
01300                         static_cast<int>(max_height * asc_frac_xheight) + 1);
01301   int min_desc_height = static_cast<int>(min_height * desc_frac_xheight);
01302   int max_desc_height = static_cast<int>(max_height * desc_frac_xheight);
01303   STATS row_asc_descdrop(min_desc_height, max_desc_height + 1);
01304   STATS row_desc_xheights(min_height, max_height + 1);
01305   STATS row_desc_descdrop(min_desc_height, max_desc_height + 1);
01306   STATS row_cap_xheights(min_height, max_height + 1);
01307   STATS row_cap_floating_xheights(min_height, max_height + 1);
01308   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01309     row = row_it.data();
01310     // Compute the xheight of this row if it has not been computed before.
01311     if (row->xheight <= 0.0) {
01312       compute_row_xheight(row, block->block->classify_rotation(),
01313                           gradient, block->line_size);
01314     }
01315     ROW_CATEGORY row_category = get_row_category(row);
01316     if (row_category == ROW_ASCENDERS_FOUND) {
01317       row_asc_xheights.add(static_cast<inT32>(row->xheight),
01318                            row->xheight_evidence);
01319       row_asc_ascrise.add(static_cast<inT32>(row->ascrise),
01320                           row->xheight_evidence);
01321       row_asc_descdrop.add(static_cast<inT32>(-row->descdrop),
01322                            row->xheight_evidence);
01323     } else if (row_category == ROW_DESCENDERS_FOUND) {
01324       row_desc_xheights.add(static_cast<inT32>(row->xheight),
01325                             row->xheight_evidence);
01326       row_desc_descdrop.add(static_cast<inT32>(-row->descdrop),
01327                             row->xheight_evidence);
01328     } else if (row_category == ROW_UNKNOWN) {
01329       fill_heights(row, gradient, min_height, max_height,
01330                    &row_cap_xheights, &row_cap_floating_xheights);
01331     }
01332   }
01333 
01334   float xheight = 0.0;
01335   float ascrise = 0.0;
01336   float descdrop = 0.0;
01337   // Compute our best guess of xheight of this block.
01338   if (row_asc_xheights.get_total() > 0) {
01339     // Determine xheight from rows where ascenders were found.
01340     xheight = row_asc_xheights.median();
01341     ascrise = row_asc_ascrise.median();
01342     descdrop = -row_asc_descdrop.median();
01343   } else if (row_desc_xheights.get_total() > 0) {
01344     // Determine xheight from rows where descenders were found.
01345     xheight = row_desc_xheights.median();
01346     descdrop = -row_desc_descdrop.median();
01347   } else if (row_cap_xheights.get_total() > 0) {
01348     // All the rows in the block were (a/de)scenderless.
01349     // Try to search for two modes in row_cap_heights that could
01350     // be the xheight and the capheight (e.g. some of the rows
01351     // were lowercase, but did not have enough (a/de)scenders.
01352     // If such two modes can not be found, this block is most
01353     // likely all caps (or all small caps, in which case the code
01354     // still works as intended).
01355     compute_xheight_from_modes(&row_cap_xheights, &row_cap_floating_xheights,
01356                                textord_single_height_mode &&
01357                                block->block->classify_rotation().y() == 0.0,
01358                                min_height, max_height, &(xheight), &(ascrise));
01359     if (ascrise == 0) {  // assume only caps in the whole block
01360       xheight = row_cap_xheights.median() * CCStruct::kXHeightCapRatio;
01361     }
01362   } else {  // default block sizes
01363     xheight = block->line_size * CCStruct::kXHeightFraction;
01364   }
01365   // Correct xheight, ascrise and descdrop if necessary.
01366   bool corrected_xheight = false;
01367   if (xheight < textord_min_xheight) {
01368     xheight = static_cast<float>(textord_min_xheight);
01369     corrected_xheight = true;
01370   }
01371   if (corrected_xheight || ascrise <= 0.0) {
01372     ascrise = xheight * asc_frac_xheight;
01373   }
01374   if (corrected_xheight || descdrop >= 0.0) {
01375     descdrop = -(xheight * desc_frac_xheight);
01376   }
01377   block->xheight = xheight;
01378 
01379   if (textord_debug_xheights) {
01380     tprintf("Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n",
01381             xheight, ascrise, descdrop);
01382   }
01383   // Correct xheight, ascrise, descdrop of rows based on block averages.
01384   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01385     correct_row_xheight(row_it.data(), xheight, ascrise, descdrop);
01386   }
01387 }
01388 
01397 void Textord::compute_row_xheight(TO_ROW *row,          // row to do
01398                                   const FCOORD& rotation,
01399                                   float gradient,       // global skew
01400                                   int block_line_size) {
01401   // Find blobs representing repeated characters in rows and mark them.
01402   // This information is used for computing row xheight and at a later
01403   // stage when words are formed by make_words.
01404   if (!row->rep_chars_marked()) {
01405     mark_repeated_chars(row);
01406   }
01407 
01408   int min_height, max_height;
01409   get_min_max_xheight(block_line_size, &min_height, &max_height);
01410   STATS heights(min_height, max_height + 1);
01411   STATS floating_heights(min_height, max_height + 1);
01412   fill_heights(row, gradient, min_height, max_height,
01413                &heights, &floating_heights);
01414   row->ascrise = 0.0f;
01415   row->xheight = 0.0f;
01416   row->xheight_evidence =
01417     compute_xheight_from_modes(&heights, &floating_heights,
01418                                textord_single_height_mode &&
01419                                rotation.y() == 0.0,
01420                                min_height, max_height,
01421                                &(row->xheight), &(row->ascrise));
01422   row->descdrop = 0.0f;
01423   if (row->xheight > 0.0) {
01424     row->descdrop = static_cast<float>(
01425         compute_row_descdrop(row, gradient, row->xheight_evidence, &heights));
01426   }
01427 }
01428 
01429 }  // namespace tesseract.
01430 
01437 void fill_heights(TO_ROW *row, float gradient, int min_height,
01438                   int max_height, STATS *heights, STATS *floating_heights) {
01439   float xcentre;                 // centre of blob
01440   float top;                     // top y coord of blob
01441   float height;                  // height of blob
01442   BLOBNBOX *blob;                // current blob
01443   int repeated_set;
01444   BLOBNBOX_IT blob_it = row->blob_list();
01445   if (blob_it.empty()) return;  // no blobs in this row
01446   bool has_rep_chars =
01447     row->rep_chars_marked() && row->num_repeated_sets() > 0;
01448   do {
01449     blob = blob_it.data();
01450     if (!blob->joined_to_prev()) {
01451       xcentre = (blob->bounding_box().left() +
01452                  blob->bounding_box().right()) / 2.0f;
01453       top = blob->bounding_box().top();
01454       height = blob->bounding_box().height();
01455       if (textord_fix_xheight_bug)
01456         top -= row->baseline.y(xcentre);
01457       else
01458         top -= gradient * xcentre + row->parallel_c();
01459       if (top >= min_height && top <= max_height) {
01460         heights->add(static_cast<inT32>(floor(top + 0.5)), 1);
01461         if (height / top < textord_min_blob_height_fraction) {
01462           floating_heights->add(static_cast<inT32>(floor(top + 0.5)), 1);
01463         }
01464       }
01465     }
01466     // Skip repeated chars, since they are likely to skew the height stats.
01467     if (has_rep_chars && blob->repeated_set() != 0) {
01468       repeated_set = blob->repeated_set();
01469       blob_it.forward();
01470       while (!blob_it.at_first() &&
01471              blob_it.data()->repeated_set() == repeated_set) {
01472         blob_it.forward();
01473         if (textord_debug_xheights)
01474           tprintf("Skipping repeated char when computing xheight\n");
01475       }
01476     } else {
01477       blob_it.forward();
01478     }
01479   } while (!blob_it.at_first());
01480 }
01481 
01498 int compute_xheight_from_modes(
01499     STATS *heights, STATS *floating_heights, bool cap_only, int min_height,
01500     int max_height, float *xheight, float *ascrise) {
01501   int blob_index = heights->mode();  // find mode
01502   int blob_count = heights->pile_count(blob_index);  // get count of mode
01503   if (textord_debug_xheights) {
01504     tprintf("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n",
01505             min_height, max_height, blob_index, blob_count,
01506             heights->get_total());
01507     heights->print();
01508     floating_heights->print();
01509   }
01510   if (blob_count == 0) return 0;
01511   int modes[MAX_HEIGHT_MODES];  // biggest piles
01512   bool in_best_pile = FALSE;
01513   int prev_size = -MAX_INT32;
01514   int best_count = 0;
01515   int mode_count = compute_height_modes(heights, min_height, max_height,
01516                                         modes, MAX_HEIGHT_MODES);
01517   if (cap_only && mode_count > 1)
01518     mode_count = 1;
01519   int x;
01520   if (textord_debug_xheights) {
01521     tprintf("found %d modes: ", mode_count);
01522     for (x = 0; x < mode_count; x++) tprintf("%d ", modes[x]);
01523     tprintf("\n");
01524   }
01525 
01526   for (x = 0; x < mode_count - 1; x++) {
01527     if (modes[x] != prev_size + 1)
01528       in_best_pile = FALSE;    // had empty height
01529     int modes_x_count = heights->pile_count(modes[x]) -
01530       floating_heights->pile_count(modes[x]);
01531     if ((modes_x_count >= blob_count * textord_xheight_mode_fraction) &&
01532         (in_best_pile || modes_x_count > best_count)) {
01533       for (int asc = x + 1; asc < mode_count; asc++) {
01534         float ratio =
01535           static_cast<float>(modes[asc]) / static_cast<float>(modes[x]);
01536         if (textord_ascx_ratio_min < ratio &&
01537             ratio < textord_ascx_ratio_max &&
01538             (heights->pile_count(modes[asc]) >=
01539              blob_count * textord_ascheight_mode_fraction)) {
01540           if (modes_x_count > best_count) {
01541             in_best_pile = true;
01542             best_count = modes_x_count;
01543           }
01544           if (textord_debug_xheights) {
01545             tprintf("X=%d, asc=%d, count=%d, ratio=%g\n",
01546                     modes[x], modes[asc]-modes[x], modes_x_count, ratio);
01547           }
01548           prev_size = modes[x];
01549           *xheight = static_cast<float>(modes[x]);
01550           *ascrise = static_cast<float>(modes[asc] - modes[x]);
01551         }
01552       }
01553     }
01554   }
01555   if (*xheight == 0) {  // single mode
01556     // Remove counts of the "floating" blobs (the one whose height is too
01557     // small in relation to it's top end of the bounding box) from heights
01558     // before computing the single-mode xheight.
01559     // Restore the counts in heights after the mode is found, since
01560     // floating blobs might be useful for determining potential ascenders
01561     // in compute_row_descdrop().
01562     if (floating_heights->get_total() > 0) {
01563       for (x = min_height; x < max_height; ++x) {
01564         heights->add(x, -(floating_heights->pile_count(x)));
01565       }
01566       blob_index = heights->mode();  // find the modified mode
01567       for (x = min_height; x < max_height; ++x) {
01568         heights->add(x, floating_heights->pile_count(x));
01569       }
01570     }
01571     *xheight = static_cast<float>(blob_index);
01572     *ascrise = 0.0f;
01573     best_count = heights->pile_count(blob_index);
01574     if (textord_debug_xheights)
01575       tprintf("Single mode xheight set to %g\n", *xheight);
01576   } else if (textord_debug_xheights) {
01577     tprintf("Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise);
01578   }
01579   return best_count;
01580 }
01581 
01594 inT32 compute_row_descdrop(TO_ROW *row, float gradient,
01595                            int xheight_blob_count, STATS *asc_heights) {
01596   // Count how many potential ascenders are in this row.
01597   int i_min = asc_heights->min_bucket();
01598   if ((i_min / row->xheight) < textord_ascx_ratio_min) {
01599     i_min = static_cast<int>(
01600         floor(row->xheight * textord_ascx_ratio_min + 0.5));
01601   }
01602   int i_max = asc_heights->max_bucket();
01603   if ((i_max / row->xheight) > textord_ascx_ratio_max) {
01604     i_max = static_cast<int>(floor(row->xheight * textord_ascx_ratio_max));
01605   }
01606   int num_potential_asc = 0;
01607   for (int i = i_min; i <= i_max; ++i) {
01608     num_potential_asc += asc_heights->pile_count(i);
01609   }
01610   inT32 min_height =
01611     static_cast<inT32>(floor(row->xheight * textord_descx_ratio_min + 0.5));
01612   inT32 max_height =
01613     static_cast<inT32>(floor(row->xheight * textord_descx_ratio_max));
01614   float xcentre;                 // centre of blob
01615   float height;                  // height of blob
01616   BLOBNBOX_IT blob_it = row->blob_list();
01617   BLOBNBOX *blob;                // current blob
01618   STATS heights (min_height, max_height + 1);
01619   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
01620     blob = blob_it.data();
01621     if (!blob->joined_to_prev()) {
01622       xcentre = (blob->bounding_box().left() +
01623                  blob->bounding_box().right()) / 2.0f;
01624       height = (gradient * xcentre + row->parallel_c() -
01625                 blob->bounding_box().bottom());
01626       if (height >= min_height && height <= max_height)
01627         heights.add(static_cast<int>(floor(height + 0.5)), 1);
01628     }
01629   }
01630   int blob_index = heights.mode();  // find mode
01631   int blob_count = heights.pile_count(blob_index);  // get count of mode
01632   float total_fraction =
01633     (textord_descheight_mode_fraction + textord_ascheight_mode_fraction);
01634   if (static_cast<float>(blob_count + num_potential_asc) <
01635       xheight_blob_count * total_fraction) {
01636     blob_count = 0;
01637   }
01638   int descdrop = blob_count > 0 ? -blob_index : 0;
01639   if (textord_debug_xheights) {
01640     tprintf("Descdrop: %d (potential ascenders %d, descenders %d)\n",
01641             descdrop, num_potential_asc, blob_count);
01642     heights.print();
01643   }
01644   return descdrop;
01645 }
01646 
01647 
01654 inT32 compute_height_modes(STATS *heights,    // stats to search
01655                            inT32 min_height,  // bottom of range
01656                            inT32 max_height,  // top of range
01657                            inT32 *modes,      // output array
01658                            inT32 maxmodes) {  // size of modes
01659   inT32 pile_count;              // no in source pile
01660   inT32 src_count;               // no of source entries
01661   inT32 src_index;               // current entry
01662   inT32 least_count;             // height of smalllest
01663   inT32 least_index;             // index of least
01664   inT32 dest_count;              // index in modes
01665 
01666   src_count = max_height + 1 - min_height;
01667   dest_count = 0;
01668   least_count = MAX_INT32;
01669   least_index = -1;
01670   for (src_index = 0; src_index < src_count; src_index++) {
01671     pile_count = heights->pile_count(min_height + src_index);
01672     if (pile_count > 0) {
01673       if (dest_count < maxmodes) {
01674         if (pile_count < least_count) {
01675           // find smallest in array
01676           least_count = pile_count;
01677           least_index = dest_count;
01678         }
01679         modes[dest_count++] = min_height + src_index;
01680       } else if (pile_count >= least_count) {
01681         while (least_index < maxmodes - 1) {
01682           modes[least_index] = modes[least_index + 1];
01683           // shuffle up
01684           least_index++;
01685         }
01686         // new one on end
01687         modes[maxmodes - 1] = min_height + src_index;
01688         if (pile_count == least_count) {
01689           // new smallest
01690           least_index = maxmodes - 1;
01691         } else {
01692           least_count = heights->pile_count(modes[0]);
01693           least_index = 0;
01694           for (dest_count = 1; dest_count < maxmodes; dest_count++) {
01695             pile_count = heights->pile_count(modes[dest_count]);
01696             if (pile_count < least_count) {
01697               // find smallest
01698               least_count = pile_count;
01699               least_index = dest_count;
01700             }
01701           }
01702         }
01703       }
01704     }
01705   }
01706   return dest_count;
01707 }
01708 
01709 
01716 void correct_row_xheight(TO_ROW *row, float xheight,
01717                          float ascrise, float descdrop) {
01718   ROW_CATEGORY row_category = get_row_category(row);
01719   if (textord_debug_xheights) {
01720     tprintf("correcting row xheight: row->xheight %.4f"
01721             ", row->acrise %.4f row->descdrop %.4f\n",
01722             row->xheight, row->ascrise, row->descdrop);
01723   }
01724   bool normal_xheight =
01725     within_error_margin(row->xheight, xheight, textord_xheight_error_margin);
01726   bool cap_xheight =
01727     within_error_margin(row->xheight, xheight + ascrise,
01728                         textord_xheight_error_margin);
01729   // Use the average xheight/ascrise for the following cases:
01730   // -- the xheight of the row could not be determined at all
01731   // -- the row has descenders (e.g. "many groups", "ISBN 12345 p.3")
01732   //    and its xheight is close to either cap height or average xheight
01733   // -- the row does not have ascenders or descenders, but its xheight
01734   //    is close to the average block xheight (e.g. row with "www.mmm.com")
01735   if (row_category == ROW_ASCENDERS_FOUND) {
01736     if (row->descdrop >= 0.0) {
01737       row->descdrop = row->xheight * (descdrop / xheight);
01738     }
01739   } else if (row_category == ROW_INVALID ||
01740              (row_category == ROW_DESCENDERS_FOUND &&
01741               (normal_xheight || cap_xheight)) ||
01742               (row_category == ROW_UNKNOWN && normal_xheight)) {
01743     if (textord_debug_xheights) tprintf("using average xheight\n");
01744     row->xheight = xheight;
01745     row->ascrise = ascrise;
01746     row->descdrop = descdrop;
01747   } else if (row_category == ROW_DESCENDERS_FOUND) {
01748     // Assume this is a row with mostly lowercase letters and it's xheight
01749     // is computed correctly (unfortunately there is no way to distinguish
01750     // this from the case when descenders are found, but the most common
01751     // height is capheight).
01752     if (textord_debug_xheights) tprintf("lowercase, corrected ascrise\n");
01753     row->ascrise = row->xheight * (ascrise / xheight);
01754   } else if (row_category == ROW_UNKNOWN) {
01755   // Otherwise assume this row is an all-caps or small-caps row
01756   // and adjust xheight and ascrise of the row.
01757 
01758     row->all_caps = true;
01759     if (cap_xheight) { // regular all caps
01760       if (textord_debug_xheights) tprintf("all caps\n");
01761       row->xheight = xheight;
01762       row->ascrise = ascrise;
01763       row->descdrop = descdrop;
01764     } else {  // small caps or caps with an odd xheight
01765       if (textord_debug_xheights) {
01766         if (row->xheight < xheight + ascrise && row->xheight > xheight) {
01767           tprintf("small caps\n");
01768         } else {
01769           tprintf("all caps with irregular xheight\n");
01770         }
01771       }
01772       row->ascrise = row->xheight * (ascrise / (xheight + ascrise));
01773       row->xheight -= row->ascrise;
01774       row->descdrop = row->xheight * (descdrop / xheight);
01775     }
01776   }
01777   if (textord_debug_xheights) {
01778     tprintf("corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop"
01779             " = %.4f\n", row->xheight, row->ascrise, row->descdrop);
01780   }
01781 }
01782 
01783 static int CountOverlaps(const TBOX& box, int min_height,
01784                          BLOBNBOX_LIST* blobs) {
01785   int overlaps = 0;
01786   BLOBNBOX_IT blob_it(blobs);
01787   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
01788     BLOBNBOX* blob = blob_it.data();
01789     TBOX blob_box = blob->bounding_box();
01790     if (blob_box.height() >= min_height && box.major_overlap(blob_box)) {
01791       ++overlaps;
01792     }
01793   }
01794   return overlaps;
01795 }
01796 
01803 void separate_underlines(TO_BLOCK *block,  // block to do
01804                          float gradient,   // skew angle
01805                          FCOORD rotation,  // inverse landscape
01806                          BOOL8 testing_on) {  // correct orientation
01807   BLOBNBOX *blob;                // current blob
01808   C_BLOB *rotated_blob;          // rotated blob
01809   TO_ROW *row;                   // current row
01810   float length;                  // of g_vec
01811   TBOX blob_box;
01812   FCOORD blob_rotation;          // inverse of rotation
01813   FCOORD g_vec;                  // skew rotation
01814   BLOBNBOX_IT blob_it;           // iterator
01815                                  // iterator
01816   BLOBNBOX_IT under_it = &block->underlines;
01817   BLOBNBOX_IT large_it = &block->large_blobs;
01818   TO_ROW_IT row_it = block->get_rows();
01819   int min_blob_height = static_cast<int>(textord_min_blob_height_fraction *
01820                                          block->line_size + 0.5);
01821 
01822                                  // length of vector
01823   length = sqrt(1 + gradient * gradient);
01824   g_vec = FCOORD(1 / length, -gradient / length);
01825   blob_rotation = FCOORD(rotation.x(), -rotation.y());
01826   blob_rotation.rotate(g_vec);  // undoing everything
01827   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01828     row = row_it.data();
01829                                  // get blobs
01830     blob_it.set_to_list(row->blob_list());
01831     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
01832          blob_it.forward()) {
01833       blob = blob_it.data();
01834       blob_box = blob->bounding_box();
01835       if (blob_box.width() > block->line_size * textord_underline_width) {
01836         ASSERT_HOST(blob->cblob() != NULL);
01837         rotated_blob = crotate_cblob (blob->cblob(),
01838           blob_rotation);
01839         if (test_underline(
01840             testing_on && textord_show_final_rows,
01841             rotated_blob, static_cast<inT16>(row->intercept()),
01842             static_cast<inT16>(
01843                 block->line_size *
01844                 (tesseract::CCStruct::kXHeightFraction +
01845                  tesseract::CCStruct::kAscenderFraction / 2.0f)))) {
01846           under_it.add_after_then_move(blob_it.extract());
01847           if (testing_on && textord_show_final_rows) {
01848             tprintf("Underlined blob at:");
01849               rotated_blob->bounding_box().print();
01850             tprintf("Was:");
01851               blob_box.print();
01852           }
01853         } else if (CountOverlaps(blob->bounding_box(), min_blob_height,
01854                                  row->blob_list()) >
01855                    textord_max_blob_overlaps) {
01856           large_it.add_after_then_move(blob_it.extract());
01857           if (testing_on && textord_show_final_rows) {
01858             tprintf("Large blob overlaps %d blobs at:",
01859                     CountOverlaps(blob_box, min_blob_height,
01860                                   row->blob_list()));
01861             blob_box.print();
01862           }
01863         }
01864         delete rotated_blob;
01865       }
01866     }
01867   }
01868 }
01869 
01870 
01876 void pre_associate_blobs(                  //make rough chars
01877                          ICOORD page_tr,   //top right
01878                          TO_BLOCK *block,  //block to do
01879                          FCOORD rotation,  //inverse landscape
01880                          BOOL8 testing_on  //correct orientation
01881                         ) {
01882 #ifndef GRAPHICS_DISABLED
01883   ScrollView::Color colour;                 //of boxes
01884 #endif
01885   BLOBNBOX *blob;                //current blob
01886   BLOBNBOX *nextblob;            //next in list
01887   TBOX blob_box;
01888   FCOORD blob_rotation;          //inverse of rotation
01889   BLOBNBOX_IT blob_it;           //iterator
01890   BLOBNBOX_IT start_it;          //iterator
01891   TO_ROW_IT row_it = block->get_rows ();
01892 
01893 #ifndef GRAPHICS_DISABLED
01894   colour = ScrollView::RED;
01895 #endif
01896 
01897   blob_rotation = FCOORD (rotation.x (), -rotation.y ());
01898   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01899                                  //get blobs
01900     blob_it.set_to_list (row_it.data ()->blob_list ());
01901     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
01902     blob_it.forward ()) {
01903       blob = blob_it.data ();
01904       blob_box = blob->bounding_box ();
01905       start_it = blob_it;        //save start point
01906       //                      if (testing_on && textord_show_final_blobs)
01907       //                      {
01908       //                              tprintf("Blob at (%d,%d)->(%d,%d), addr=%x, count=%d\n",
01909       //                                      blob_box.left(),blob_box.bottom(),
01910       //                                      blob_box.right(),blob_box.top(),
01911       //                                      (void*)blob,blob_it.length());
01912       //                      }
01913       bool overlap;
01914       do {
01915         overlap = false;
01916         if (!blob_it.at_last ()) {
01917           nextblob = blob_it.data_relative(1);
01918           overlap = blob_box.major_x_overlap(nextblob->bounding_box());
01919           if (overlap) {
01920             blob->merge(nextblob); // merge new blob
01921             blob_box = blob->bounding_box(); // get bigger box
01922             blob_it.forward();
01923           }
01924         }
01925       }
01926       while (overlap);
01927       blob->chop (&start_it, &blob_it,
01928         blob_rotation,
01929         block->line_size * tesseract::CCStruct::kXHeightFraction *
01930         textord_chop_width);
01931       //attempt chop
01932     }
01933 #ifndef GRAPHICS_DISABLED
01934     if (testing_on && textord_show_final_blobs) {
01935       if (to_win == NULL)
01936         create_to_win(page_tr);
01937       to_win->Pen(colour);
01938       for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
01939       blob_it.forward ()) {
01940         blob = blob_it.data ();
01941         blob_box = blob->bounding_box ();
01942         blob_box.rotate (rotation);
01943         if (!blob->joined_to_prev ()) {
01944           to_win->Rectangle (blob_box.left (), blob_box.bottom (),
01945             blob_box.right (), blob_box.top ());
01946         }
01947       }
01948       colour = (ScrollView::Color) (colour + 1);
01949       if (colour > ScrollView::MAGENTA)
01950         colour = ScrollView::RED;
01951     }
01952 #endif
01953   }
01954 }
01955 
01956 
01962 void fit_parallel_rows(                   //find lines
01963                        TO_BLOCK *block,   //block to do
01964                        float gradient,    //gradient to fit
01965                        FCOORD rotation,   //for drawing
01966                        inT32 block_edge,  //edge of block
01967                        BOOL8 testing_on   //correct orientation
01968                       ) {
01969 #ifndef GRAPHICS_DISABLED
01970   ScrollView::Color colour;                 //of row
01971 #endif
01972   TO_ROW_IT row_it = block->get_rows ();
01973 
01974   row_it.move_to_first ();
01975   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01976     if (row_it.data ()->blob_list ()->empty ())
01977       delete row_it.extract ();  //nothing in it
01978     else
01979       fit_parallel_lms (gradient, row_it.data ());
01980   }
01981 #ifndef GRAPHICS_DISABLED
01982   if (testing_on) {
01983     colour = ScrollView::RED;
01984     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01985       plot_parallel_row (row_it.data (), gradient,
01986         block_edge, colour, rotation);
01987       colour = (ScrollView::Color) (colour + 1);
01988       if (colour > ScrollView::MAGENTA)
01989         colour = ScrollView::RED;
01990     }
01991   }
01992 #endif
01993   row_it.sort (row_y_order);     //may have gone out of order
01994 }
01995 
01996 
02004 void fit_parallel_lms(float gradient, TO_ROW *row) {
02005   float c;                       // fitted line
02006   int blobcount;                 // no of blobs
02007    tesseract::DetLineFit lms;
02008   BLOBNBOX_IT blob_it = row->blob_list();
02009 
02010   blobcount = 0;
02011   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
02012     if (!blob_it.data()->joined_to_prev()) {
02013       const TBOX& box = blob_it.data()->bounding_box();
02014       lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
02015       blobcount++;
02016     }
02017   }
02018   double error = lms.ConstrainedFit(gradient, &c);
02019   row->set_parallel_line(gradient, c, error);
02020   if (textord_straight_baselines && blobcount > textord_lms_line_trials) {
02021     error = lms.Fit(&gradient, &c);
02022   }
02023                                  //set the other too
02024   row->set_line(gradient, c, error);
02025 }
02026 
02027 
02033 namespace tesseract {
02034 void Textord::make_spline_rows(TO_BLOCK *block,   // block to do
02035                                float gradient,    // gradient to fit
02036                                BOOL8 testing_on) {
02037 #ifndef GRAPHICS_DISABLED
02038   ScrollView::Color colour;       //of row
02039 #endif
02040   TO_ROW_IT row_it = block->get_rows ();
02041 
02042   row_it.move_to_first ();
02043   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
02044     if (row_it.data ()->blob_list ()->empty ())
02045       delete row_it.extract ();  //nothing in it
02046     else
02047       make_baseline_spline (row_it.data (), block);
02048   }
02049   if (textord_old_baselines) {
02050 #ifndef GRAPHICS_DISABLED
02051     if (testing_on) {
02052       colour = ScrollView::RED;
02053       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
02054       row_it.forward ()) {
02055         row_it.data ()->baseline.plot (to_win, colour);
02056         colour = (ScrollView::Color) (colour + 1);
02057         if (colour > ScrollView::MAGENTA)
02058           colour = ScrollView::RED;
02059       }
02060     }
02061 #endif
02062     make_old_baselines(block, testing_on, gradient);
02063   }
02064 #ifndef GRAPHICS_DISABLED
02065   if (testing_on) {
02066     colour = ScrollView::RED;
02067     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
02068       row_it.data ()->baseline.plot (to_win, colour);
02069       colour = (ScrollView::Color) (colour + 1);
02070       if (colour > ScrollView::MAGENTA)
02071         colour = ScrollView::RED;
02072     }
02073   }
02074 #endif
02075 }
02076 
02077 }  // namespace tesseract.
02078 
02079 
02087 void make_baseline_spline(TO_ROW *row,     //row to fit
02088                           TO_BLOCK *block) {
02089   inT32 *xstarts;                // spline boundaries
02090   double *coeffs;                // quadratic coeffs
02091   inT32 segments;                // no of segments
02092 
02093   xstarts =
02094     (inT32 *) alloc_mem((row->blob_list()->length() + 1) * sizeof(inT32));
02095   if (segment_baseline(row, block, segments, xstarts)
02096   && !textord_straight_baselines && !textord_parallel_baselines) {
02097     coeffs = linear_spline_baseline(row, block, segments, xstarts);
02098   } else {
02099     xstarts[1] = xstarts[segments];
02100     segments = 1;
02101     coeffs = (double *) alloc_mem (3 * sizeof (double));
02102     coeffs[0] = 0;
02103     coeffs[1] = row->line_m ();
02104     coeffs[2] = row->line_c ();
02105   }
02106   row->baseline = QSPLINE (segments, xstarts, coeffs);
02107   free_mem(coeffs);
02108   free_mem(xstarts);
02109 }
02110 
02111 
02119 BOOL8
02120 segment_baseline (               //split baseline
02121 TO_ROW * row,                    //row to fit
02122 TO_BLOCK * block,                //block it came from
02123 inT32 & segments,                //no fo segments
02124 inT32 xstarts[]                  //coords of segments
02125 ) {
02126   BOOL8 needs_curve;             //needs curved line
02127   int blobcount;                 //no of blobs
02128   int blobindex;                 //current blob
02129   int last_state;                //above, on , below
02130   int state;                     //of current blob
02131   float yshift;                  //from baseline
02132   TBOX box;                       //blob box
02133   TBOX new_box;                   //new_it box
02134   float middle;                  //xcentre of blob
02135                                  //blobs
02136   BLOBNBOX_IT blob_it = row->blob_list ();
02137   BLOBNBOX_IT new_it = blob_it;  //front end
02138   SORTED_FLOATS yshifts;         //shifts from baseline
02139 
02140   needs_curve = FALSE;
02141   box = box_next_pre_chopped (&blob_it);
02142   xstarts[0] = box.left ();
02143   segments = 1;
02144   blobcount = row->blob_list ()->length ();
02145   if (textord_oldbl_debug)
02146     tprintf ("Segmenting baseline of %d blobs at (%d,%d)\n",
02147       blobcount, box.left (), box.bottom ());
02148   if (blobcount <= textord_spline_medianwin
02149   || blobcount < textord_spline_minblobs) {
02150     blob_it.move_to_last ();
02151     box = blob_it.data ()->bounding_box ();
02152     xstarts[1] = box.right ();
02153     return FALSE;
02154   }
02155   last_state = 0;
02156   new_it.mark_cycle_pt ();
02157   for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
02158     new_box = box_next_pre_chopped (&new_it);
02159     middle = (new_box.left () + new_box.right ()) / 2.0;
02160     yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
02161                                  //record shift
02162     yshifts.add (yshift, blobindex);
02163     if (new_it.cycled_list ()) {
02164       xstarts[1] = new_box.right ();
02165       return FALSE;
02166     }
02167   }
02168   for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++)
02169     box = box_next_pre_chopped (&blob_it);
02170   do {
02171     new_box = box_next_pre_chopped (&new_it);
02172                                  //get middle one
02173     yshift = yshifts[textord_spline_medianwin / 2];
02174     if (yshift > textord_spline_shift_fraction * block->line_size)
02175       state = 1;
02176     else if (-yshift > textord_spline_shift_fraction * block->line_size)
02177       state = -1;
02178     else
02179       state = 0;
02180     if (state != 0)
02181       needs_curve = TRUE;
02182     //              tprintf("State=%d, prev=%d, shift=%g\n",
02183     //                      state,last_state,yshift);
02184     if (state != last_state && blobcount > textord_spline_minblobs) {
02185       xstarts[segments++] = box.left ();
02186       blobcount = 0;
02187     }
02188     last_state = state;
02189     yshifts.remove (blobindex - textord_spline_medianwin);
02190     box = box_next_pre_chopped (&blob_it);
02191     middle = (new_box.left () + new_box.right ()) / 2.0;
02192     yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
02193     yshifts.add (yshift, blobindex);
02194     blobindex++;
02195     blobcount++;
02196   }
02197   while (!new_it.cycled_list ());
02198   if (blobcount > textord_spline_minblobs || segments == 1) {
02199     xstarts[segments] = new_box.right ();
02200   }
02201   else {
02202     xstarts[--segments] = new_box.right ();
02203   }
02204   if (textord_oldbl_debug)
02205     tprintf ("Made %d segments on row at (%d,%d)\n",
02206       segments, box.right (), box.bottom ());
02207   return needs_curve;
02208 }
02209 
02210 
02218 double *
02219 linear_spline_baseline (         //split baseline
02220 TO_ROW * row,                    //row to fit
02221 TO_BLOCK * block,                //block it came from
02222 inT32 & segments,                //no fo segments
02223 inT32 xstarts[]                  //coords of segments
02224 ) {
02225   int blobcount;                 //no of blobs
02226   int blobindex;                 //current blob
02227   int index1, index2;            //blob numbers
02228   int blobs_per_segment;         //blobs in each
02229   TBOX box;                       //blob box
02230   TBOX new_box;                   //new_it box
02231                                  //blobs
02232   BLOBNBOX_IT blob_it = row->blob_list ();
02233   BLOBNBOX_IT new_it = blob_it;  //front end
02234   float b, c;                    //fitted curve
02235   tesseract::DetLineFit lms;
02236   double *coeffs;                //quadratic coeffs
02237   inT32 segment;                 //current segment
02238 
02239   box = box_next_pre_chopped (&blob_it);
02240   xstarts[0] = box.left ();
02241   blobcount = 1;
02242   while (!blob_it.at_first ()) {
02243     blobcount++;
02244     box = box_next_pre_chopped (&blob_it);
02245   }
02246   segments = blobcount / textord_spline_medianwin;
02247   if (segments < 1)
02248     segments = 1;
02249   blobs_per_segment = blobcount / segments;
02250   coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
02251   if (textord_oldbl_debug)
02252     tprintf
02253       ("Linear splining baseline of %d blobs at (%d,%d), into %d segments of %d blobs\n",
02254       blobcount, box.left (), box.bottom (), segments, blobs_per_segment);
02255   segment = 1;
02256   for (index2 = 0; index2 < blobs_per_segment / 2; index2++)
02257     box_next_pre_chopped(&new_it);
02258   index1 = 0;
02259   blobindex = index2;
02260   do {
02261     blobindex += blobs_per_segment;
02262     lms.Clear();
02263     while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
02264       box = box_next_pre_chopped (&blob_it);
02265       int middle = (box.left() + box.right()) / 2;
02266       lms.Add(ICOORD(middle, box.bottom()));
02267       index1++;
02268       if (index1 == blobindex - blobs_per_segment / 2
02269       || index1 == blobcount - 1) {
02270         xstarts[segment] = box.left ();
02271       }
02272     }
02273     lms.Fit(&b, &c);
02274     coeffs[segment * 3 - 3] = 0;
02275     coeffs[segment * 3 - 2] = b;
02276     coeffs[segment * 3 - 1] = c;
02277     segment++;
02278     if (segment > segments)
02279       break;
02280 
02281     blobindex += blobs_per_segment;
02282     lms.Clear();
02283     while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
02284       new_box = box_next_pre_chopped (&new_it);
02285       int middle = (new_box.left() + new_box.right()) / 2;
02286       lms.Add(ICOORD (middle, new_box.bottom()));
02287       index2++;
02288       if (index2 == blobindex - blobs_per_segment / 2
02289       || index2 == blobcount - 1) {
02290         xstarts[segment] = new_box.left ();
02291       }
02292     }
02293     lms.Fit(&b, &c);
02294     coeffs[segment * 3 - 3] = 0;
02295     coeffs[segment * 3 - 2] = b;
02296     coeffs[segment * 3 - 1] = c;
02297     segment++;
02298   }
02299   while (segment <= segments);
02300   return coeffs;
02301 }
02302 
02303 
02310 void assign_blobs_to_rows(                      //find lines
02311                           TO_BLOCK *block,      //block to do
02312                           float *gradient,      //block skew
02313                           int pass,             //identification
02314                           BOOL8 reject_misses,  //chuck big ones out
02315                           BOOL8 make_new_rows,  //add rows for unmatched
02316                           BOOL8 drawing_skew    //draw smoothed skew
02317                          ) {
02318   OVERLAP_STATE overlap_result;  //what to do with it
02319   float ycoord;                  //current y
02320   float top, bottom;             //of blob
02321   float g_length = 1.0f;         //from gradient
02322   inT16 row_count;               //no of rows
02323   inT16 left_x;                  //left edge
02324   inT16 last_x;                  //previous edge
02325   float block_skew;              //y delta
02326   float smooth_factor;           //for new coords
02327   float near_dist;               //dist to nearest row
02328   ICOORD testpt;                 //testing only
02329   BLOBNBOX *blob;                //current blob
02330   TO_ROW *row;                   //current row
02331   TO_ROW *dest_row = NULL;       //row to put blob in
02332                                  //iterators
02333   BLOBNBOX_IT blob_it = &block->blobs;
02334   TO_ROW_IT row_it = block->get_rows ();
02335 
02336   ycoord =
02337     (block->block->bounding_box ().bottom () +
02338     block->block->bounding_box ().top ()) / 2.0f;
02339   if (gradient != NULL)
02340     g_length = sqrt (1 + *gradient * *gradient);
02341 #ifndef GRAPHICS_DISABLED
02342   if (drawing_skew)
02343     to_win->SetCursor(block->block->bounding_box ().left (), ycoord);
02344 #endif
02345   testpt = ICOORD (textord_test_x, textord_test_y);
02346   blob_it.sort (blob_x_order);
02347   smooth_factor = 1.0;
02348   block_skew = 0.0f;
02349   row_count = row_it.length ();  //might have rows
02350   if (!blob_it.empty ()) {
02351     left_x = blob_it.data ()->bounding_box ().left ();
02352   }
02353   else {
02354     left_x = block->block->bounding_box ().left ();
02355   }
02356   last_x = left_x;
02357   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
02358     blob = blob_it.data ();
02359     if (gradient != NULL) {
02360       block_skew = (1 - 1 / g_length) * blob->bounding_box ().bottom ()
02361         + *gradient / g_length * blob->bounding_box ().left ();
02362     }
02363     else if (blob->bounding_box ().left () - last_x > block->line_size / 2
02364       && last_x - left_x > block->line_size * 2
02365     && textord_interpolating_skew) {
02366       //                      tprintf("Interpolating skew from %g",block_skew);
02367       block_skew *= (float) (blob->bounding_box ().left () - left_x)
02368         / (last_x - left_x);
02369       //                      tprintf("to %g\n",block_skew);
02370     }
02371     last_x = blob->bounding_box ().left ();
02372     top = blob->bounding_box ().top () - block_skew;
02373     bottom = blob->bounding_box ().bottom () - block_skew;
02374 #ifndef GRAPHICS_DISABLED
02375     if (drawing_skew)
02376       to_win->DrawTo(blob->bounding_box ().left (), ycoord + block_skew);
02377 #endif
02378     if (!row_it.empty ()) {
02379       for (row_it.move_to_first ();
02380         !row_it.at_last () && row_it.data ()->min_y () > top;
02381         row_it.forward ());
02382       row = row_it.data ();
02383       if (row->min_y () <= top && row->max_y () >= bottom) {
02384       //any overlap
02385         dest_row = row;
02386         overlap_result = most_overlapping_row (&row_it, dest_row,
02387           top, bottom,
02388           block->line_size,
02389           blob->bounding_box ().
02390           contains (testpt));
02391         if (overlap_result == NEW_ROW && !reject_misses)
02392           overlap_result = ASSIGN;
02393       }
02394       else {
02395         overlap_result = NEW_ROW;
02396         if (!make_new_rows) {
02397           near_dist = row_it.data_relative (-1)->min_y () - top;
02398                                  //below bottom
02399           if (bottom < row->min_y ()) {
02400             if (row->min_y () - bottom <=
02401               (block->line_spacing -
02402             block->line_size) * tesseract::CCStruct::kDescenderFraction) {
02403                                  //done it
02404               overlap_result = ASSIGN;
02405               dest_row = row;
02406             }
02407           }
02408           else if (near_dist > 0
02409           && near_dist < bottom - row->max_y ()) {
02410             row_it.backward ();
02411             dest_row = row_it.data ();
02412             if (dest_row->min_y () - bottom <=
02413               (block->line_spacing -
02414             block->line_size) * tesseract::CCStruct::kDescenderFraction) {
02415                                  //done it
02416               overlap_result = ASSIGN;
02417             }
02418           }
02419           else {
02420             if (top - row->max_y () <=
02421               (block->line_spacing -
02422               block->line_size) * (textord_overlap_x +
02423             tesseract::CCStruct::kAscenderFraction)) {
02424                                  //done it
02425               overlap_result = ASSIGN;
02426               dest_row = row;
02427             }
02428           }
02429         }
02430       }
02431       if (overlap_result == ASSIGN)
02432         dest_row->add_blob (blob_it.extract (), top, bottom,
02433           block->line_size);
02434       if (overlap_result == NEW_ROW) {
02435         if (make_new_rows && top - bottom < block->max_blob_size) {
02436           dest_row =
02437             new TO_ROW (blob_it.extract (), top, bottom,
02438             block->line_size);
02439           row_count++;
02440           if (bottom > row_it.data ()->min_y ())
02441             row_it.add_before_then_move (dest_row);
02442           //insert in right place
02443           else
02444             row_it.add_after_then_move (dest_row);
02445           smooth_factor =
02446             1.0 / (row_count * textord_skew_lag +
02447             textord_skewsmooth_offset);
02448         }
02449         else
02450           overlap_result = REJECT;
02451       }
02452     }
02453     else if (make_new_rows && top - bottom < block->max_blob_size) {
02454       overlap_result = NEW_ROW;
02455       dest_row =
02456         new TO_ROW(blob_it.extract(), top, bottom, block->line_size);
02457       row_count++;
02458       row_it.add_after_then_move(dest_row);
02459       smooth_factor = 1.0 / (row_count * textord_skew_lag +
02460                              textord_skewsmooth_offset2);
02461     }
02462     else
02463       overlap_result = REJECT;
02464     if (blob->bounding_box ().contains(testpt) && textord_debug_blob) {
02465       if (overlap_result != REJECT) {
02466         tprintf("Test blob assigned to row at (%g,%g) on pass %d\n",
02467           dest_row->min_y(), dest_row->max_y(), pass);
02468       }
02469       else {
02470         tprintf("Test blob assigned to no row on pass %d\n", pass);
02471       }
02472     }
02473     if (overlap_result != REJECT) {
02474       while (!row_it.at_first() &&
02475              row_it.data()->min_y() > row_it.data_relative(-1)->min_y()) {
02476         row = row_it.extract();
02477         row_it.backward();
02478         row_it.add_before_then_move(row);
02479       }
02480       while (!row_it.at_last() &&
02481              row_it.data ()->min_y() < row_it.data_relative (1)->min_y()) {
02482         row = row_it.extract();
02483         row_it.forward();
02484                                  // Keep rows in order.
02485         row_it.add_after_then_move(row);
02486       }
02487       BLOBNBOX_IT added_blob_it(dest_row->blob_list());
02488       added_blob_it.move_to_last();
02489       TBOX prev_box = added_blob_it.data_relative(-1)->bounding_box();
02490       if (dest_row->blob_list()->singleton() ||
02491           !prev_box.major_x_overlap(blob->bounding_box())) {
02492         block_skew = (1 - smooth_factor) * block_skew
02493             + smooth_factor * (blob->bounding_box().bottom() -
02494             dest_row->initial_min_y());
02495       }
02496     }
02497   }
02498   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
02499     if (row_it.data()->blob_list()->empty())
02500       delete row_it.extract();  // Discard empty rows.
02501   }
02502 }
02503 
02504 
02510 OVERLAP_STATE most_overlapping_row(                    //find best row
02511                                    TO_ROW_IT *row_it,  //iterator
02512                                    TO_ROW *&best_row,  //output row
02513                                    float top,          //top of blob
02514                                    float bottom,       //bottom of blob
02515                                    float rowsize,      //max row size
02516                                    BOOL8 testing_blob  //test stuff
02517                                   ) {
02518   OVERLAP_STATE result;          //result of tests
02519   float overlap;                 //of blob & row
02520   float bestover;                //nearest row
02521   float merge_top, merge_bottom; //size of merged row
02522   ICOORD testpt;                 //testing only
02523   TO_ROW *row;                   //current row
02524   TO_ROW *test_row;              //for multiple overlaps
02525   BLOBNBOX_IT blob_it;           //for merging rows
02526 
02527   result = ASSIGN;
02528   row = row_it->data ();
02529   bestover = top - bottom;
02530   if (top > row->max_y ())
02531     bestover -= top - row->max_y ();
02532   if (bottom < row->min_y ())
02533                                  //compute overlap
02534     bestover -= row->min_y () - bottom;
02535   if (testing_blob && textord_debug_blob) {
02536     tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f\n",
02537             bottom, top, row->min_y(), row->max_y(), rowsize, bestover);
02538   }
02539   test_row = row;
02540   do {
02541     if (!row_it->at_last ()) {
02542       row_it->forward ();
02543       test_row = row_it->data ();
02544       if (test_row->min_y () <= top && test_row->max_y () >= bottom) {
02545         merge_top =
02546           test_row->max_y () >
02547           row->max_y ()? test_row->max_y () : row->max_y ();
02548         merge_bottom =
02549           test_row->min_y () <
02550           row->min_y ()? test_row->min_y () : row->min_y ();
02551         if (merge_top - merge_bottom <= rowsize) {
02552           if (testing_blob) {
02553             tprintf ("Merging rows at (%g,%g), (%g,%g)\n",
02554               row->min_y (), row->max_y (),
02555               test_row->min_y (), test_row->max_y ());
02556           }
02557           test_row->set_limits (merge_bottom, merge_top);
02558           blob_it.set_to_list (test_row->blob_list ());
02559           blob_it.add_list_after (row->blob_list ());
02560           blob_it.sort (blob_x_order);
02561           row_it->backward ();
02562           delete row_it->extract ();
02563           row_it->forward ();
02564           bestover = -1.0f;      //force replacement
02565         }
02566         overlap = top - bottom;
02567         if (top > test_row->max_y ())
02568           overlap -= top - test_row->max_y ();
02569         if (bottom < test_row->min_y ())
02570           overlap -= test_row->min_y () - bottom;
02571         if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
02572           result = REJECT;
02573         }
02574         if (overlap > bestover) {
02575           bestover = overlap;    //find biggest overlap
02576           row = test_row;
02577         }
02578         if (testing_blob && textord_debug_blob) {
02579           tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f->%f\n",
02580                   bottom, top, test_row->min_y(), test_row->max_y(),
02581                   rowsize, overlap, bestover);
02582         }
02583       }
02584     }
02585   }
02586   while (!row_it->at_last ()
02587     && test_row->min_y () <= top && test_row->max_y () >= bottom);
02588   while (row_it->data () != row)
02589     row_it->backward ();         //make it point to row
02590                                  //doesn't overlap much
02591   if (top - bottom - bestover > rowsize * textord_overlap_x &&
02592       (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x)
02593     && result == ASSIGN)
02594     result = NEW_ROW;            //doesn't overlap enough
02595   best_row = row;
02596   return result;
02597 }
02598 
02599 
02605 int blob_x_order(                    //sort function
02606                  const void *item1,  //items to compare
02607                  const void *item2) {
02608                                  //converted ptr
02609   BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
02610                                  //converted ptr
02611   BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
02612 
02613   if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
02614     return -1;
02615   else if (blob1->bounding_box ().left () > blob2->bounding_box ().left ())
02616     return 1;
02617   else
02618     return 0;
02619 }
02620 
02621 
02627 int row_y_order(                    //sort function
02628                 const void *item1,  //items to compare
02629                 const void *item2) {
02630                                  //converted ptr
02631   TO_ROW *row1 = *(TO_ROW **) item1;
02632                                  //converted ptr
02633   TO_ROW *row2 = *(TO_ROW **) item2;
02634 
02635   if (row1->parallel_c () > row2->parallel_c ())
02636     return -1;
02637   else if (row1->parallel_c () < row2->parallel_c ())
02638     return 1;
02639   else
02640     return 0;
02641 }
02642 
02643 
02649 int row_spacing_order(                    //sort function
02650                       const void *item1,  //items to compare
02651                       const void *item2) {
02652                                  //converted ptr
02653   TO_ROW *row1 = *(TO_ROW **) item1;
02654                                  //converted ptr
02655   TO_ROW *row2 = *(TO_ROW **) item2;
02656 
02657   if (row1->spacing < row2->spacing)
02658     return -1;
02659   else if (row1->spacing > row2->spacing)
02660     return 1;
02661   else
02662     return 0;
02663 }
02664 
02671 void mark_repeated_chars(TO_ROW *row) {
02672   BLOBNBOX_IT box_it(row->blob_list());            // Iterator.
02673   int num_repeated_sets = 0;
02674   if (!box_it.empty()) {
02675     do {
02676       BLOBNBOX* bblob = box_it.data();
02677       int repeat_length = 1;
02678       if (bblob->flow() == BTFT_LEADER &&
02679           !bblob->joined_to_prev() && bblob->cblob() != NULL) {
02680         BLOBNBOX_IT test_it(box_it);
02681         for (test_it.forward(); !test_it.at_first();) {
02682           bblob = test_it.data();
02683           if (bblob->flow() != BTFT_LEADER)
02684             break;
02685           test_it.forward();
02686           bblob = test_it.data();
02687           if (bblob->joined_to_prev() || bblob->cblob() == NULL) {
02688             repeat_length = 0;
02689             break;
02690           }
02691           ++repeat_length;
02692         }
02693       }
02694       if (repeat_length >= kMinLeaderCount) {
02695         num_repeated_sets++;
02696         for (; repeat_length > 0; box_it.forward(), --repeat_length) {
02697           bblob = box_it.data();
02698           bblob->set_repeated_set(num_repeated_sets);
02699         }
02700      } else {
02701         bblob->set_repeated_set(0);
02702         box_it.forward();
02703       }
02704     } while (!box_it.at_first());  // until all done
02705   }
02706   row->set_num_repeated_sets(num_repeated_sets);
02707 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines