tesseract 3.04.01

ccmain/fixxht.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        fixxht.cpp  (Formerly fixxht.c)
00003  * Description: Improve x_ht and look out for case inconsistencies
00004  * Author:              Phil Cheatle
00005  * Created:             Thu Aug  5 14:11:08 BST 1993
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include          <string.h>
00021 #include          <ctype.h>
00022 #include          "params.h"
00023 #include          "float2int.h"
00024 #include          "tesseractclass.h"
00025 
00026 namespace tesseract {
00027 
00028 // Fixxht overview.
00029 // Premise: Initial estimate of x-height is adequate most of the time, but
00030 // occasionally it is incorrect. Most notable causes of failure are:
00031 // 1. Small caps, where the top of the caps is the same as the body text
00032 // xheight. For small caps words the xheight needs to be reduced to correctly
00033 // recognize the caps in the small caps word.
00034 // 2. All xheight lines, such as summer. Here the initial estimate will have
00035 // guessed that the blob tops are caps and will have placed the xheight too low.
00036 // 3. Noise/logos beside words, or changes in font size on a line. Such
00037 // things can blow the statistics and cause an incorrect estimate.
00038 // 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
00039 // In this case the x-height is often still correct.
00040 //
00041 // Algorithm.
00042 // Compare the vertical position (top only) of alphanumerics in a word with
00043 // the range of positions in training data (in the unicharset).
00044 // See CountMisfitTops. If any characters disagree sufficiently with the
00045 // initial xheight estimate, then recalculate the xheight, re-run OCR on
00046 // the word, and if the number of vertical misfits goes down, along with
00047 // either the word rating or certainty, then keep the new xheight.
00048 // The new xheight is calculated as follows (see ComputeCompatibleXheight):
00049 // For each alphanumeric character that has a vertically misplaced top
00050 // (a misfit), yet its bottom is within the acceptable range (ie it is not
00051 // likely a sub-or super-script) calculate the range of acceptable xheight
00052 // positions from its range of tops, and give each value in the range a
00053 // number of votes equal to the distance of its top from its acceptance range.
00054 // The x-height position with the median of the votes becomes the new
00055 // x-height. This assumes that most characters will be correctly recognized
00056 // even if the x-height is incorrect. This is not a terrible assumption, but
00057 // it is not great. An improvement would be to use a classifier that does
00058 // not care about vertical position or scaling at all.
00059 // Separately collect stats on shifted baselines and apply the same logic to
00060 // computing a best-fit shift to fix the error. If the baseline needs to be
00061 // shifted, but the x-height is OK, returns the original x-height along with
00062 // the baseline shift to indicate that recognition needs to re-run.
00063 
00064 // Widest spread (max_top - min_top) of a unicharset char's top that is still
00065 // trusted; wider chars are skipped when counting misfits or voting on a new top.
00066 const int kMaxCharTopRange = 48;
00067 
00068 // Returns the number of misfit blob tops in this word.
00069 int Tesseract::CountMisfitTops(WERD_RES *word_res) {
00070   int bad_blobs = 0;
00071   int num_blobs = word_res->rebuild_word->NumBlobs();
00072   for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
00073     TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
00074     UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
00075     if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
00076       int top = blob->bounding_box().top();
00077       if (top >= INT_FEAT_RANGE)
00078         top = INT_FEAT_RANGE - 1;
00079       int min_bottom, max_bottom, min_top, max_top;
00080       unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
00081                                 &min_top, &max_top);
00082       if (max_top - min_top > kMaxCharTopRange)
00083         continue;
00084       bool bad =  top < min_top - x_ht_acceptance_tolerance ||
00085                   top > max_top + x_ht_acceptance_tolerance;
00086       if (bad)
00087         ++bad_blobs;
00088       if (debug_x_ht_level >= 1) {
00089         tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
00090                 unicharset.id_to_unichar(class_id),
00091                 bad ? "Misfit" : "OK", top, min_top, max_top,
00092                 static_cast<int>(x_ht_acceptance_tolerance));
00093       }
00094     }
00095   }
00096   return bad_blobs;
00097 }
00098 
00099 // Returns a new x-height maximally compatible with the result in word_res.
00100 // See comment above for overall algorithm.
00101 float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
00102                                           float* baseline_shift) {
00103   STATS top_stats(0, MAX_UINT8);
00104   STATS shift_stats(-MAX_UINT8, MAX_UINT8);
00105   int bottom_shift = 0;
00106   int num_blobs = word_res->rebuild_word->NumBlobs();
00107   do {
00108     top_stats.clear();
00109     shift_stats.clear();
00110     for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
00111       TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
00112       UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
00113       if (unicharset.get_isalpha(class_id) ||
00114           unicharset.get_isdigit(class_id)) {
00115         int top = blob->bounding_box().top() + bottom_shift;
00116         // Clip the top to the limit of normalized feature space.
00117         if (top >= INT_FEAT_RANGE)
00118           top = INT_FEAT_RANGE - 1;
00119         int bottom = blob->bounding_box().bottom() + bottom_shift;
00120         int min_bottom, max_bottom, min_top, max_top;
00121         unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
00122                                   &min_top, &max_top);
00123         // Chars with a wild top range would mess up the result so ignore them.
00124         if (max_top - min_top > kMaxCharTopRange)
00125           continue;
00126         int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
00127                             top - (max_top + x_ht_acceptance_tolerance));
00128         int height = top - kBlnBaselineOffset;
00129         if (debug_x_ht_level >= 2) {
00130           tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
00131                   unicharset.id_to_unichar(class_id),
00132                   height, min_bottom, max_bottom, min_top, max_top,
00133                   bottom, top);
00134         }
00135         // Use only chars that fit in the expected bottom range, and where
00136         // the range of tops is sensibly near the xheight.
00137         if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
00138             bottom - x_ht_acceptance_tolerance <= max_bottom &&
00139             min_top > kBlnBaselineOffset &&
00140             max_top - kBlnBaselineOffset >= kBlnXHeight &&
00141             misfit_dist > 0) {
00142           // Compute the x-height position using proportionality between the
00143           // actual height and expected height.
00144           int min_xht = DivRounded(height * kBlnXHeight,
00145                                    max_top - kBlnBaselineOffset);
00146           int max_xht = DivRounded(height * kBlnXHeight,
00147                                    min_top - kBlnBaselineOffset);
00148           if (debug_x_ht_level >= 2) {
00149             tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
00150           }
00151           // The range of expected heights gets a vote equal to the distance
00152           // of the actual top from the expected top.
00153           for (int y = min_xht; y <= max_xht; ++y)
00154             top_stats.add(y, misfit_dist);
00155         } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
00156                     bottom - x_ht_acceptance_tolerance > max_bottom) &&
00157                    bottom_shift == 0) {
00158           // Get the range of required bottom shift.
00159           int min_shift = min_bottom - bottom;
00160           int max_shift = max_bottom - bottom;
00161           if (debug_x_ht_level >= 2) {
00162             tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
00163           }
00164           // The range of expected shifts gets a vote equal to the min distance
00165           // of the actual bottom from the expected bottom, spread over the
00166           // range of its acceptance.
00167           int misfit_weight = abs(min_shift);
00168           if (max_shift > min_shift)
00169             misfit_weight /= max_shift - min_shift;
00170           for (int y = min_shift; y <= max_shift; ++y)
00171             shift_stats.add(y, misfit_weight);
00172         } else {
00173           if (bottom_shift == 0) {
00174             // Things with bottoms that are already ok need to say so, on the
00175             // 1st iteration only.
00176             shift_stats.add(0, kBlnBaselineOffset);
00177           }
00178           if (debug_x_ht_level >= 2) {
00179             tprintf(" already OK\n");
00180           }
00181         }
00182       }
00183     }
00184     if (shift_stats.get_total() > top_stats.get_total()) {
00185       bottom_shift = IntCastRounded(shift_stats.median());
00186       if (debug_x_ht_level >= 2) {
00187         tprintf("Applying bottom shift=%d\n", bottom_shift);
00188       }
00189     }
00190   } while (bottom_shift != 0 &&
00191            top_stats.get_total() < shift_stats.get_total());
00192   // Baseline shift is opposite sign to the bottom shift.
00193   *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
00194   if (debug_x_ht_level >= 2) {
00195     tprintf("baseline shift=%g\n", *baseline_shift);
00196   }
00197   if (top_stats.get_total() == 0)
00198     return bottom_shift != 0 ? word_res->x_height : 0.0f;
00199   // The new xheight is just the median vote, which is then scaled out
00200   // of BLN space back to pixel space to get the x-height in pixel space.
00201   float new_xht = top_stats.median();
00202   if (debug_x_ht_level >= 2) {
00203     tprintf("Median xht=%f\n", new_xht);
00204     tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
00205             new_xht, new_xht / word_res->denorm.y_scale());
00206   }
00207   // The xheight must change by at least x_ht_min_change to be used.
00208   if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
00209     return new_xht / word_res->denorm.y_scale();
00210   else
00211     return bottom_shift != 0 ? word_res->x_height : 0.0f;
00212 }
00213 
00214 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines