|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: fixxht.cpp (Formerly fixxht.c) 00003 * Description: Improve x_ht and look out for case inconsistencies 00004 * Author: Phil Cheatle 00005 * Created: Thu Aug 5 14:11:08 BST 1993 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include <string.h> 00021 #include <ctype.h> 00022 #include "params.h" 00023 #include "float2int.h" 00024 #include "tesseractclass.h" 00025 00026 namespace tesseract { 00027 00028 // Fixxht overview. 00029 // Premise: Initial estimate of x-height is adequate most of the time, but 00030 // occasionally it is incorrect. Most notable causes of failure are: 00031 // 1. Small caps, where the top of the caps is the same as the body text 00032 // xheight. For small caps words the xheight needs to be reduced to correctly 00033 // recognize the caps in the small caps word. 00034 // 2. All xheight lines, such as summer. Here the initial estimate will have 00035 // guessed that the blob tops are caps and will have placed the xheight too low. 00036 // 3. Noise/logos beside words, or changes in font size on a line. Such 00037 // things can blow the statistics and cause an incorrect estimate. 00038 // 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged. 
//    In this case the x-height is often still correct.
//
// Algorithm.
// Compare the vertical position (top only) of alphanumerics in a word with
// the range of positions in training data (in the unicharset).
// See CountMisfitTops. If any characters disagree sufficiently with the
// initial xheight estimate, then recalculate the xheight, re-run OCR on
// the word, and if the number of vertical misfits goes down, along with
// either the word rating or certainty, then keep the new xheight.
// The new xheight is calculated as follows (see ComputeCompatibleXheight):
// For each alphanumeric character that has a vertically misplaced top
// (a misfit), yet its bottom is within the acceptable range (i.e. it is not
// likely a sub- or super-script) calculate the range of acceptable xheight
// positions from its range of tops, and give each value in the range a
// number of votes equal to the distance of its top from its acceptance range.
// The x-height position with the median of the votes becomes the new
// x-height. This assumes that most characters will be correctly recognized
// even if the x-height is incorrect. This is not a terrible assumption, but
// it is not great. An improvement would be to use a classifier that does
// not care about vertical position or scaling at all.
// Separately collect stats on shifted baselines and apply the same logic to
// computing a best-fit shift to fix the error. If the baseline needs to be
// shifted, but the x-height is OK, returns the original x-height along with
// the baseline shift to indicate that recognition needs to re-run.

// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
// then the char top cannot be used to judge misfits or suggest a new top.
00066 const int kMaxCharTopRange = 48; 00067 00068 // Returns the number of misfit blob tops in this word. 00069 int Tesseract::CountMisfitTops(WERD_RES *word_res) { 00070 int bad_blobs = 0; 00071 int num_blobs = word_res->rebuild_word->NumBlobs(); 00072 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) { 00073 TBLOB* blob = word_res->rebuild_word->blobs[blob_id]; 00074 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); 00075 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { 00076 int top = blob->bounding_box().top(); 00077 if (top >= INT_FEAT_RANGE) 00078 top = INT_FEAT_RANGE - 1; 00079 int min_bottom, max_bottom, min_top, max_top; 00080 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, 00081 &min_top, &max_top); 00082 if (max_top - min_top > kMaxCharTopRange) 00083 continue; 00084 bool bad = top < min_top - x_ht_acceptance_tolerance || 00085 top > max_top + x_ht_acceptance_tolerance; 00086 if (bad) 00087 ++bad_blobs; 00088 if (debug_x_ht_level >= 1) { 00089 tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n", 00090 unicharset.id_to_unichar(class_id), 00091 bad ? "Misfit" : "OK", top, min_top, max_top, 00092 static_cast<int>(x_ht_acceptance_tolerance)); 00093 } 00094 } 00095 } 00096 return bad_blobs; 00097 } 00098 00099 // Returns a new x-height maximally compatible with the result in word_res. 00100 // See comment above for overall algorithm. 
00101 float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, 00102 float* baseline_shift) { 00103 STATS top_stats(0, MAX_UINT8); 00104 STATS shift_stats(-MAX_UINT8, MAX_UINT8); 00105 int bottom_shift = 0; 00106 int num_blobs = word_res->rebuild_word->NumBlobs(); 00107 do { 00108 top_stats.clear(); 00109 shift_stats.clear(); 00110 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) { 00111 TBLOB* blob = word_res->rebuild_word->blobs[blob_id]; 00112 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); 00113 if (unicharset.get_isalpha(class_id) || 00114 unicharset.get_isdigit(class_id)) { 00115 int top = blob->bounding_box().top() + bottom_shift; 00116 // Clip the top to the limit of normalized feature space. 00117 if (top >= INT_FEAT_RANGE) 00118 top = INT_FEAT_RANGE - 1; 00119 int bottom = blob->bounding_box().bottom() + bottom_shift; 00120 int min_bottom, max_bottom, min_top, max_top; 00121 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, 00122 &min_top, &max_top); 00123 // Chars with a wild top range would mess up the result so ignore them. 00124 if (max_top - min_top > kMaxCharTopRange) 00125 continue; 00126 int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top, 00127 top - (max_top + x_ht_acceptance_tolerance)); 00128 int height = top - kBlnBaselineOffset; 00129 if (debug_x_ht_level >= 2) { 00130 tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ", 00131 unicharset.id_to_unichar(class_id), 00132 height, min_bottom, max_bottom, min_top, max_top, 00133 bottom, top); 00134 } 00135 // Use only chars that fit in the expected bottom range, and where 00136 // the range of tops is sensibly near the xheight. 
00137 if (min_bottom <= bottom + x_ht_acceptance_tolerance && 00138 bottom - x_ht_acceptance_tolerance <= max_bottom && 00139 min_top > kBlnBaselineOffset && 00140 max_top - kBlnBaselineOffset >= kBlnXHeight && 00141 misfit_dist > 0) { 00142 // Compute the x-height position using proportionality between the 00143 // actual height and expected height. 00144 int min_xht = DivRounded(height * kBlnXHeight, 00145 max_top - kBlnBaselineOffset); 00146 int max_xht = DivRounded(height * kBlnXHeight, 00147 min_top - kBlnBaselineOffset); 00148 if (debug_x_ht_level >= 2) { 00149 tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht); 00150 } 00151 // The range of expected heights gets a vote equal to the distance 00152 // of the actual top from the expected top. 00153 for (int y = min_xht; y <= max_xht; ++y) 00154 top_stats.add(y, misfit_dist); 00155 } else if ((min_bottom > bottom + x_ht_acceptance_tolerance || 00156 bottom - x_ht_acceptance_tolerance > max_bottom) && 00157 bottom_shift == 0) { 00158 // Get the range of required bottom shift. 00159 int min_shift = min_bottom - bottom; 00160 int max_shift = max_bottom - bottom; 00161 if (debug_x_ht_level >= 2) { 00162 tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift); 00163 } 00164 // The range of expected shifts gets a vote equal to the min distance 00165 // of the actual bottom from the expected bottom, spread over the 00166 // range of its acceptance. 00167 int misfit_weight = abs(min_shift); 00168 if (max_shift > min_shift) 00169 misfit_weight /= max_shift - min_shift; 00170 for (int y = min_shift; y <= max_shift; ++y) 00171 shift_stats.add(y, misfit_weight); 00172 } else { 00173 if (bottom_shift == 0) { 00174 // Things with bottoms that are already ok need to say so, on the 00175 // 1st iteration only. 
00176 shift_stats.add(0, kBlnBaselineOffset); 00177 } 00178 if (debug_x_ht_level >= 2) { 00179 tprintf(" already OK\n"); 00180 } 00181 } 00182 } 00183 } 00184 if (shift_stats.get_total() > top_stats.get_total()) { 00185 bottom_shift = IntCastRounded(shift_stats.median()); 00186 if (debug_x_ht_level >= 2) { 00187 tprintf("Applying bottom shift=%d\n", bottom_shift); 00188 } 00189 } 00190 } while (bottom_shift != 0 && 00191 top_stats.get_total() < shift_stats.get_total()); 00192 // Baseline shift is opposite sign to the bottom shift. 00193 *baseline_shift = -bottom_shift / word_res->denorm.y_scale(); 00194 if (debug_x_ht_level >= 2) { 00195 tprintf("baseline shift=%g\n", *baseline_shift); 00196 } 00197 if (top_stats.get_total() == 0) 00198 return bottom_shift != 0 ? word_res->x_height : 0.0f; 00199 // The new xheight is just the median vote, which is then scaled out 00200 // of BLN space back to pixel space to get the x-height in pixel space. 00201 float new_xht = top_stats.median(); 00202 if (debug_x_ht_level >= 2) { 00203 tprintf("Median xht=%f\n", new_xht); 00204 tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", 00205 new_xht, new_xht / word_res->denorm.y_scale()); 00206 } 00207 // The xheight must change by at least x_ht_min_change to be used. 00208 if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change) 00209 return new_xht / word_res->denorm.y_scale(); 00210 else 00211 return bottom_shift != 0 ? word_res->x_height : 0.0f; 00212 } 00213 00214 } // namespace tesseract