|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: pagesegmain.cpp 00003 * Description: Top-level page segmenter for Tesseract. 00004 * Author: Ray Smith 00005 * Created: Thu Sep 25 17:12:01 PDT 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _WIN32 00021 #ifndef __GNUC__ 00022 #include <windows.h> 00023 #endif // __GNUC__ 00024 #ifndef unlink 00025 #include <io.h> 00026 #endif 00027 #else 00028 #include <unistd.h> 00029 #endif // _WIN32 00030 #ifdef _MSC_VER 00031 #pragma warning(disable:4244) // Conversion warnings 00032 #endif 00033 00034 // Include automatically generated configuration file if running autoconf. 00035 #ifdef HAVE_CONFIG_H 00036 #include "config_auto.h" 00037 #endif 00038 00039 #include "allheaders.h" 00040 #include "blobbox.h" 00041 #include "blread.h" 00042 #include "colfind.h" 00043 #include "equationdetect.h" 00044 #include "imagefind.h" 00045 #include "linefind.h" 00046 #include "makerow.h" 00047 #include "osdetect.h" 00048 #include "tabvector.h" 00049 #include "tesseractclass.h" 00050 #include "tessvars.h" 00051 #include "textord.h" 00052 #include "tordmain.h" 00053 #include "wordseg.h" 00054 00055 namespace tesseract { 00056 00058 const int kMinCredibleResolution = 70; 00060 const int kDefaultResolution = 300; 00061 // Max erosions to perform in removing an enclosing circle. 00062 const int kMaxCircleErosions = 8; 00063 00064 // Helper to remove an enclosing circle from an image. 00065 // If there isn't one, then the image will most likely get badly mangled. 00066 // The returned pix must be pixDestroyed after use. NULL may be returned 00067 // if the image doesn't meet the trivial conditions that it uses to determine 00068 // success. 00069 static Pix* RemoveEnclosingCircle(Pix* pixs) { 00070 Pix* pixsi = pixInvert(NULL, pixs); 00071 Pix* pixc = pixCreateTemplate(pixs); 00072 pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET); 00073 pixSeedfillBinary(pixc, pixc, pixsi, 4); 00074 pixInvert(pixc, pixc); 00075 pixDestroy(&pixsi); 00076 Pix* pixt = pixAnd(NULL, pixs, pixc); 00077 l_int32 max_count; 00078 pixCountConnComp(pixt, 8, &max_count); 00079 // The count has to go up before we start looking for the minimum. 00080 l_int32 min_count = MAX_INT32; 00081 Pix* pixout = NULL; 00082 for (int i = 1; i < kMaxCircleErosions; i++) { 00083 pixDestroy(&pixt); 00084 pixErodeBrick(pixc, pixc, 3, 3); 00085 pixt = pixAnd(NULL, pixs, pixc); 00086 l_int32 count; 00087 pixCountConnComp(pixt, 8, &count); 00088 if (i == 1 || count > max_count) { 00089 max_count = count; 00090 min_count = count; 00091 } else if (i > 1 && count < min_count) { 00092 min_count = count; 00093 pixDestroy(&pixout); 00094 pixout = pixCopy(NULL, pixt); // Save the best. 00095 } else if (count >= min_count) { 00096 break; // We have passed by the best. 00097 } 00098 } 00099 pixDestroy(&pixt); 00100 pixDestroy(&pixc); 00101 return pixout; 00102 } 00103 00109 int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, 00110 Tesseract* osd_tess, OSResults* osr) { 00111 ASSERT_HOST(pix_binary_ != NULL); 00112 int width = pixGetWidth(pix_binary_); 00113 int height = pixGetHeight(pix_binary_); 00114 // Get page segmentation mode. 00115 PageSegMode pageseg_mode = static_cast<PageSegMode>( 00116 static_cast<int>(tessedit_pageseg_mode)); 00117 // If a UNLV zone file can be found, use that instead of segmentation. 00118 if (!PSM_COL_FIND_ENABLED(pageseg_mode) && 00119 input_file != NULL && input_file->length() > 0) { 00120 STRING name = *input_file; 00121 const char* lastdot = strrchr(name.string(), '.'); 00122 if (lastdot != NULL) 00123 name[lastdot - name.string()] = '\0'; 00124 read_unlv_file(name, width, height, blocks); 00125 } 00126 if (blocks->empty()) { 00127 // No UNLV file present. Work according to the PageSegMode. 00128 // First make a single block covering the whole image. 00129 BLOCK_IT block_it(blocks); 00130 BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height); 00131 block->set_right_to_left(right_to_left()); 00132 block_it.add_to_end(block); 00133 } else { 00134 // UNLV file present. Use PSM_SINGLE_BLOCK. 00135 pageseg_mode = PSM_SINGLE_BLOCK; 00136 } 00137 // The diacritic_blobs holds noise blobs that may be diacritics. They 00138 // are separated out on areas of the image that seem noisy and short-circuit 00139 // the layout process, going straight from the initial partition creation 00140 // right through to after word segmentation, where they are added to the 00141 // rej_cblobs list of the most appropriate word. From there classification 00142 // will determine whether they are used. 00143 BLOBNBOX_LIST diacritic_blobs; 00144 int auto_page_seg_ret_val = 0; 00145 TO_BLOCK_LIST to_blocks; 00146 if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) || 00147 PSM_SPARSE(pageseg_mode)) { 00148 auto_page_seg_ret_val = AutoPageSeg( 00149 pageseg_mode, blocks, &to_blocks, 00150 enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr); 00151 if (pageseg_mode == PSM_OSD_ONLY) 00152 return auto_page_seg_ret_val; 00153 // To create blobs from the image region bounds uncomment this line: 00154 // to_blocks.clear(); // Uncomment to go back to the old mode. 00155 } else { 00156 deskew_ = FCOORD(1.0f, 0.0f); 00157 reskew_ = FCOORD(1.0f, 0.0f); 00158 if (pageseg_mode == PSM_CIRCLE_WORD) { 00159 Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_); 00160 if (pixcleaned != NULL) { 00161 pixDestroy(&pix_binary_); 00162 pix_binary_ = pixcleaned; 00163 } 00164 } 00165 } 00166 00167 if (auto_page_seg_ret_val < 0) { 00168 return -1; 00169 } 00170 00171 if (blocks->empty()) { 00172 if (textord_debug_tabfind) 00173 tprintf("Empty page\n"); 00174 return 0; // AutoPageSeg found an empty page. 00175 } 00176 bool splitting = 00177 pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT; 00178 bool cjk_mode = textord_use_cjk_fp_model; 00179 00180 textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, 00181 pix_thresholds_, pix_grey_, splitting || cjk_mode, 00182 &diacritic_blobs, blocks, &to_blocks); 00183 return auto_page_seg_ret_val; 00184 } 00185 00186 // Helper writes a grey image to a file for use by scrollviewer. 00187 // Normally for speed we don't display the image in the layout debug windows. 00188 // If textord_debug_images is true, we draw the image as a background to some 00189 // of the debug windows. printable determines whether these 00190 // images are optimized for printing instead of screen display. 00191 static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { 00192 Pix* grey_pix = pixCreate(pixGetWidth(pix_binary), 00193 pixGetHeight(pix_binary), 8); 00194 // Printable images are light grey on white, but for screen display 00195 // they are black on dark grey so the other colors show up well. 00196 if (printable) { 00197 pixSetAll(grey_pix); 00198 pixSetMasked(grey_pix, pix_binary, 192); 00199 } else { 00200 pixSetAllArbitrary(grey_pix, 64); 00201 pixSetMasked(grey_pix, pix_binary, 0); 00202 } 00203 AlignedBlob::IncrementDebugPix(); 00204 pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG); 00205 pixDestroy(&grey_pix); 00206 } 00207 00232 int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, 00233 TO_BLOCK_LIST* to_blocks, 00234 BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess, 00235 OSResults* osr) { 00236 if (textord_debug_images) { 00237 WriteDebugBackgroundImage(textord_debug_printable, pix_binary_); 00238 } 00239 Pix* photomask_pix = NULL; 00240 Pix* musicmask_pix = NULL; 00241 // The blocks made by the ColumnFinder. Moved to blocks before return. 00242 BLOCK_LIST found_blocks; 00243 TO_BLOCK_LIST temp_blocks; 00244 00245 ColumnFinder* finder = SetupPageSegAndDetectOrientation( 00246 pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix, 00247 &musicmask_pix); 00248 int result = 0; 00249 if (finder != NULL) { 00250 TO_BLOCK_IT to_block_it(&temp_blocks); 00251 TO_BLOCK* to_block = to_block_it.data(); 00252 if (musicmask_pix != NULL) { 00253 // TODO(rays) pass the musicmask_pix into FindBlocks and mark music 00254 // blocks separately. For now combine with photomask_pix. 00255 pixOr(photomask_pix, photomask_pix, musicmask_pix); 00256 } 00257 if (equ_detect_) { 00258 finder->SetEquationDetect(equ_detect_); 00259 } 00260 result = finder->FindBlocks( 00261 pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix, 00262 pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks); 00263 if (result >= 0) 00264 finder->GetDeskewVectors(&deskew_, &reskew_); 00265 delete finder; 00266 } 00267 pixDestroy(&photomask_pix); 00268 pixDestroy(&musicmask_pix); 00269 if (result < 0) return result; 00270 00271 blocks->clear(); 00272 BLOCK_IT block_it(blocks); 00273 // Move the found blocks to the input/output blocks. 00274 block_it.add_list_after(&found_blocks); 00275 00276 if (textord_debug_images) { 00277 // The debug image is no longer needed so delete it. 00278 unlink(AlignedBlob::textord_debug_pix().string()); 00279 } 00280 return result; 00281 } 00282 00283 // Helper adds all the scripts from sid_set converted to ids from osd_set to 00284 // allowed_ids. 00285 static void AddAllScriptsConverted(const UNICHARSET& sid_set, 00286 const UNICHARSET& osd_set, 00287 GenericVector<int>* allowed_ids) { 00288 for (int i = 0; i < sid_set.get_script_table_size(); ++i) { 00289 if (i != sid_set.null_sid()) { 00290 const char* script = sid_set.get_script_from_script_id(i); 00291 allowed_ids->push_back(osd_set.get_script_id_from_name(script)); 00292 } 00293 } 00294 } 00295 00309 ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation( 00310 PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess, 00311 OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, 00312 Pix** music_mask_pix) { 00313 int vertical_x = 0; 00314 int vertical_y = 1; 00315 TabVector_LIST v_lines; 00316 TabVector_LIST h_lines; 00317 ICOORD bleft(0, 0); 00318 00319 ASSERT_HOST(pix_binary_ != NULL); 00320 if (tessedit_dump_pageseg_images) { 00321 pixWrite("tessinput.png", pix_binary_, IFF_PNG); 00322 } 00323 // Leptonica is used to find the rule/separator lines in the input. 00324 LineFinder::FindAndRemoveLines(source_resolution_, 00325 textord_tabfind_show_vlines, pix_binary_, 00326 &vertical_x, &vertical_y, music_mask_pix, 00327 &v_lines, &h_lines); 00328 if (tessedit_dump_pageseg_images) 00329 pixWrite("tessnolines.png", pix_binary_, IFF_PNG); 00330 // Leptonica is used to find a mask of the photo regions in the input. 00331 *photo_mask_pix = ImageFind::FindImages(pix_binary_); 00332 if (tessedit_dump_pageseg_images) 00333 pixWrite("tessnoimages.png", pix_binary_, IFF_PNG); 00334 if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear(); 00335 00336 // The rest of the algorithm uses the usual connected components. 00337 textord_.find_components(pix_binary_, blocks, to_blocks); 00338 00339 TO_BLOCK_IT to_block_it(to_blocks); 00340 // There must be exactly one input block. 00341 // TODO(rays) handle new textline finding with a UNLV zone file. 00342 ASSERT_HOST(to_blocks->singleton()); 00343 TO_BLOCK* to_block = to_block_it.data(); 00344 TBOX blkbox = to_block->block->bounding_box(); 00345 ColumnFinder* finder = NULL; 00346 00347 if (to_block->line_size >= 2) { 00348 finder = new ColumnFinder(static_cast<int>(to_block->line_size), 00349 blkbox.botleft(), blkbox.topright(), 00350 source_resolution_, textord_use_cjk_fp_model, 00351 textord_tabfind_aligned_gap_fraction, 00352 &v_lines, &h_lines, vertical_x, vertical_y); 00353 00354 finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block); 00355 00356 if (equ_detect_) { 00357 equ_detect_->LabelSpecialText(to_block); 00358 } 00359 00360 BLOBNBOX_CLIST osd_blobs; 00361 // osd_orientation is the number of 90 degree rotations to make the 00362 // characters upright. (See osdetect.h for precise definition.) 00363 // We want the text lines horizontal, (vertical text indicates vertical 00364 // textlines) which may conflict (eg vertically written CJK). 00365 int osd_orientation = 0; 00366 bool vertical_text = textord_tabfind_force_vertical_text || 00367 pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT; 00368 if (!vertical_text && textord_tabfind_vertical_text && 00369 PSM_ORIENTATION_ENABLED(pageseg_mode)) { 00370 vertical_text = 00371 finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio, 00372 to_block, &osd_blobs); 00373 } 00374 if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) { 00375 GenericVector<int> osd_scripts; 00376 if (osd_tess != this) { 00377 // We are running osd as part of layout analysis, so constrain the 00378 // scripts to those allowed by *this. 00379 AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts); 00380 for (int s = 0; s < sub_langs_.size(); ++s) { 00381 AddAllScriptsConverted(sub_langs_[s]->unicharset, 00382 osd_tess->unicharset, &osd_scripts); 00383 } 00384 } 00385 os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess); 00386 if (pageseg_mode == PSM_OSD_ONLY) { 00387 delete finder; 00388 return NULL; 00389 } 00390 osd_orientation = osr->best_result.orientation_id; 00391 double osd_score = osr->orientations[osd_orientation]; 00392 double osd_margin = min_orientation_margin * 2; 00393 for (int i = 0; i < 4; ++i) { 00394 if (i != osd_orientation && 00395 osd_score - osr->orientations[i] < osd_margin) { 00396 osd_margin = osd_score - osr->orientations[i]; 00397 } 00398 } 00399 int best_script_id = osr->best_result.script_id; 00400 const char* best_script_str = 00401 osd_tess->unicharset.get_script_from_script_id(best_script_id); 00402 bool cjk = best_script_id == osd_tess->unicharset.han_sid() || 00403 best_script_id == osd_tess->unicharset.hiragana_sid() || 00404 best_script_id == osd_tess->unicharset.katakana_sid() || 00405 strcmp("Japanese", best_script_str) == 0 || 00406 strcmp("Korean", best_script_str) == 0 || 00407 strcmp("Hangul", best_script_str) == 0; 00408 if (cjk) { 00409 finder->set_cjk_script(true); 00410 } 00411 if (osd_margin < min_orientation_margin) { 00412 // The margin is weak. 00413 if (!cjk && !vertical_text && osd_orientation == 2) { 00414 // upside down latin text is improbable with such a weak margin. 00415 tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: " 00416 "Don't rotate.\n", osd_margin); 00417 osd_orientation = 0; 00418 } else { 00419 tprintf("OSD: Weak margin (%.2f) for %d blob text block, " 00420 "but using orientation anyway: %d\n", 00421 osd_blobs.length(), osd_margin, osd_orientation); 00422 } 00423 } 00424 } 00425 osd_blobs.shallow_clear(); 00426 finder->CorrectOrientation(to_block, vertical_text, osd_orientation); 00427 } 00428 00429 return finder; 00430 } 00431 00432 } // namespace tesseract.