tesseract 3.04.01

ccmain/pagesegmain.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        pagesegmain.cpp
00003  * Description: Top-level page segmenter for Tesseract.
00004  * Author:      Ray Smith
00005  * Created:     Thu Sep 25 17:12:01 PDT 2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _WIN32
00021 #ifndef __GNUC__
00022 #include <windows.h>
00023 #endif  // __GNUC__
00024 #ifndef unlink
00025 #include <io.h>
00026 #endif
00027 #else
00028 #include <unistd.h>
00029 #endif  // _WIN32
00030 #ifdef _MSC_VER
00031 #pragma warning(disable:4244)  // Conversion warnings
00032 #endif
00033 
00034 // Include automatically generated configuration file if running autoconf.
00035 #ifdef HAVE_CONFIG_H
00036 #include "config_auto.h"
00037 #endif
00038 
00039 #include "allheaders.h"
00040 #include "blobbox.h"
00041 #include "blread.h"
00042 #include "colfind.h"
00043 #include "equationdetect.h"
00044 #include "imagefind.h"
00045 #include "linefind.h"
00046 #include "makerow.h"
00047 #include "osdetect.h"
00048 #include "tabvector.h"
00049 #include "tesseractclass.h"
00050 #include "tessvars.h"
00051 #include "textord.h"
00052 #include "tordmain.h"
00053 #include "wordseg.h"
00054 
00055 namespace tesseract {
00056 
// NOTE(review): not referenced in the visible part of this file. Going by the
// name, presumably the smallest input resolution that is trusted - confirm.
00058 const int kMinCredibleResolution = 70;
// NOTE(review): not referenced in the visible part of this file. Going by the
// name, presumably the resolution assumed when the input's is not trusted -
// confirm.
00060 const int kDefaultResolution = 300;
00061 // Max erosions to perform in removing an enclosing circle.
// Used as the exclusive upper bound of the erosion loop in
// RemoveEnclosingCircle, whose index starts at 1.
00062 const int kMaxCircleErosions = 8;
00063 
00064 // Helper to remove an enclosing circle from an image.
00065 // If there isn't one, then the image will most likely get badly mangled.
00066 // The returned pix must be pixDestroyed after use. NULL may be returned
00067 // if the image doesn't meet the trivial conditions that it uses to determine
00068 // success.
00069 static Pix* RemoveEnclosingCircle(Pix* pixs) {
00070   Pix* pixsi = pixInvert(NULL, pixs);
00071   Pix* pixc = pixCreateTemplate(pixs);
00072   pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);
00073   pixSeedfillBinary(pixc, pixc, pixsi, 4);
00074   pixInvert(pixc, pixc);
00075   pixDestroy(&pixsi);
00076   Pix* pixt = pixAnd(NULL, pixs, pixc);
00077   l_int32 max_count;
00078   pixCountConnComp(pixt, 8, &max_count);
00079   // The count has to go up before we start looking for the minimum.
00080   l_int32 min_count = MAX_INT32;
00081   Pix* pixout = NULL;
00082   for (int i = 1; i < kMaxCircleErosions; i++) {
00083     pixDestroy(&pixt);
00084     pixErodeBrick(pixc, pixc, 3, 3);
00085     pixt = pixAnd(NULL, pixs, pixc);
00086     l_int32 count;
00087     pixCountConnComp(pixt, 8, &count);
00088     if (i == 1 || count > max_count) {
00089       max_count = count;
00090       min_count = count;
00091     } else if (i > 1 && count < min_count) {
00092       min_count = count;
00093       pixDestroy(&pixout);
00094       pixout = pixCopy(NULL, pixt);  // Save the best.
00095     } else if (count >= min_count) {
00096       break;  // We have passed by the best.
00097     }
00098   }
00099   pixDestroy(&pixt);
00100   pixDestroy(&pixc);
00101   return pixout;
00102 }
00103 
00109 int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
00110                            Tesseract* osd_tess, OSResults* osr) {
00111   ASSERT_HOST(pix_binary_ != NULL);
00112   int width = pixGetWidth(pix_binary_);
00113   int height = pixGetHeight(pix_binary_);
00114   // Get page segmentation mode.
00115   PageSegMode pageseg_mode = static_cast<PageSegMode>(
00116       static_cast<int>(tessedit_pageseg_mode));
00117   // If a UNLV zone file can be found, use that instead of segmentation.
00118   if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
00119       input_file != NULL && input_file->length() > 0) {
00120     STRING name = *input_file;
00121     const char* lastdot = strrchr(name.string(), '.');
00122     if (lastdot != NULL)
00123       name[lastdot - name.string()] = '\0';
00124     read_unlv_file(name, width, height, blocks);
00125   }
00126   if (blocks->empty()) {
00127     // No UNLV file present. Work according to the PageSegMode.
00128     // First make a single block covering the whole image.
00129     BLOCK_IT block_it(blocks);
00130     BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
00131     block->set_right_to_left(right_to_left());
00132     block_it.add_to_end(block);
00133   } else {
00134     // UNLV file present. Use PSM_SINGLE_BLOCK.
00135     pageseg_mode = PSM_SINGLE_BLOCK;
00136   }
00137   // The diacritic_blobs holds noise blobs that may be diacritics. They
00138   // are separated out on areas of the image that seem noisy and short-circuit
00139   // the layout process, going straight from the initial partition creation
00140   // right through to after word segmentation, where they are added to the
00141   // rej_cblobs list of the most appropriate word. From there classification
00142   // will determine whether they are used.
00143   BLOBNBOX_LIST diacritic_blobs;
00144   int auto_page_seg_ret_val = 0;
00145   TO_BLOCK_LIST to_blocks;
00146   if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
00147       PSM_SPARSE(pageseg_mode)) {
00148     auto_page_seg_ret_val = AutoPageSeg(
00149         pageseg_mode, blocks, &to_blocks,
00150         enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
00151     if (pageseg_mode == PSM_OSD_ONLY)
00152       return auto_page_seg_ret_val;
00153     // To create blobs from the image region bounds uncomment this line:
00154     //  to_blocks.clear();  // Uncomment to go back to the old mode.
00155   } else {
00156     deskew_ = FCOORD(1.0f, 0.0f);
00157     reskew_ = FCOORD(1.0f, 0.0f);
00158     if (pageseg_mode == PSM_CIRCLE_WORD) {
00159       Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
00160       if (pixcleaned != NULL) {
00161         pixDestroy(&pix_binary_);
00162         pix_binary_ = pixcleaned;
00163       }
00164     }
00165   }
00166 
00167   if (auto_page_seg_ret_val < 0) {
00168     return -1;
00169   }
00170 
00171   if (blocks->empty()) {
00172     if (textord_debug_tabfind)
00173       tprintf("Empty page\n");
00174     return 0;  // AutoPageSeg found an empty page.
00175   }
00176   bool splitting =
00177       pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
00178   bool cjk_mode = textord_use_cjk_fp_model;
00179 
00180   textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
00181                        pix_thresholds_, pix_grey_, splitting || cjk_mode,
00182                        &diacritic_blobs, blocks, &to_blocks);
00183   return auto_page_seg_ret_val;
00184 }
00185 
00186 // Helper writes a grey image to a file for use by scrollviewer.
00187 // Normally for speed we don't display the image in the layout debug windows.
00188 // If textord_debug_images is true, we draw the image as a background to some
00189 // of the debug windows. printable determines whether these
00190 // images are optimized for printing instead of screen display.
00191 static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
00192   Pix* grey_pix = pixCreate(pixGetWidth(pix_binary),
00193                             pixGetHeight(pix_binary), 8);
00194   // Printable images are light grey on white, but for screen display
00195   // they are black on dark grey so the other colors show up well.
00196   if (printable) {
00197     pixSetAll(grey_pix);
00198     pixSetMasked(grey_pix, pix_binary, 192);
00199   } else {
00200     pixSetAllArbitrary(grey_pix, 64);
00201     pixSetMasked(grey_pix, pix_binary, 0);
00202   }
00203   AlignedBlob::IncrementDebugPix();
00204   pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG);
00205   pixDestroy(&grey_pix);
00206 }
00207 
00232 int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
00233                            TO_BLOCK_LIST* to_blocks,
00234                            BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
00235                            OSResults* osr) {
00236   if (textord_debug_images) {
00237     WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
00238   }
00239   Pix* photomask_pix = NULL;
00240   Pix* musicmask_pix = NULL;
00241   // The blocks made by the ColumnFinder. Moved to blocks before return.
00242   BLOCK_LIST found_blocks;
00243   TO_BLOCK_LIST temp_blocks;
00244 
00245   ColumnFinder* finder = SetupPageSegAndDetectOrientation(
00246       pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
00247       &musicmask_pix);
00248   int result = 0;
00249   if (finder != NULL) {
00250     TO_BLOCK_IT to_block_it(&temp_blocks);
00251     TO_BLOCK* to_block = to_block_it.data();
00252     if (musicmask_pix != NULL) {
00253       // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
00254       // blocks separately. For now combine with photomask_pix.
00255       pixOr(photomask_pix, photomask_pix, musicmask_pix);
00256     }
00257     if (equ_detect_) {
00258       finder->SetEquationDetect(equ_detect_);
00259     }
00260     result = finder->FindBlocks(
00261         pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix,
00262         pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks);
00263     if (result >= 0)
00264       finder->GetDeskewVectors(&deskew_, &reskew_);
00265     delete finder;
00266   }
00267   pixDestroy(&photomask_pix);
00268   pixDestroy(&musicmask_pix);
00269   if (result < 0) return result;
00270 
00271   blocks->clear();
00272   BLOCK_IT block_it(blocks);
00273   // Move the found blocks to the input/output blocks.
00274   block_it.add_list_after(&found_blocks);
00275 
00276   if (textord_debug_images) {
00277     // The debug image is no longer needed so delete it.
00278     unlink(AlignedBlob::textord_debug_pix().string());
00279   }
00280   return result;
00281 }
00282 
00283 // Helper adds all the scripts from sid_set converted to ids from osd_set to
00284 // allowed_ids.
00285 static void AddAllScriptsConverted(const UNICHARSET& sid_set,
00286                                    const UNICHARSET& osd_set,
00287                                    GenericVector<int>* allowed_ids) {
00288   for (int i = 0; i < sid_set.get_script_table_size(); ++i) {
00289     if (i != sid_set.null_sid()) {
00290       const char* script = sid_set.get_script_from_script_id(i);
00291       allowed_ids->push_back(osd_set.get_script_id_from_name(script));
00292     }
00293   }
00294 }
00295 
00309 ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
00310     PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
00311     OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
00312     Pix** music_mask_pix) {
00313   int vertical_x = 0;
00314   int vertical_y = 1;
00315   TabVector_LIST v_lines;
00316   TabVector_LIST h_lines;
00317   ICOORD bleft(0, 0);
00318 
00319   ASSERT_HOST(pix_binary_ != NULL);
00320   if (tessedit_dump_pageseg_images) {
00321     pixWrite("tessinput.png", pix_binary_, IFF_PNG);
00322   }
00323   // Leptonica is used to find the rule/separator lines in the input.
00324   LineFinder::FindAndRemoveLines(source_resolution_,
00325                                  textord_tabfind_show_vlines, pix_binary_,
00326                                  &vertical_x, &vertical_y, music_mask_pix,
00327                                  &v_lines, &h_lines);
00328   if (tessedit_dump_pageseg_images)
00329     pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
00330   // Leptonica is used to find a mask of the photo regions in the input.
00331   *photo_mask_pix = ImageFind::FindImages(pix_binary_);
00332   if (tessedit_dump_pageseg_images)
00333     pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
00334   if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
00335 
00336   // The rest of the algorithm uses the usual connected components.
00337   textord_.find_components(pix_binary_, blocks, to_blocks);
00338 
00339   TO_BLOCK_IT to_block_it(to_blocks);
00340   // There must be exactly one input block.
00341   // TODO(rays) handle new textline finding with a UNLV zone file.
00342   ASSERT_HOST(to_blocks->singleton());
00343   TO_BLOCK* to_block = to_block_it.data();
00344   TBOX blkbox = to_block->block->bounding_box();
00345   ColumnFinder* finder = NULL;
00346 
00347   if (to_block->line_size >= 2) {
00348     finder = new ColumnFinder(static_cast<int>(to_block->line_size),
00349                               blkbox.botleft(), blkbox.topright(),
00350                               source_resolution_, textord_use_cjk_fp_model,
00351                               textord_tabfind_aligned_gap_fraction,
00352                               &v_lines, &h_lines, vertical_x, vertical_y);
00353 
00354     finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
00355 
00356     if (equ_detect_) {
00357       equ_detect_->LabelSpecialText(to_block);
00358     }
00359 
00360     BLOBNBOX_CLIST osd_blobs;
00361     // osd_orientation is the number of 90 degree rotations to make the
00362     // characters upright. (See osdetect.h for precise definition.)
00363     // We want the text lines horizontal, (vertical text indicates vertical
00364     // textlines) which may conflict (eg vertically written CJK).
00365     int osd_orientation = 0;
00366     bool vertical_text = textord_tabfind_force_vertical_text ||
00367                          pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
00368     if (!vertical_text && textord_tabfind_vertical_text &&
00369         PSM_ORIENTATION_ENABLED(pageseg_mode)) {
00370       vertical_text =
00371           finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
00372                                           to_block, &osd_blobs);
00373     }
00374     if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) {
00375       GenericVector<int> osd_scripts;
00376       if (osd_tess != this) {
00377         // We are running osd as part of layout analysis, so constrain the
00378         // scripts to those allowed by *this.
00379         AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
00380         for (int s = 0; s < sub_langs_.size(); ++s) {
00381           AddAllScriptsConverted(sub_langs_[s]->unicharset,
00382                                  osd_tess->unicharset, &osd_scripts);
00383         }
00384       }
00385       os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
00386       if (pageseg_mode == PSM_OSD_ONLY) {
00387         delete finder;
00388         return NULL;
00389       }
00390       osd_orientation = osr->best_result.orientation_id;
00391       double osd_score = osr->orientations[osd_orientation];
00392       double osd_margin = min_orientation_margin * 2;
00393       for (int i = 0; i < 4; ++i) {
00394         if (i != osd_orientation &&
00395             osd_score - osr->orientations[i] < osd_margin) {
00396           osd_margin = osd_score - osr->orientations[i];
00397         }
00398       }
00399       int best_script_id = osr->best_result.script_id;
00400       const char* best_script_str =
00401           osd_tess->unicharset.get_script_from_script_id(best_script_id);
00402       bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
00403           best_script_id == osd_tess->unicharset.hiragana_sid() ||
00404           best_script_id == osd_tess->unicharset.katakana_sid() ||
00405           strcmp("Japanese", best_script_str) == 0 ||
00406           strcmp("Korean", best_script_str) == 0 ||
00407           strcmp("Hangul", best_script_str) == 0;
00408       if (cjk) {
00409         finder->set_cjk_script(true);
00410       }
00411       if (osd_margin < min_orientation_margin) {
00412         // The margin is weak.
00413         if (!cjk && !vertical_text && osd_orientation == 2) {
00414           // upside down latin text is improbable with such a weak margin.
00415           tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
00416                   "Don't rotate.\n", osd_margin);
00417           osd_orientation = 0;
00418         } else {
00419           tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
00420                   "but using orientation anyway: %d\n",
00421                   osd_blobs.length(), osd_margin, osd_orientation);
00422         }
00423       }
00424     }
00425     osd_blobs.shallow_clear();
00426     finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
00427   }
00428 
00429   return finder;
00430 }
00431 
00432 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines