|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: baseapi.cpp 00003 * Description: Simple API for calling tesseract. 00004 * Author: Ray Smith 00005 * Created: Fri Oct 06 15:35:01 PDT 2006 00006 * 00007 * (C) Copyright 2006, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 // Include automatically generated configuration file if running autoconf. 00021 #ifdef HAVE_CONFIG_H 00022 #include "config_auto.h" 00023 #endif 00024 00025 #ifdef __linux__ 00026 #include <signal.h> 00027 #endif 00028 00029 #if defined(_WIN32) 00030 #ifdef _MSC_VER 00031 #include "vcsversion.h" 00032 #include "mathfix.h" 00033 #elif MINGW 00034 // workaround for stdlib.h with -std=c++11 for _splitpath and _MAX_FNAME 00035 #undef __STRICT_ANSI__ 00036 #endif // _MSC_VER 00037 #include <stdlib.h> 00038 #include <windows.h> 00039 #include <fcntl.h> 00040 #include <io.h> 00041 #else 00042 #include <dirent.h> 00043 #include <libgen.h> 00044 #include <string.h> 00045 #endif // _WIN32 00046 00047 #include <iostream> 00048 #include <string> 00049 #include <iterator> 00050 #include <fstream> 00051 00052 #include "allheaders.h" 00053 00054 #include "baseapi.h" 00055 #include "blobclass.h" 00056 #include "resultiterator.h" 00057 #include "mutableiterator.h" 00058 #include "thresholder.h" 00059 #include "tesseractclass.h" 00060 #include "pageres.h" 00061 #include "paragraphs.h" 00062 #include "tessvars.h" 00063 #include "control.h" 00064 #include "dict.h" 00065 #include "pgedit.h" 00066 #include "paramsd.h" 00067 #include "output.h" 00068 #include "globaloc.h" 00069 #include "globals.h" 00070 #include "edgblob.h" 00071 #include "equationdetect.h" 00072 #include "tessbox.h" 00073 #include "makerow.h" 00074 #include "otsuthr.h" 00075 #include "osdetect.h" 00076 #include "params.h" 00077 #include "renderer.h" 00078 #include "strngs.h" 00079 #include "openclwrapper.h" 00080 00081 BOOL_VAR(stream_filelist, FALSE, "Stream a filelist from stdin"); 00082 00083 namespace tesseract { 00084 00086 const int kMinRectSize = 10; 00088 const char kTesseractReject = '~'; 00090 const char kUNLVReject = '~'; 00092 const char kUNLVSuspect = '^'; 00097 const char* kInputFile = "noname.tif"; 00101 const char* kOldVarsFile = "failed_vars.txt"; 00103 const int kMaxIntSize = 22; 00108 const int kMinCredibleResolution = 70; 00110 const int kMaxCredibleResolution = 2400; 00111 00112 TessBaseAPI::TessBaseAPI() 00113 : tesseract_(NULL), 00114 osd_tesseract_(NULL), 00115 equ_detect_(NULL), 00116 // Thresholder is initialized to NULL here, but will be set before use by: 00117 // A constructor of a derived API, SetThresholder(), or 00118 // created implicitly when used in InternalSetImage. 00119 thresholder_(NULL), 00120 paragraph_models_(NULL), 00121 block_list_(NULL), 00122 page_res_(NULL), 00123 input_file_(NULL), 00124 input_image_(NULL), 00125 output_file_(NULL), 00126 datapath_(NULL), 00127 language_(NULL), 00128 last_oem_requested_(OEM_DEFAULT), 00129 recognition_done_(false), 00130 truth_cb_(NULL), 00131 rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0), 00132 image_width_(0), image_height_(0) { 00133 } 00134 00135 TessBaseAPI::~TessBaseAPI() { 00136 End(); 00137 } 00138 00142 const char* TessBaseAPI::Version() { 00143 #if defined(GIT_REV) && (defined(DEBUG) || defined(_DEBUG)) 00144 return GIT_REV; 00145 #else 00146 return TESSERACT_VERSION_STR; 00147 #endif 00148 } 00149 00157 #ifdef USE_OPENCL 00158 #if USE_DEVICE_SELECTION 00159 #include "opencl_device_selection.h" 00160 #endif 00161 #endif 00162 size_t TessBaseAPI::getOpenCLDevice(void **data) { 00163 #ifdef USE_OPENCL 00164 #if USE_DEVICE_SELECTION 00165 ds_device device = OpenclDevice::getDeviceSelection(); 00166 if (device.type == DS_DEVICE_OPENCL_DEVICE) { 00167 *data = reinterpret_cast<void*>(new cl_device_id); 00168 memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id)); 00169 return sizeof(cl_device_id); 00170 } 00171 #endif 00172 #endif 00173 00174 *data = NULL; 00175 return 0; 00176 } 00177 00182 void TessBaseAPI::CatchSignals() { 00183 #ifdef __linux__ 00184 struct sigaction action; 00185 memset(&action, 0, sizeof(action)); 00186 action.sa_handler = &signal_exit; 00187 action.sa_flags = SA_RESETHAND; 00188 sigaction(SIGSEGV, &action, NULL); 00189 sigaction(SIGFPE, &action, NULL); 00190 sigaction(SIGBUS, &action, NULL); 00191 #else 00192 // Warn API users that an implementation is needed. 00193 tprintf("CatchSignals has no non-linux implementation!\n"); 00194 #endif 00195 } 00196 00201 void TessBaseAPI::SetInputName(const char* name) { 00202 if (input_file_ == NULL) 00203 input_file_ = new STRING(name); 00204 else 00205 *input_file_ = name; 00206 } 00207 00209 void TessBaseAPI::SetOutputName(const char* name) { 00210 if (output_file_ == NULL) 00211 output_file_ = new STRING(name); 00212 else 00213 *output_file_ = name; 00214 } 00215 00216 bool TessBaseAPI::SetVariable(const char* name, const char* value) { 00217 if (tesseract_ == NULL) tesseract_ = new Tesseract; 00218 return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY, 00219 tesseract_->params()); 00220 } 00221 00222 bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) { 00223 if (tesseract_ == NULL) tesseract_ = new Tesseract; 00224 return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, 00225 tesseract_->params()); 00226 } 00227 00228 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const { 00229 IntParam *p = ParamUtils::FindParam<IntParam>( 00230 name, GlobalParams()->int_params, tesseract_->params()->int_params); 00231 if (p == NULL) return false; 00232 *value = (inT32)(*p); 00233 return true; 00234 } 00235 00236 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const { 00237 BoolParam *p = ParamUtils::FindParam<BoolParam>( 00238 name, GlobalParams()->bool_params, tesseract_->params()->bool_params); 00239 if (p == NULL) return false; 00240 *value = (BOOL8)(*p); 00241 return true; 00242 } 00243 00244 const char *TessBaseAPI::GetStringVariable(const char *name) const { 00245 StringParam *p = ParamUtils::FindParam<StringParam>( 00246 name, GlobalParams()->string_params, tesseract_->params()->string_params); 00247 return (p != NULL) ? p->string() : NULL; 00248 } 00249 00250 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const { 00251 DoubleParam *p = ParamUtils::FindParam<DoubleParam>( 00252 name, GlobalParams()->double_params, tesseract_->params()->double_params); 00253 if (p == NULL) return false; 00254 *value = (double)(*p); 00255 return true; 00256 } 00257 00259 bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) { 00260 return ParamUtils::GetParamAsString(name, tesseract_->params(), val); 00261 } 00262 00264 void TessBaseAPI::PrintVariables(FILE *fp) const { 00265 ParamUtils::PrintParams(fp, tesseract_->params()); 00266 } 00267 00276 int TessBaseAPI::Init(const char* datapath, const char* language, 00277 OcrEngineMode oem, char **configs, int configs_size, 00278 const GenericVector<STRING> *vars_vec, 00279 const GenericVector<STRING> *vars_values, 00280 bool set_only_non_debug_params) { 00281 PERF_COUNT_START("TessBaseAPI::Init") 00282 // Default language is "eng". 00283 if (language == NULL) language = "eng"; 00284 // If the datapath, OcrEngineMode or the language have changed - start again. 00285 // Note that the language_ field stores the last requested language that was 00286 // initialized successfully, while tesseract_->lang stores the language 00287 // actually used. They differ only if the requested language was NULL, in 00288 // which case tesseract_->lang is set to the Tesseract default ("eng"). 00289 if (tesseract_ != NULL && 00290 (datapath_ == NULL || language_ == NULL || 00291 *datapath_ != datapath || last_oem_requested_ != oem || 00292 (*language_ != language && tesseract_->lang != language))) { 00293 delete tesseract_; 00294 tesseract_ = NULL; 00295 } 00296 // PERF_COUNT_SUB("delete tesseract_") 00297 #ifdef USE_OPENCL 00298 OpenclDevice od; 00299 od.InitEnv(); 00300 #endif 00301 PERF_COUNT_SUB("OD::InitEnv()") 00302 bool reset_classifier = true; 00303 if (tesseract_ == NULL) { 00304 reset_classifier = false; 00305 tesseract_ = new Tesseract; 00306 if (tesseract_->init_tesseract( 00307 datapath, output_file_ != NULL ? output_file_->string() : NULL, 00308 language, oem, configs, configs_size, vars_vec, vars_values, 00309 set_only_non_debug_params) != 0) { 00310 return -1; 00311 } 00312 } 00313 PERF_COUNT_SUB("update tesseract_") 00314 // Update datapath and language requested for the last valid initialization. 00315 if (datapath_ == NULL) 00316 datapath_ = new STRING(datapath); 00317 else 00318 *datapath_ = datapath; 00319 if ((strcmp(datapath_->string(), "") == 0) && 00320 (strcmp(tesseract_->datadir.string(), "") != 0)) 00321 *datapath_ = tesseract_->datadir; 00322 00323 if (language_ == NULL) 00324 language_ = new STRING(language); 00325 else 00326 *language_ = language; 00327 last_oem_requested_ = oem; 00328 // PERF_COUNT_SUB("update last_oem_requested_") 00329 // For same language and datapath, just reset the adaptive classifier. 00330 if (reset_classifier) { 00331 tesseract_->ResetAdaptiveClassifier(); 00332 PERF_COUNT_SUB("tesseract_->ResetAdaptiveClassifier()") 00333 } 00334 PERF_COUNT_END 00335 return 0; 00336 } 00337 00346 const char* TessBaseAPI::GetInitLanguagesAsString() const { 00347 return (language_ == NULL || language_->string() == NULL) ? 00348 "" : language_->string(); 00349 } 00350 00356 void TessBaseAPI::GetLoadedLanguagesAsVector( 00357 GenericVector<STRING>* langs) const { 00358 langs->clear(); 00359 if (tesseract_ != NULL) { 00360 langs->push_back(tesseract_->lang); 00361 int num_subs = tesseract_->num_sub_langs(); 00362 for (int i = 0; i < num_subs; ++i) 00363 langs->push_back(tesseract_->get_sub_lang(i)->lang); 00364 } 00365 } 00366 00370 void TessBaseAPI::GetAvailableLanguagesAsVector( 00371 GenericVector<STRING>* langs) const { 00372 langs->clear(); 00373 if (tesseract_ != NULL) { 00374 #ifdef _WIN32 00375 STRING pattern = tesseract_->datadir + "/*." + kTrainedDataSuffix; 00376 char fname[_MAX_FNAME]; 00377 WIN32_FIND_DATA data; 00378 BOOL result = TRUE; 00379 HANDLE handle = FindFirstFile(pattern.string(), &data); 00380 if (handle != INVALID_HANDLE_VALUE) { 00381 for (; result; result = FindNextFile(handle, &data)) { 00382 _splitpath(data.cFileName, NULL, NULL, fname, NULL); 00383 langs->push_back(STRING(fname)); 00384 } 00385 FindClose(handle); 00386 } 00387 #else // _WIN32 00388 DIR *dir; 00389 struct dirent *dirent; 00390 char *dot; 00391 00392 STRING extension = STRING(".") + kTrainedDataSuffix; 00393 00394 dir = opendir(tesseract_->datadir.string()); 00395 if (dir != NULL) { 00396 while ((dirent = readdir(dir))) { 00397 // Skip '.', '..', and hidden files 00398 if (dirent->d_name[0] != '.') { 00399 if (strstr(dirent->d_name, extension.string()) != NULL) { 00400 dot = strrchr(dirent->d_name, '.'); 00401 // This ensures that .traineddata is at the end of the file name 00402 if (strncmp(dot, extension.string(), 00403 strlen(extension.string())) == 0) { 00404 *dot = '\0'; 00405 langs->push_back(STRING(dirent->d_name)); 00406 } 00407 } 00408 } 00409 } 00410 closedir(dir); 00411 } 00412 #endif 00413 } 00414 } 00415 00422 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) { 00423 if (tesseract_ == NULL) 00424 tesseract_ = new Tesseract; 00425 else 00426 ParamUtils::ResetToDefaults(tesseract_->params()); 00427 return tesseract_->init_tesseract_lm(datapath, NULL, language); 00428 } 00429 00434 void TessBaseAPI::InitForAnalysePage() { 00435 if (tesseract_ == NULL) { 00436 tesseract_ = new Tesseract; 00437 tesseract_->InitAdaptiveClassifier(false); 00438 } 00439 } 00440 00446 void TessBaseAPI::ReadConfigFile(const char* filename) { 00447 tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY); 00448 } 00449 00451 void TessBaseAPI::ReadDebugConfigFile(const char* filename) { 00452 tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY); 00453 } 00454 00460 void TessBaseAPI::SetPageSegMode(PageSegMode mode) { 00461 if (tesseract_ == NULL) 00462 tesseract_ = new Tesseract; 00463 tesseract_->tessedit_pageseg_mode.set_value(mode); 00464 } 00465 00467 PageSegMode TessBaseAPI::GetPageSegMode() const { 00468 if (tesseract_ == NULL) 00469 return PSM_SINGLE_BLOCK; 00470 return static_cast<PageSegMode>( 00471 static_cast<int>(tesseract_->tessedit_pageseg_mode)); 00472 } 00473 00487 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata, 00488 int bytes_per_pixel, 00489 int bytes_per_line, 00490 int left, int top, 00491 int width, int height) { 00492 if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize) 00493 return NULL; // Nothing worth doing. 00494 00495 // Since this original api didn't give the exact size of the image, 00496 // we have to invent a reasonable value. 00497 int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8; 00498 SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, 00499 bytes_per_pixel, bytes_per_line); 00500 SetRectangle(left, top, width, height); 00501 00502 return GetUTF8Text(); 00503 } 00504 00509 void TessBaseAPI::ClearAdaptiveClassifier() { 00510 if (tesseract_ == NULL) 00511 return; 00512 tesseract_->ResetAdaptiveClassifier(); 00513 tesseract_->ResetDocumentDictionary(); 00514 } 00515 00525 void TessBaseAPI::SetImage(const unsigned char* imagedata, 00526 int width, int height, 00527 int bytes_per_pixel, int bytes_per_line) { 00528 if (InternalSetImage()) 00529 thresholder_->SetImage(imagedata, width, height, 00530 bytes_per_pixel, bytes_per_line); 00531 } 00532 00533 void TessBaseAPI::SetSourceResolution(int ppi) { 00534 if (thresholder_) 00535 thresholder_->SetSourceYResolution(ppi); 00536 else 00537 tprintf("Please call SetImage before SetSourceResolution.\n"); 00538 } 00539 00550 void TessBaseAPI::SetImage(Pix* pix) { 00551 if (InternalSetImage()) 00552 thresholder_->SetImage(pix); 00553 SetInputImage(pix); 00554 } 00555 00561 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) { 00562 if (thresholder_ == NULL) 00563 return; 00564 thresholder_->SetRectangle(left, top, width, height); 00565 ClearResults(); 00566 } 00567 00572 Pix* TessBaseAPI::GetThresholdedImage() { 00573 if (tesseract_ == NULL || thresholder_ == NULL) 00574 return NULL; 00575 if (tesseract_->pix_binary() == NULL) 00576 Threshold(tesseract_->mutable_pix_binary()); 00577 return pixClone(tesseract_->pix_binary()); 00578 } 00579 00585 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) { 00586 return GetComponentImages(RIL_BLOCK, false, pixa, NULL); 00587 } 00588 00597 Boxa* TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, 00598 Pixa** pixa, int** blockids, int** paraids) { 00599 return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, 00600 pixa, blockids, paraids); 00601 } 00602 00611 Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) { 00612 return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids); 00613 } 00614 00620 Boxa* TessBaseAPI::GetWords(Pixa** pixa) { 00621 return GetComponentImages(RIL_WORD, true, pixa, NULL); 00622 } 00623 00630 Boxa* TessBaseAPI::GetConnectedComponents(Pixa** pixa) { 00631 return GetComponentImages(RIL_SYMBOL, true, pixa, NULL); 00632 } 00633 00642 Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level, 00643 bool text_only, bool raw_image, 00644 const int raw_padding, 00645 Pixa** pixa, int** blockids, 00646 int** paraids) { 00647 PageIterator* page_it = GetIterator(); 00648 if (page_it == NULL) 00649 page_it = AnalyseLayout(); 00650 if (page_it == NULL) 00651 return NULL; // Failed. 00652 00653 // Count the components to get a size for the arrays. 00654 int component_count = 0; 00655 int left, top, right, bottom; 00656 00657 TessResultCallback<bool>* get_bbox = NULL; 00658 if (raw_image) { 00659 // Get bounding box in original raw image with padding. 00660 get_bbox = NewPermanentTessCallback(page_it, &PageIterator::BoundingBox, 00661 level, raw_padding, 00662 &left, &top, &right, &bottom); 00663 } else { 00664 // Get bounding box from binarized imaged. Note that this could be 00665 // differently scaled from the original image. 00666 get_bbox = NewPermanentTessCallback(page_it, 00667 &PageIterator::BoundingBoxInternal, 00668 level, &left, &top, &right, &bottom); 00669 } 00670 do { 00671 if (get_bbox->Run() && 00672 (!text_only || PTIsTextType(page_it->BlockType()))) 00673 ++component_count; 00674 } while (page_it->Next(level)); 00675 00676 Boxa* boxa = boxaCreate(component_count); 00677 if (pixa != NULL) 00678 *pixa = pixaCreate(component_count); 00679 if (blockids != NULL) 00680 *blockids = new int[component_count]; 00681 if (paraids != NULL) 00682 *paraids = new int[component_count]; 00683 00684 int blockid = 0; 00685 int paraid = 0; 00686 int component_index = 0; 00687 page_it->Begin(); 00688 do { 00689 if (get_bbox->Run() && 00690 (!text_only || PTIsTextType(page_it->BlockType()))) { 00691 Box* lbox = boxCreate(left, top, right - left, bottom - top); 00692 boxaAddBox(boxa, lbox, L_INSERT); 00693 if (pixa != NULL) { 00694 Pix* pix = NULL; 00695 if (raw_image) { 00696 pix = page_it->GetImage(level, raw_padding, input_image_, 00697 &left, &top); 00698 } else { 00699 pix = page_it->GetBinaryImage(level); 00700 } 00701 pixaAddPix(*pixa, pix, L_INSERT); 00702 pixaAddBox(*pixa, lbox, L_CLONE); 00703 } 00704 if (paraids != NULL) { 00705 (*paraids)[component_index] = paraid; 00706 if (page_it->IsAtFinalElement(RIL_PARA, level)) 00707 ++paraid; 00708 } 00709 if (blockids != NULL) { 00710 (*blockids)[component_index] = blockid; 00711 if (page_it->IsAtFinalElement(RIL_BLOCK, level)) { 00712 ++blockid; 00713 paraid = 0; 00714 } 00715 } 00716 ++component_index; 00717 } 00718 } while (page_it->Next(level)); 00719 delete page_it; 00720 delete get_bbox; 00721 return boxa; 00722 } 00723 00724 int TessBaseAPI::GetThresholdedImageScaleFactor() const { 00725 if (thresholder_ == NULL) { 00726 return 0; 00727 } 00728 return thresholder_->GetScaleFactor(); 00729 } 00730 00732 void TessBaseAPI::DumpPGM(const char* filename) { 00733 if (tesseract_ == NULL) 00734 return; 00735 FILE *fp = fopen(filename, "wb"); 00736 Pix* pix = tesseract_->pix_binary(); 00737 int width = pixGetWidth(pix); 00738 int height = pixGetHeight(pix); 00739 l_uint32* data = pixGetData(pix); 00740 fprintf(fp, "P5 %d %d 255\n", width, height); 00741 for (int y = 0; y < height; ++y, data += pixGetWpl(pix)) { 00742 for (int x = 0; x < width; ++x) { 00743 uinT8 b = GET_DATA_BIT(data, x) ? 0 : 255; 00744 fwrite(&b, 1, 1, fp); 00745 } 00746 } 00747 fclose(fp); 00748 } 00749 00750 #ifndef NO_CUBE_BUILD 00751 00757 int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks, 00758 Boxa* boxa_words, Pixa* pixa_words, 00759 const FCOORD& reskew, Pix* page_pix, 00760 PAGE_RES* page_res) { 00761 int block_count = boxaGetCount(boxa_blocks); 00762 ASSERT_HOST(block_count == pixaGetCount(pixa_blocks)); 00763 // Write each block to the current directory as junk_write_display.nnn.png. 00764 for (int i = 0; i < block_count; ++i) { 00765 Pix* pix = pixaGetPix(pixa_blocks, i, L_CLONE); 00766 pixDisplayWrite(pix, 1); 00767 } 00768 int word_count = boxaGetCount(boxa_words); 00769 ASSERT_HOST(word_count == pixaGetCount(pixa_words)); 00770 int pr_word = 0; 00771 PAGE_RES_IT page_res_it(page_res); 00772 for (page_res_it.restart_page(); page_res_it.word () != NULL; 00773 page_res_it.forward(), ++pr_word) { 00774 WERD_RES *word = page_res_it.word(); 00775 WERD_CHOICE* choice = word->best_choice; 00776 // Write the first 100 words to files names wordims/<wordstring>.tif. 00777 if (pr_word < 100) { 00778 STRING filename("wordims/"); 00779 if (choice != NULL) { 00780 filename += choice->unichar_string(); 00781 } else { 00782 char numbuf[32]; 00783 filename += "unclassified"; 00784 snprintf(numbuf, 32, "%03d", pr_word); 00785 filename += numbuf; 00786 } 00787 filename += ".tif"; 00788 Pix* pix = pixaGetPix(pixa_words, pr_word, L_CLONE); 00789 pixWrite(filename.string(), pix, IFF_TIFF_G4); 00790 } 00791 } 00792 ASSERT_HOST(pr_word == word_count); 00793 return 0; 00794 } 00795 #endif // NO_CUBE_BUILD 00796 00812 PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) { 00813 if (FindLines() == 0) { 00814 if (block_list_->empty()) 00815 return NULL; // The page was empty. 00816 page_res_ = new PAGE_RES(merge_similar_words, block_list_, NULL); 00817 DetectParagraphs(false); 00818 return new PageIterator( 00819 page_res_, tesseract_, thresholder_->GetScaleFactor(), 00820 thresholder_->GetScaledYResolution(), 00821 rect_left_, rect_top_, rect_width_, rect_height_); 00822 } 00823 return NULL; 00824 } 00825 00830 int TessBaseAPI::Recognize(ETEXT_DESC* monitor) { 00831 if (tesseract_ == NULL) 00832 return -1; 00833 if (FindLines() != 0) 00834 return -1; 00835 if (page_res_ != NULL) 00836 delete page_res_; 00837 if (block_list_->empty()) { 00838 page_res_ = new PAGE_RES(false, block_list_, 00839 &tesseract_->prev_word_best_choice_); 00840 return 0; // Empty page. 00841 } 00842 00843 tesseract_->SetBlackAndWhitelist(); 00844 recognition_done_ = true; 00845 if (tesseract_->tessedit_resegment_from_line_boxes) { 00846 page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_); 00847 } else if (tesseract_->tessedit_resegment_from_boxes) { 00848 page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_); 00849 } else { 00850 // TODO(rays) LSTM here. 00851 page_res_ = new PAGE_RES(false, 00852 block_list_, &tesseract_->prev_word_best_choice_); 00853 } 00854 if (tesseract_->tessedit_make_boxes_from_boxes) { 00855 tesseract_->CorrectClassifyWords(page_res_); 00856 return 0; 00857 } 00858 00859 if (truth_cb_ != NULL) { 00860 tesseract_->wordrec_run_blamer.set_value(true); 00861 PageIterator *page_it = new PageIterator( 00862 page_res_, tesseract_, thresholder_->GetScaleFactor(), 00863 thresholder_->GetScaledYResolution(), 00864 rect_left_, rect_top_, rect_width_, rect_height_); 00865 truth_cb_->Run(tesseract_->getDict().getUnicharset(), 00866 image_height_, page_it, this->tesseract()->pix_grey()); 00867 delete page_it; 00868 } 00869 00870 int result = 0; 00871 if (tesseract_->interactive_display_mode) { 00872 #ifndef GRAPHICS_DISABLED 00873 tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_); 00874 #endif // GRAPHICS_DISABLED 00875 // The page_res is invalid after an interactive session, so cleanup 00876 // in a way that lets us continue to the next page without crashing. 00877 delete page_res_; 00878 page_res_ = NULL; 00879 return -1; 00880 } else if (tesseract_->tessedit_train_from_boxes) { 00881 STRING fontname; 00882 ExtractFontName(*output_file_, &fontname); 00883 tesseract_->ApplyBoxTraining(fontname, page_res_); 00884 } else if (tesseract_->tessedit_ambigs_training) { 00885 FILE *training_output_file = tesseract_->init_recog_training(*input_file_); 00886 // OCR the page segmented into words by tesseract. 00887 tesseract_->recog_training_segmented( 00888 *input_file_, page_res_, monitor, training_output_file); 00889 fclose(training_output_file); 00890 } else { 00891 // Now run the main recognition. 00892 bool wait_for_text = true; 00893 GetBoolVariable("paragraph_text_based", &wait_for_text); 00894 if (!wait_for_text) DetectParagraphs(false); 00895 if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) { 00896 if (wait_for_text) DetectParagraphs(true); 00897 } else { 00898 result = -1; 00899 } 00900 } 00901 return result; 00902 } 00903 00905 int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) { 00906 if (tesseract_ == NULL) 00907 return -1; 00908 if (thresholder_ == NULL || thresholder_->IsEmpty()) { 00909 tprintf("Please call SetImage before attempting recognition."); 00910 return -1; 00911 } 00912 if (page_res_ != NULL) 00913 ClearResults(); 00914 if (FindLines() != 0) 00915 return -1; 00916 // Additional conditions under which chopper test cannot be run 00917 if (tesseract_->interactive_display_mode) return -1; 00918 00919 recognition_done_ = true; 00920 00921 page_res_ = new PAGE_RES(false, block_list_, 00922 &(tesseract_->prev_word_best_choice_)); 00923 00924 PAGE_RES_IT page_res_it(page_res_); 00925 00926 while (page_res_it.word() != NULL) { 00927 WERD_RES *word_res = page_res_it.word(); 00928 GenericVector<TBOX> boxes; 00929 tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block, 00930 page_res_it.row()->row, word_res); 00931 page_res_it.forward(); 00932 } 00933 return 0; 00934 } 00935 00936 void TessBaseAPI::SetInputImage(Pix *pix) { 00937 if (input_image_) 00938 pixDestroy(&input_image_); 00939 input_image_ = NULL; 00940 if (pix) 00941 input_image_ = pixCopy(NULL, pix); 00942 } 00943 00944 Pix* TessBaseAPI::GetInputImage() { 00945 return input_image_; 00946 } 00947 00948 const char * TessBaseAPI::GetInputName() { 00949 if (input_file_) 00950 return input_file_->c_str(); 00951 return NULL; 00952 } 00953 00954 const char * TessBaseAPI::GetDatapath() { 00955 return tesseract_->datadir.c_str(); 00956 } 00957 00958 int TessBaseAPI::GetSourceYResolution() { 00959 return thresholder_->GetSourceYResolution(); 00960 } 00961 00962 // If flist exists, get data from there. Otherwise get data from buf. 00963 // Seems convoluted, but is the easiest way I know of to meet multiple 00964 // goals. Support streaming from stdin, and also work on platforms 00965 // lacking fmemopen. 00966 bool TessBaseAPI::ProcessPagesFileList(FILE *flist, 00967 STRING *buf, 00968 const char* retry_config, 00969 int timeout_millisec, 00970 TessResultRenderer* renderer, 00971 int tessedit_page_number) { 00972 if (!flist && !buf) return false; 00973 int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; 00974 char pagename[MAX_PATH]; 00975 00976 GenericVector<STRING> lines; 00977 if (!flist) { 00978 buf->split('\n', &lines); 00979 if (lines.empty()) return false; 00980 } 00981 00982 // Skip to the requested page number. 00983 for (int i = 0; i < page; i++) { 00984 if (flist) { 00985 if (fgets(pagename, sizeof(pagename), flist) == NULL) break; 00986 } 00987 } 00988 00989 // Begin producing output 00990 const char* kUnknownTitle = ""; 00991 if (renderer && !renderer->BeginDocument(kUnknownTitle)) { 00992 return false; 00993 } 00994 00995 // Loop over all pages - or just the requested one 00996 while (true) { 00997 if (flist) { 00998 if (fgets(pagename, sizeof(pagename), flist) == NULL) break; 00999 } else { 01000 if (page >= lines.size()) break; 01001 snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str()); 01002 } 01003 chomp_string(pagename); 01004 Pix *pix = pixRead(pagename); 01005 if (pix == NULL) { 01006 tprintf("Image file %s cannot be read!\n", pagename); 01007 return false; 01008 } 01009 tprintf("Page %d : %s\n", page, pagename); 01010 bool r = ProcessPage(pix, page, pagename, retry_config, 01011 timeout_millisec, renderer); 01012 pixDestroy(&pix); 01013 if (!r) return false; 01014 if (tessedit_page_number >= 0) break; 01015 ++page; 01016 } 01017 01018 // Finish producing output 01019 if (renderer && !renderer->EndDocument()) { 01020 return false; 01021 } 01022 return true; 01023 } 01024 01025 bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, 01026 size_t size, 01027 const char* filename, 01028 const char* retry_config, 01029 int timeout_millisec, 01030 TessResultRenderer* renderer, 01031 int tessedit_page_number) { 01032 #ifndef ANDROID_BUILD 01033 Pix *pix = NULL; 01034 #ifdef USE_OPENCL 01035 OpenclDevice od; 01036 #endif // USE_OPENCL 01037 int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; 01038 for (; ; ++page) { 01039 if (tessedit_page_number >= 0) 01040 page = tessedit_page_number; 01041 #ifdef USE_OPENCL 01042 if ( od.selectedDeviceIsOpenCL() ) { 01043 // FIXME(jbreiden) Not implemented. 01044 pix = od.pixReadMemTiffCl(data, size, page); 01045 } else { 01046 #endif // USE_OPENCL 01047 pix = pixReadMemTiff(data, size, page); 01048 #ifdef USE_OPENCL 01049 } 01050 #endif // USE_OPENCL 01051 if (pix == NULL) break; 01052 tprintf("Page %d\n", page + 1); 01053 char page_str[kMaxIntSize]; 01054 snprintf(page_str, kMaxIntSize - 1, "%d", page); 01055 SetVariable("applybox_page", page_str); 01056 bool r = ProcessPage(pix, page, filename, retry_config, 01057 timeout_millisec, renderer); 01058 pixDestroy(&pix); 01059 if (!r) return false; 01060 if (tessedit_page_number >= 0) break; 01061 } 01062 return true; 01063 #else 01064 return false; 01065 #endif 01066 } 01067 01068 // Master ProcessPages calls ProcessPagesInternal and then does any post- 01069 // processing required due to being in a training mode. 01070 bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config, 01071 int timeout_millisec, 01072 TessResultRenderer* renderer) { 01073 bool result = 01074 ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer); 01075 if (result) { 01076 if (tesseract_->tessedit_train_from_boxes && 01077 !tesseract_->WriteTRFile(*output_file_)) { 01078 tprintf("Write of TR file failed: %s\n", output_file_->string()); 01079 return false; 01080 } 01081 } 01082 return result; 01083 } 01084 01085 // In the ideal scenario, Tesseract will start working on data as soon 01086 // as it can. For example, if you steam a filelist through stdin, we 01087 // should start the OCR process as soon as the first filename is 01088 // available. This is particularly useful when hooking Tesseract up to 01089 // slow hardware such as a book scanning machine. 01090 // 01091 // Unfortunately there are tradeoffs. You can't seek on stdin. That 01092 // makes automatic detection of datatype (TIFF? filelist? PNG?) 01093 // impractical. So we support a command line flag to explicitly 01094 // identify the scenario that really matters: filelists on 01095 // stdin. We'll still do our best if the user likes pipes. That means 01096 // piling up any data coming into stdin into a memory buffer. 01097 bool TessBaseAPI::ProcessPagesInternal(const char* filename, 01098 const char* retry_config, 01099 int timeout_millisec, 01100 TessResultRenderer* renderer) { 01101 #ifndef ANDROID_BUILD 01102 PERF_COUNT_START("ProcessPages") 01103 bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-"); 01104 if (stdInput) { 01105 #ifdef WIN32 01106 if (_setmode(_fileno(stdin), _O_BINARY) == -1) 01107 tprintf("ERROR: cin to binary: %s", strerror(errno)); 01108 #endif // WIN32 01109 } 01110 01111 if (stream_filelist) { 01112 return ProcessPagesFileList(stdin, NULL, retry_config, 01113 timeout_millisec, renderer, 01114 tesseract_->tessedit_page_number); 01115 } 01116 01117 // At this point we are officially in autodection territory. 01118 // That means we are going to buffer stdin so that it is 01119 // seekable. To keep code simple we will also buffer data 01120 // coming from a file. 01121 std::string buf; 01122 if (stdInput) { 01123 buf.assign((std::istreambuf_iterator<char>(std::cin)), 01124 (std::istreambuf_iterator<char>())); 01125 } else { 01126 std::ifstream ifs(filename, std::ios::binary); 01127 if (ifs) { 01128 buf.assign((std::istreambuf_iterator<char>(ifs)), 01129 (std::istreambuf_iterator<char>())); 01130 } else { 01131 tprintf("ERROR: Can not open input file %s\n", filename); 01132 return false; 01133 } 01134 } 01135 01136 // Here is our autodetection 01137 int format; 01138 const l_uint8 * data = reinterpret_cast<const l_uint8 *>(buf.c_str()); 01139 findFileFormatBuffer(data, &format); 01140 01141 // Maybe we have a filelist 01142 if (format == IFF_UNKNOWN) { 01143 STRING s(buf.c_str()); 01144 return ProcessPagesFileList(NULL, &s, retry_config, 01145 timeout_millisec, renderer, 01146 tesseract_->tessedit_page_number); 01147 } 01148 01149 // Maybe we have a TIFF which is potentially multipage 01150 bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || 01151 format == IFF_TIFF_RLE || format == IFF_TIFF_G3 || 01152 format == IFF_TIFF_G4 || format == IFF_TIFF_LZW || 01153 format == IFF_TIFF_ZIP); 01154 01155 // Fail early if we can, before producing any output 01156 Pix *pix = NULL; 01157 if (!tiff) { 01158 pix = pixReadMem(data, buf.size()); 01159 if (pix == NULL) { 01160 return false; 01161 } 01162 } 01163 01164 // Begin the output 01165 const char* kUnknownTitle = ""; 01166 if (renderer && !renderer->BeginDocument(kUnknownTitle)) { 01167 pixDestroy(&pix); 01168 return false; 01169 } 01170 01171 // Produce output 01172 bool r = false; 01173 if (tiff) { 01174 r = ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, 01175 timeout_millisec, renderer, 01176 tesseract_->tessedit_page_number); 01177 } else { 01178 r = ProcessPage(pix, 0, filename, retry_config, 01179 timeout_millisec, renderer); 01180 pixDestroy(&pix); 01181 } 01182 01183 // End the output 01184 if (!r || (renderer && !renderer->EndDocument())) { 01185 return false; 01186 } 01187 PERF_COUNT_END 01188 return true; 01189 #else 01190 return false; 01191 #endif 01192 } 01193 01194 bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename, 01195 const char* retry_config, int timeout_millisec, 01196 TessResultRenderer* renderer) { 01197 PERF_COUNT_START("ProcessPage") 01198 SetInputName(filename); 01199 SetImage(pix); 01200 bool failed = false; 01201 01202 if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) { 01203 // Disabled character recognition 01204 PageIterator* it = AnalyseLayout(); 01205 01206 if (it == NULL) { 01207 failed = true; 01208 } else { 01209 delete it; 01210 } 01211 } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) { 01212 failed = FindLines() != 0; 01213 } else if (timeout_millisec > 0) { 01214 // Running with a timeout. 01215 ETEXT_DESC monitor; 01216 monitor.cancel = NULL; 01217 monitor.cancel_this = NULL; 01218 monitor.set_deadline_msecs(timeout_millisec); 01219 01220 // Now run the main recognition. 01221 failed = Recognize(&monitor) < 0; 01222 } else { 01223 // Normal layout and character recognition with no timeout. 01224 failed = Recognize(NULL) < 0; 01225 } 01226 01227 if (tesseract_->tessedit_write_images) { 01228 #ifndef ANDROID_BUILD 01229 Pix* page_pix = GetThresholdedImage(); 01230 pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4); 01231 #endif // ANDROID_BUILD 01232 } 01233 01234 if (failed && retry_config != NULL && retry_config[0] != '\0') { 01235 // Save current config variables before switching modes. 01236 FILE* fp = fopen(kOldVarsFile, "wb"); 01237 PrintVariables(fp); 01238 fclose(fp); 01239 // Switch to alternate mode for retry. 01240 ReadConfigFile(retry_config); 01241 SetImage(pix); 01242 Recognize(NULL); 01243 // Restore saved config variables. 01244 ReadConfigFile(kOldVarsFile); 01245 } 01246 01247 if (renderer && !failed) { 01248 failed = !renderer->AddImage(this); 01249 } 01250 01251 PERF_COUNT_END 01252 return !failed; 01253 } 01254 01259 LTRResultIterator* TessBaseAPI::GetLTRIterator() { 01260 if (tesseract_ == NULL || page_res_ == NULL) 01261 return NULL; 01262 return new LTRResultIterator( 01263 page_res_, tesseract_, 01264 thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(), 01265 rect_left_, rect_top_, rect_width_, rect_height_); 01266 } 01267 01276 ResultIterator* TessBaseAPI::GetIterator() { 01277 if (tesseract_ == NULL || page_res_ == NULL) 01278 return NULL; 01279 return ResultIterator::StartOfParagraph(LTRResultIterator( 01280 page_res_, tesseract_, 01281 thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(), 01282 rect_left_, rect_top_, rect_width_, rect_height_)); 01283 } 01284 01293 MutableIterator* TessBaseAPI::GetMutableIterator() { 01294 if (tesseract_ == NULL || page_res_ == NULL) 01295 return NULL; 01296 return new MutableIterator(page_res_, tesseract_, 01297 thresholder_->GetScaleFactor(), 01298 thresholder_->GetScaledYResolution(), 01299 rect_left_, rect_top_, rect_width_, rect_height_); 01300 } 01301 01303 char* TessBaseAPI::GetUTF8Text() { 01304 if (tesseract_ == NULL || 01305 (!recognition_done_ && Recognize(NULL) < 0)) 01306 return NULL; 01307 STRING text(""); 01308 ResultIterator *it = GetIterator(); 01309 do { 01310 if (it->Empty(RIL_PARA)) continue; 01311 char *para_text = it->GetUTF8Text(RIL_PARA); 01312 text += para_text; 01313 delete []para_text; 01314 } while (it->Next(RIL_PARA)); 01315 char* result = new char[text.length() + 1]; 01316 strncpy(result, text.string(), text.length() + 1); 01317 delete it; 01318 return result; 01319 } 01320 01324 static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) { 01325 tesseract::Orientation orientation; 01326 tesseract::WritingDirection writing_direction; 01327 tesseract::TextlineOrder textline_order; 01328 float deskew_angle; 01329 it->Orientation(&orientation, &writing_direction, &textline_order, 01330 &deskew_angle); 01331 return orientation; 01332 } 01333 01342 static void AddBaselineCoordsTohOCR(const PageIterator *it, 01343 PageIteratorLevel level, 01344 STRING* hocr_str) { 01345 tesseract::Orientation orientation = GetBlockTextOrientation(it); 01346 if (orientation != ORIENTATION_PAGE_UP) { 01347 hocr_str->add_str_int("; textangle ", 360 - orientation * 90); 01348 return; 01349 } 01350 01351 int left, top, right, bottom; 01352 it->BoundingBox(level, &left, &top, &right, &bottom); 01353 01354 // Try to get the baseline coordinates at this level. 01355 int x1, y1, x2, y2; 01356 if (!it->Baseline(level, &x1, &y1, &x2, &y2)) 01357 return; 01358 // Following the description of this field of the hOCR spec, we convert the 01359 // baseline coordinates so that "the bottom left of the bounding box is the 01360 // origin". 01361 x1 -= left; 01362 x2 -= left; 01363 y1 -= bottom; 01364 y2 -= bottom; 01365 01366 // Now fit a line through the points so we can extract coefficients for the 01367 // equation: y = p1 x + p0 01368 double p1 = 0; 01369 double p0 = 0; 01370 if (x1 == x2) { 01371 // Problem computing the polynomial coefficients. 01372 return; 01373 } 01374 p1 = (y2 - y1) / static_cast<double>(x2 - x1); 01375 p0 = y1 - static_cast<double>(p1 * x1); 01376 01377 hocr_str->add_str_double("; baseline ", round(p1 * 1000.0) / 1000.0); 01378 hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0); 01379 } 01380 01381 static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, int num2) { 01382 unsigned long bufsize = base.length() + 2 * kMaxIntSize; 01383 char id_buffer[bufsize]; 01384 if (num2 >= 0) { 01385 snprintf(id_buffer, bufsize - 1, "%s_%d_%d", base.c_str(), num1, num2); 01386 } else { 01387 snprintf(id_buffer, bufsize - 1, "%s_%d", base.c_str(), num1); 01388 } 01389 id_buffer[bufsize - 1] = '\0'; 01390 *hocr_str += " id='"; 01391 *hocr_str += id_buffer; 01392 *hocr_str += "'"; 01393 } 01394 01395 static void AddBoxTohOCR(const ResultIterator *it, 01396 PageIteratorLevel level, 01397 STRING* hocr_str) { 01398 int left, top, right, bottom; 01399 it->BoundingBox(level, &left, &top, &right, &bottom); 01400 // This is the only place we use double quotes instead of single quotes, 01401 // but it may too late to change for consistency 01402 hocr_str->add_str_int(" title=\"bbox ", left); 01403 hocr_str->add_str_int(" ", top); 01404 hocr_str->add_str_int(" ", right); 01405 hocr_str->add_str_int(" ", bottom); 01406 // Add baseline coordinates & heights for textlines only. 01407 if (level == RIL_TEXTLINE) { 01408 AddBaselineCoordsTohOCR(it, level, hocr_str); 01409 // add custom height measures 01410 float row_height, descenders, ascenders; // row attributes 01411 it->RowAttributes(&row_height, &descenders, &ascenders); 01412 // TODO: Do we want to limit these to a single decimal place? 01413 hocr_str->add_str_double("; x_size ", row_height); 01414 hocr_str->add_str_double("; x_descenders ", descenders * -1); 01415 hocr_str->add_str_double("; x_ascenders ", ascenders); 01416 } 01417 *hocr_str += "\">"; 01418 } 01419 01428 char* TessBaseAPI::GetHOCRText(int page_number) { 01429 if (tesseract_ == NULL || 01430 (page_res_ == NULL && Recognize(NULL) < 0)) 01431 return NULL; 01432 01433 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; 01434 int page_id = page_number + 1; // hOCR uses 1-based page numbers. 01435 bool font_info = false; 01436 GetBoolVariable("hocr_font_info", &font_info); 01437 01438 STRING hocr_str(""); 01439 01440 if (input_file_ == NULL) 01441 SetInputName(NULL); 01442 01443 #ifdef _WIN32 01444 // convert input name from ANSI encoding to utf-8 01445 int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, 01446 NULL, 0); 01447 wchar_t *uni16_str = new WCHAR[str16_len]; 01448 str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, 01449 uni16_str, str16_len); 01450 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL, 01451 0, NULL, NULL); 01452 char *utf8_str = new char[utf8_len]; 01453 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, 01454 utf8_len, NULL, NULL); 01455 *input_file_ = utf8_str; 01456 delete[] uni16_str; 01457 delete[] utf8_str; 01458 #endif 01459 01460 hocr_str += " <div class='ocr_page'"; 01461 AddIdTohOCR(&hocr_str, "page", page_id, -1); 01462 hocr_str += " title='image \""; 01463 if (input_file_) { 01464 hocr_str += HOcrEscape(input_file_->string()); 01465 } else { 01466 hocr_str += "unknown"; 01467 } 01468 hocr_str.add_str_int("\"; bbox ", rect_left_); 01469 hocr_str.add_str_int(" ", rect_top_); 01470 hocr_str.add_str_int(" ", rect_width_); 01471 hocr_str.add_str_int(" ", rect_height_); 01472 hocr_str.add_str_int("; ppageno ", page_number); 01473 hocr_str += "'>\n"; 01474 01475 ResultIterator *res_it = GetIterator(); 01476 while (!res_it->Empty(RIL_BLOCK)) { 01477 if (res_it->Empty(RIL_WORD)) { 01478 res_it->Next(RIL_WORD); 01479 continue; 01480 } 01481 01482 // Open any new block/paragraph/textline. 01483 if (res_it->IsAtBeginningOf(RIL_BLOCK)) { 01484 hocr_str += " <div class='ocr_carea'"; 01485 AddIdTohOCR(&hocr_str, "block", page_id, bcnt); 01486 AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str); 01487 } 01488 if (res_it->IsAtBeginningOf(RIL_PARA)) { 01489 hocr_str += "\n <p class='ocr_par'"; 01490 if (res_it->ParagraphIsLtr()) { 01491 hocr_str += " dir='ltr'"; 01492 } else { 01493 hocr_str += " dir='rtl'"; 01494 } 01495 AddIdTohOCR(&hocr_str, "par", page_id, pcnt); 01496 AddBoxTohOCR(res_it, RIL_PARA, &hocr_str); 01497 } 01498 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { 01499 hocr_str += "\n <span class='ocr_line'"; 01500 AddIdTohOCR(&hocr_str, "line", page_id, lcnt); 01501 AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str); 01502 } 01503 01504 // Now, process the word... 01505 hocr_str += "<span class='ocrx_word'"; 01506 AddIdTohOCR(&hocr_str, "word", page_id, wcnt); 01507 int left, top, right, bottom; 01508 bool bold, italic, underlined, monospace, serif, smallcaps; 01509 int pointsize, font_id; 01510 const char *font_name; 01511 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); 01512 font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, 01513 &monospace, &serif, &smallcaps, 01514 &pointsize, &font_id); 01515 hocr_str.add_str_int(" title='bbox ", left); 01516 hocr_str.add_str_int(" ", top); 01517 hocr_str.add_str_int(" ", right); 01518 hocr_str.add_str_int(" ", bottom); 01519 hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD)); 01520 if (font_info) { 01521 if (font_name) { 01522 hocr_str += "; x_font "; 01523 hocr_str += HOcrEscape(font_name); 01524 } 01525 hocr_str.add_str_int("; x_fsize ", pointsize); 01526 } 01527 hocr_str += "'"; 01528 if (res_it->WordRecognitionLanguage()) { 01529 hocr_str += " lang='"; 01530 hocr_str += res_it->WordRecognitionLanguage(); 01531 hocr_str += "'"; 01532 } 01533 switch (res_it->WordDirection()) { 01534 case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break; 01535 case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break; 01536 default: // Do nothing. 01537 break; 01538 } 01539 hocr_str += ">"; 01540 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); 01541 bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); 01542 bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); 01543 if (bold) hocr_str += "<strong>"; 01544 if (italic) hocr_str += "<em>"; 01545 do { 01546 const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); 01547 if (grapheme && grapheme[0] != 0) { 01548 hocr_str += HOcrEscape(grapheme); 01549 } 01550 delete []grapheme; 01551 res_it->Next(RIL_SYMBOL); 01552 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); 01553 if (italic) hocr_str += "</em>"; 01554 if (bold) hocr_str += "</strong>"; 01555 hocr_str += "</span> "; 01556 wcnt++; 01557 // Close any ending block/paragraph/textline. 01558 if (last_word_in_line) { 01559 hocr_str += "\n </span>"; 01560 lcnt++; 01561 } 01562 if (last_word_in_para) { 01563 hocr_str += "\n </p>\n"; 01564 pcnt++; 01565 } 01566 if (last_word_in_block) { 01567 hocr_str += " </div>\n"; 01568 bcnt++; 01569 } 01570 } 01571 hocr_str += " </div>\n"; 01572 01573 char *ret = new char[hocr_str.length() + 1]; 01574 strcpy(ret, hocr_str.string()); 01575 delete res_it; 01576 return ret; 01577 } 01578 01580 const int kNumbersPerBlob = 5; 01585 const int kBytesPerNumber = 5; 01591 const int kBytesPerBlob = kNumbersPerBlob * (kBytesPerNumber + 1) + 1; 01592 const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1; 01594 const int kBytesPer64BitNumber = 20; 01601 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + 01602 UNICHAR_LEN; 01603 01609 char* TessBaseAPI::GetBoxText(int page_number) { 01610 if (tesseract_ == NULL || 01611 (!recognition_done_ && Recognize(NULL) < 0)) 01612 return NULL; 01613 int blob_count; 01614 int utf8_length = TextLength(&blob_count); 01615 int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + 01616 kMaxBytesPerLine; 01617 char* result = new char[total_length]; 01618 strcpy(result, "\0"); 01619 int output_length = 0; 01620 LTRResultIterator* it = GetLTRIterator(); 01621 do { 01622 int left, top, right, bottom; 01623 if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) { 01624 char* text = it->GetUTF8Text(RIL_SYMBOL); 01625 // Tesseract uses space for recognition failure. Fix to a reject 01626 // character, kTesseractReject so we don't create illegal box files. 01627 for (int i = 0; text[i] != '\0'; ++i) { 01628 if (text[i] == ' ') 01629 text[i] = kTesseractReject; 01630 } 01631 snprintf(result + output_length, total_length - output_length, 01632 "%s %d %d %d %d %d\n", 01633 text, left, image_height_ - bottom, 01634 right, image_height_ - top, page_number); 01635 output_length += strlen(result + output_length); 01636 delete [] text; 01637 // Just in case... 01638 if (output_length + kMaxBytesPerLine > total_length) 01639 break; 01640 } 01641 } while (it->Next(RIL_SYMBOL)); 01642 delete it; 01643 return result; 01644 } 01645 01651 const int kUniChs[] = { 01652 0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0 01653 }; 01655 const int kLatinChs[] = { 01656 0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0 01657 }; 01658 01664 char* TessBaseAPI::GetUNLVText() { 01665 if (tesseract_ == NULL || 01666 (!recognition_done_ && Recognize(NULL) < 0)) 01667 return NULL; 01668 bool tilde_crunch_written = false; 01669 bool last_char_was_newline = true; 01670 bool last_char_was_tilde = false; 01671 01672 int total_length = TextLength(NULL); 01673 PAGE_RES_IT page_res_it(page_res_); 01674 char* result = new char[total_length]; 01675 char* ptr = result; 01676 for (page_res_it.restart_page(); page_res_it.word () != NULL; 01677 page_res_it.forward()) { 01678 WERD_RES *word = page_res_it.word(); 01679 // Process the current word. 01680 if (word->unlv_crunch_mode != CR_NONE) { 01681 if (word->unlv_crunch_mode != CR_DELETE && 01682 (!tilde_crunch_written || 01683 (word->unlv_crunch_mode == CR_KEEP_SPACE && 01684 word->word->space() > 0 && 01685 !word->word->flag(W_FUZZY_NON) && 01686 !word->word->flag(W_FUZZY_SP)))) { 01687 if (!word->word->flag(W_BOL) && 01688 word->word->space() > 0 && 01689 !word->word->flag(W_FUZZY_NON) && 01690 !word->word->flag(W_FUZZY_SP)) { 01691 /* Write a space to separate from preceding good text */ 01692 *ptr++ = ' '; 01693 last_char_was_tilde = false; 01694 } 01695 if (!last_char_was_tilde) { 01696 // Write a reject char. 01697 last_char_was_tilde = true; 01698 *ptr++ = kUNLVReject; 01699 tilde_crunch_written = true; 01700 last_char_was_newline = false; 01701 } 01702 } 01703 } else { 01704 // NORMAL PROCESSING of non tilde crunched words. 01705 tilde_crunch_written = false; 01706 tesseract_->set_unlv_suspects(word); 01707 const char* wordstr = word->best_choice->unichar_string().string(); 01708 const STRING& lengths = word->best_choice->unichar_lengths(); 01709 int length = lengths.length(); 01710 int i = 0; 01711 int offset = 0; 01712 01713 if (last_char_was_tilde && 01714 word->word->space() == 0 && wordstr[offset] == ' ') { 01715 // Prevent adjacent tilde across words - we know that adjacent tildes 01716 // within words have been removed. 01717 // Skip the first character. 01718 offset = lengths[i++]; 01719 } 01720 if (i < length && wordstr[offset] != 0) { 01721 if (!last_char_was_newline) 01722 *ptr++ = ' '; 01723 else 01724 last_char_was_newline = false; 01725 for (; i < length; offset += lengths[i++]) { 01726 if (wordstr[offset] == ' ' || 01727 wordstr[offset] == kTesseractReject) { 01728 *ptr++ = kUNLVReject; 01729 last_char_was_tilde = true; 01730 } else { 01731 if (word->reject_map[i].rejected()) 01732 *ptr++ = kUNLVSuspect; 01733 UNICHAR ch(wordstr + offset, lengths[i]); 01734 int uni_ch = ch.first_uni(); 01735 for (int j = 0; kUniChs[j] != 0; ++j) { 01736 if (kUniChs[j] == uni_ch) { 01737 uni_ch = kLatinChs[j]; 01738 break; 01739 } 01740 } 01741 if (uni_ch <= 0xff) { 01742 *ptr++ = static_cast<char>(uni_ch); 01743 last_char_was_tilde = false; 01744 } else { 01745 *ptr++ = kUNLVReject; 01746 last_char_was_tilde = true; 01747 } 01748 } 01749 } 01750 } 01751 } 01752 if (word->word->flag(W_EOL) && !last_char_was_newline) { 01753 /* Add a new line output */ 01754 *ptr++ = '\n'; 01755 tilde_crunch_written = false; 01756 last_char_was_newline = true; 01757 last_char_was_tilde = false; 01758 } 01759 } 01760 *ptr++ = '\n'; 01761 *ptr = '\0'; 01762 return result; 01763 } 01764 01770 char* TessBaseAPI::GetOsdText(int page_number) { 01771 OSResults osr; 01772 01773 bool osd = DetectOS(&osr); 01774 if (!osd) { 01775 return NULL; 01776 } 01777 01778 int orient_id = osr.best_result.orientation_id; 01779 int script_id = osr.get_best_script(orient_id); 01780 float orient_conf = osr.best_result.oconfidence; 01781 float script_conf = osr.best_result.sconfidence; 01782 const char* script_name = 01783 osr.unicharset->get_script_from_script_id(script_id); 01784 01785 // clockwise orientation of the input image, in degrees 01786 int orient_deg = orient_id * 90; 01787 01788 // clockwise rotation needed to make the page upright 01789 int rotate = OrientationIdToValue(orient_id); 01790 01791 char* osd_buf = new char[255]; 01792 snprintf(osd_buf, 255, 01793 "Page number: %d\n" 01794 "Orientation in degrees: %d\n" 01795 "Rotate: %d\n" 01796 "Orientation confidence: %.2f\n" 01797 "Script: %s\n" 01798 "Script confidence: %.2f\n", 01799 page_number, 01800 orient_deg, rotate, orient_conf, 01801 script_name, script_conf); 01802 01803 return osd_buf; 01804 } 01805 01807 int TessBaseAPI::MeanTextConf() { 01808 int* conf = AllWordConfidences(); 01809 if (!conf) return 0; 01810 int sum = 0; 01811 int *pt = conf; 01812 while (*pt >= 0) sum += *pt++; 01813 if (pt != conf) sum /= pt - conf; 01814 delete [] conf; 01815 return sum; 01816 } 01817 01819 int* TessBaseAPI::AllWordConfidences() { 01820 if (tesseract_ == NULL || 01821 (!recognition_done_ && Recognize(NULL) < 0)) 01822 return NULL; 01823 int n_word = 0; 01824 PAGE_RES_IT res_it(page_res_); 01825 for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) 01826 n_word++; 01827 01828 int* conf = new int[n_word+1]; 01829 n_word = 0; 01830 for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) { 01831 WERD_RES *word = res_it.word(); 01832 WERD_CHOICE* choice = word->best_choice; 01833 int w_conf = static_cast<int>(100 + 5 * choice->certainty()); 01834 // This is the eq for converting Tesseract confidence to 1..100 01835 if (w_conf < 0) w_conf = 0; 01836 if (w_conf > 100) w_conf = 100; 01837 conf[n_word++] = w_conf; 01838 } 01839 conf[n_word] = -1; 01840 return conf; 01841 } 01842 01853 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) { 01854 int debug = 0; 01855 GetIntVariable("applybox_debug", &debug); 01856 bool success = true; 01857 PageSegMode current_psm = GetPageSegMode(); 01858 SetPageSegMode(mode); 01859 SetVariable("classify_enable_learning", "0"); 01860 char* text = GetUTF8Text(); 01861 if (debug) { 01862 tprintf("Trying to adapt \"%s\" to \"%s\"\n", text, wordstr); 01863 } 01864 if (text != NULL) { 01865 PAGE_RES_IT it(page_res_); 01866 WERD_RES* word_res = it.word(); 01867 if (word_res != NULL) { 01868 word_res->word->set_text(wordstr); 01869 } else { 01870 success = false; 01871 } 01872 // Check to see if text matches wordstr. 01873 int w = 0; 01874 int t = 0; 01875 for (t = 0; text[t] != '\0'; ++t) { 01876 if (text[t] == '\n' || text[t] == ' ') 01877 continue; 01878 while (wordstr[w] != '\0' && wordstr[w] == ' ') 01879 ++w; 01880 if (text[t] != wordstr[w]) 01881 break; 01882 ++w; 01883 } 01884 if (text[t] != '\0' || wordstr[w] != '\0') { 01885 // No match. 01886 delete page_res_; 01887 GenericVector<TBOX> boxes; 01888 page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_); 01889 tesseract_->ReSegmentByClassification(page_res_); 01890 tesseract_->TidyUp(page_res_); 01891 PAGE_RES_IT pr_it(page_res_); 01892 if (pr_it.word() == NULL) 01893 success = false; 01894 else 01895 word_res = pr_it.word(); 01896 } else { 01897 word_res->BestChoiceToCorrectText(); 01898 } 01899 if (success) { 01900 tesseract_->EnableLearning = true; 01901 tesseract_->LearnWord(NULL, word_res); 01902 } 01903 delete [] text; 01904 } else { 01905 success = false; 01906 } 01907 SetPageSegMode(current_psm); 01908 return success; 01909 } 01910 01917 void TessBaseAPI::Clear() { 01918 if (thresholder_ != NULL) 01919 thresholder_->Clear(); 01920 ClearResults(); 01921 SetInputImage(NULL); 01922 } 01923 01930 void TessBaseAPI::End() { 01931 if (thresholder_ != NULL) { 01932 delete thresholder_; 01933 thresholder_ = NULL; 01934 } 01935 if (page_res_ != NULL) { 01936 delete page_res_; 01937 page_res_ = NULL; 01938 } 01939 if (block_list_ != NULL) { 01940 delete block_list_; 01941 block_list_ = NULL; 01942 } 01943 if (paragraph_models_ != NULL) { 01944 paragraph_models_->delete_data_pointers(); 01945 delete paragraph_models_; 01946 paragraph_models_ = NULL; 01947 } 01948 if (tesseract_ != NULL) { 01949 delete tesseract_; 01950 if (osd_tesseract_ == tesseract_) 01951 osd_tesseract_ = NULL; 01952 tesseract_ = NULL; 01953 } 01954 if (osd_tesseract_ != NULL) { 01955 delete osd_tesseract_; 01956 osd_tesseract_ = NULL; 01957 } 01958 if (equ_detect_ != NULL) { 01959 delete equ_detect_; 01960 equ_detect_ = NULL; 01961 } 01962 if (input_file_ != NULL) { 01963 delete input_file_; 01964 input_file_ = NULL; 01965 } 01966 if (input_image_ != NULL) { 01967 pixDestroy(&input_image_); 01968 input_image_ = NULL; 01969 } 01970 if (output_file_ != NULL) { 01971 delete output_file_; 01972 output_file_ = NULL; 01973 } 01974 if (datapath_ != NULL) { 01975 delete datapath_; 01976 datapath_ = NULL; 01977 } 01978 if (language_ != NULL) { 01979 delete language_; 01980 language_ = NULL; 01981 } 01982 } 01983 01984 // Clear any library-level memory caches. 01985 // There are a variety of expensive-to-load constant data structures (mostly 01986 // language dictionaries) that are cached globally -- surviving the Init() 01987 // and End() of individual TessBaseAPI's. This function allows the clearing 01988 // of these caches. 01989 void TessBaseAPI::ClearPersistentCache() { 01990 Dict::GlobalDawgCache()->DeleteUnusedDawgs(); 01991 } 01992 01997 int TessBaseAPI::IsValidWord(const char *word) { 01998 return tesseract_->getDict().valid_word(word); 01999 } 02000 // Returns true if utf8_character is defined in the UniCharset. 02001 bool TessBaseAPI::IsValidCharacter(const char *utf8_character) { 02002 return tesseract_->unicharset.contains_unichar(utf8_character); 02003 } 02004 02005 02006 // TODO(rays) Obsolete this function and replace with a more aptly named 02007 // function that returns image coordinates rather than tesseract coordinates. 02008 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) { 02009 PageIterator* it = AnalyseLayout(); 02010 if (it == NULL) { 02011 return false; 02012 } 02013 int x1, x2, y1, y2; 02014 it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2); 02015 // Calculate offset and slope (NOTE: Kind of ugly) 02016 if (x2 <= x1) x2 = x1 + 1; 02017 // Convert the point pair to slope/offset of the baseline (in image coords.) 02018 *out_slope = static_cast<float>(y2 - y1) / (x2 - x1); 02019 *out_offset = static_cast<int>(y1 - *out_slope * x1); 02020 // Get the y-coord of the baseline at the left and right edges of the 02021 // textline's bounding box. 02022 int left, top, right, bottom; 02023 if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) { 02024 delete it; 02025 return false; 02026 } 02027 int left_y = IntCastRounded(*out_slope * left + *out_offset); 02028 int right_y = IntCastRounded(*out_slope * right + *out_offset); 02029 // Shift the baseline down so it passes through the nearest bottom-corner 02030 // of the textline's bounding box. This is the difference between the y 02031 // at the lowest (max) edge of the box and the actual box bottom. 02032 *out_offset += bottom - MAX(left_y, right_y); 02033 // Switch back to bottom-up tesseract coordinates. Requires negation of 02034 // the slope and height - offset for the offset. 02035 *out_slope = -*out_slope; 02036 *out_offset = rect_height_ - *out_offset; 02037 delete it; 02038 02039 return true; 02040 } 02041 02043 void TessBaseAPI::SetDictFunc(DictFunc f) { 02044 if (tesseract_ != NULL) { 02045 tesseract_->getDict().letter_is_okay_ = f; 02046 } 02047 } 02048 02057 void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) { 02058 if (tesseract_ != NULL) { 02059 tesseract_->getDict().probability_in_context_ = f; 02060 // Set it for the sublangs too. 02061 int num_subs = tesseract_->num_sub_langs(); 02062 for (int i = 0; i < num_subs; ++i) { 02063 tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f; 02064 } 02065 } 02066 } 02067 02069 void TessBaseAPI::SetFillLatticeFunc(FillLatticeFunc f) { 02070 if (tesseract_ != NULL) tesseract_->fill_lattice_ = f; 02071 } 02072 02074 bool TessBaseAPI::InternalSetImage() { 02075 if (tesseract_ == NULL) { 02076 tprintf("Please call Init before attempting to set an image."); 02077 return false; 02078 } 02079 if (thresholder_ == NULL) 02080 thresholder_ = new ImageThresholder; 02081 ClearResults(); 02082 return true; 02083 } 02084 02091 void TessBaseAPI::Threshold(Pix** pix) { 02092 ASSERT_HOST(pix != NULL); 02093 if (*pix != NULL) 02094 pixDestroy(pix); 02095 // Zero resolution messes up the algorithms, so make sure it is credible. 02096 int y_res = thresholder_->GetScaledYResolution(); 02097 if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) { 02098 // Use the minimum default resolution, as it is safer to under-estimate 02099 // than over-estimate resolution. 02100 thresholder_->SetSourceYResolution(kMinCredibleResolution); 02101 } 02102 PageSegMode pageseg_mode = 02103 static_cast<PageSegMode>( 02104 static_cast<int>(tesseract_->tessedit_pageseg_mode)); 02105 thresholder_->ThresholdToPix(pageseg_mode, pix); 02106 thresholder_->GetImageSizes(&rect_left_, &rect_top_, 02107 &rect_width_, &rect_height_, 02108 &image_width_, &image_height_); 02109 if (!thresholder_->IsBinary()) { 02110 tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds()); 02111 tesseract_->set_pix_grey(thresholder_->GetPixRectGrey()); 02112 } else { 02113 tesseract_->set_pix_thresholds(NULL); 02114 tesseract_->set_pix_grey(NULL); 02115 } 02116 // Set the internal resolution that is used for layout parameters from the 02117 // estimated resolution, rather than the image resolution, which may be 02118 // fabricated, but we will use the image resolution, if there is one, to 02119 // report output point sizes. 02120 int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(), 02121 kMinCredibleResolution, 02122 kMaxCredibleResolution); 02123 if (estimated_res != thresholder_->GetScaledEstimatedResolution()) { 02124 tprintf("Estimated resolution %d out of range! Corrected to %d\n", 02125 thresholder_->GetScaledEstimatedResolution(), estimated_res); 02126 } 02127 tesseract_->set_source_resolution(estimated_res); 02128 SavePixForCrash(estimated_res, *pix); 02129 } 02130 02132 int TessBaseAPI::FindLines() { 02133 if (thresholder_ == NULL || thresholder_->IsEmpty()) { 02134 tprintf("Please call SetImage before attempting recognition."); 02135 return -1; 02136 } 02137 if (recognition_done_) 02138 ClearResults(); 02139 if (!block_list_->empty()) { 02140 return 0; 02141 } 02142 if (tesseract_ == NULL) { 02143 tesseract_ = new Tesseract; 02144 tesseract_->InitAdaptiveClassifier(false); 02145 } 02146 if (tesseract_->pix_binary() == NULL) 02147 Threshold(tesseract_->mutable_pix_binary()); 02148 if (tesseract_->ImageWidth() > MAX_INT16 || 02149 tesseract_->ImageHeight() > MAX_INT16) { 02150 tprintf("Image too large: (%d, %d)\n", 02151 tesseract_->ImageWidth(), tesseract_->ImageHeight()); 02152 return -1; 02153 } 02154 02155 tesseract_->PrepareForPageseg(); 02156 02157 if (tesseract_->textord_equation_detect) { 02158 if (equ_detect_ == NULL && datapath_ != NULL) { 02159 equ_detect_ = new EquationDetect(datapath_->string(), NULL); 02160 } 02161 tesseract_->SetEquationDetect(equ_detect_); 02162 } 02163 02164 Tesseract* osd_tess = osd_tesseract_; 02165 OSResults osr; 02166 if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == NULL) { 02167 if (strcmp(language_->string(), "osd") == 0) { 02168 osd_tess = tesseract_; 02169 } else { 02170 osd_tesseract_ = new Tesseract; 02171 if (osd_tesseract_->init_tesseract( 02172 datapath_->string(), NULL, "osd", OEM_TESSERACT_ONLY, 02173 NULL, 0, NULL, NULL, false) == 0) { 02174 osd_tess = osd_tesseract_; 02175 osd_tesseract_->set_source_resolution( 02176 thresholder_->GetSourceYResolution()); 02177 } else { 02178 tprintf("Warning: Auto orientation and script detection requested," 02179 " but osd language failed to load\n"); 02180 delete osd_tesseract_; 02181 osd_tesseract_ = NULL; 02182 } 02183 } 02184 } 02185 02186 if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0) 02187 return -1; 02188 // If Devanagari is being recognized, we use different images for page seg 02189 // and for OCR. 02190 tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr); 02191 return 0; 02192 } 02193 02195 void TessBaseAPI::ClearResults() { 02196 if (tesseract_ != NULL) { 02197 tesseract_->Clear(); 02198 } 02199 if (page_res_ != NULL) { 02200 delete page_res_; 02201 page_res_ = NULL; 02202 } 02203 recognition_done_ = false; 02204 if (block_list_ == NULL) 02205 block_list_ = new BLOCK_LIST; 02206 else 02207 block_list_->clear(); 02208 if (paragraph_models_ != NULL) { 02209 paragraph_models_->delete_data_pointers(); 02210 delete paragraph_models_; 02211 paragraph_models_ = NULL; 02212 } 02213 SavePixForCrash(0, NULL); 02214 } 02215 02223 int TessBaseAPI::TextLength(int* blob_count) { 02224 if (tesseract_ == NULL || page_res_ == NULL) 02225 return 0; 02226 02227 PAGE_RES_IT page_res_it(page_res_); 02228 int total_length = 2; 02229 int total_blobs = 0; 02230 // Iterate over the data structures to extract the recognition result. 02231 for (page_res_it.restart_page(); page_res_it.word () != NULL; 02232 page_res_it.forward()) { 02233 WERD_RES *word = page_res_it.word(); 02234 WERD_CHOICE* choice = word->best_choice; 02235 if (choice != NULL) { 02236 total_blobs += choice->length() + 2; 02237 total_length += choice->unichar_string().length() + 2; 02238 for (int i = 0; i < word->reject_map.length(); ++i) { 02239 if (word->reject_map[i].rejected()) 02240 ++total_length; 02241 } 02242 } 02243 } 02244 if (blob_count != NULL) 02245 *blob_count = total_blobs; 02246 return total_length; 02247 } 02248 02253 bool TessBaseAPI::DetectOS(OSResults* osr) { 02254 if (tesseract_ == NULL) 02255 return false; 02256 ClearResults(); 02257 if (tesseract_->pix_binary() == NULL) 02258 Threshold(tesseract_->mutable_pix_binary()); 02259 if (input_file_ == NULL) 02260 input_file_ = new STRING(kInputFile); 02261 return orientation_and_script_detection(*input_file_, osr, tesseract_); 02262 } 02263 02264 void TessBaseAPI::set_min_orientation_margin(double margin) { 02265 tesseract_->min_orientation_margin.set_value(margin); 02266 } 02267 02282 void TessBaseAPI::GetBlockTextOrientations(int** block_orientation, 02283 bool** vertical_writing) { 02284 delete[] *block_orientation; 02285 *block_orientation = NULL; 02286 delete[] *vertical_writing; 02287 *vertical_writing = NULL; 02288 BLOCK_IT block_it(block_list_); 02289 02290 block_it.move_to_first(); 02291 int num_blocks = 0; 02292 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { 02293 if (!block_it.data()->poly_block()->IsText()) { 02294 continue; 02295 } 02296 ++num_blocks; 02297 } 02298 if (!num_blocks) { 02299 tprintf("WARNING: Found no blocks\n"); 02300 return; 02301 } 02302 *block_orientation = new int[num_blocks]; 02303 *vertical_writing = new bool[num_blocks]; 02304 block_it.move_to_first(); 02305 int i = 0; 02306 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 02307 block_it.forward()) { 02308 if (!block_it.data()->poly_block()->IsText()) { 02309 continue; 02310 } 02311 FCOORD re_rotation = block_it.data()->re_rotation(); 02312 float re_theta = re_rotation.angle(); 02313 FCOORD classify_rotation = block_it.data()->classify_rotation(); 02314 float classify_theta = classify_rotation.angle(); 02315 double rot_theta = - (re_theta - classify_theta) * 2.0 / PI; 02316 if (rot_theta < 0) rot_theta += 4; 02317 int num_rotations = static_cast<int>(rot_theta + 0.5); 02318 (*block_orientation)[i] = num_rotations; 02319 // The classify_rotation is non-zero only if the text has vertical 02320 // writing direction. 02321 (*vertical_writing)[i] = classify_rotation.y() != 0.0f; 02322 ++i; 02323 } 02324 } 02325 02326 // ____________________________________________________________________________ 02327 // Ocropus add-ons. 02328 02330 BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() { 02331 FindLines(); 02332 BLOCK_LIST* result = block_list_; 02333 block_list_ = NULL; 02334 return result; 02335 } 02336 02342 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) { 02343 delete block_list; 02344 } 02345 02346 02347 ROW *TessBaseAPI::MakeTessOCRRow(float baseline, 02348 float xheight, 02349 float descender, 02350 float ascender) { 02351 inT32 xstarts[] = {-32000}; 02352 double quad_coeffs[] = {0, 0, baseline}; 02353 return new ROW(1, 02354 xstarts, 02355 quad_coeffs, 02356 xheight, 02357 ascender - (baseline + xheight), 02358 descender - baseline, 02359 0, 02360 0); 02361 } 02362 02364 TBLOB *TessBaseAPI::MakeTBLOB(Pix *pix) { 02365 int width = pixGetWidth(pix); 02366 int height = pixGetHeight(pix); 02367 BLOCK block("a character", TRUE, 0, 0, 0, 0, width, height); 02368 02369 // Create C_BLOBs from the page 02370 extract_edges(pix, &block); 02371 02372 // Merge all C_BLOBs 02373 C_BLOB_LIST *list = block.blob_list(); 02374 C_BLOB_IT c_blob_it(list); 02375 if (c_blob_it.empty()) 02376 return NULL; 02377 // Move all the outlines to the first blob. 02378 C_OUTLINE_IT ol_it(c_blob_it.data()->out_list()); 02379 for (c_blob_it.forward(); 02380 !c_blob_it.at_first(); 02381 c_blob_it.forward()) { 02382 C_BLOB *c_blob = c_blob_it.data(); 02383 ol_it.add_list_after(c_blob->out_list()); 02384 } 02385 // Convert the first blob to the output TBLOB. 02386 return TBLOB::PolygonalCopy(false, c_blob_it.data()); 02387 } 02388 02394 void TessBaseAPI::NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode) { 02395 TBOX box = tblob->bounding_box(); 02396 float x_center = (box.left() + box.right()) / 2.0f; 02397 float baseline = row->base_line(x_center); 02398 float scale = kBlnXHeight / row->x_height(); 02399 tblob->Normalize(NULL, NULL, NULL, x_center, baseline, scale, scale, 02400 0.0f, static_cast<float>(kBlnBaselineOffset), false, NULL); 02401 } 02402 02407 TBLOB *make_tesseract_blob(float baseline, float xheight, 02408 float descender, float ascender, 02409 bool numeric_mode, Pix* pix) { 02410 TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix); 02411 02412 // Normalize TBLOB 02413 ROW *row = 02414 TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender); 02415 TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode); 02416 delete row; 02417 return tblob; 02418 } 02419 02425 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr, 02426 int length, 02427 float baseline, 02428 float xheight, 02429 float descender, 02430 float ascender) { 02431 UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length); 02432 TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender, 02433 tesseract_->classify_bln_numeric_mode, 02434 tesseract_->pix_binary()); 02435 float threshold; 02436 float best_rating = -100; 02437 02438 02439 // Classify to get a raw choice. 02440 BLOB_CHOICE_LIST choices; 02441 tesseract_->AdaptiveClassifier(blob, &choices); 02442 BLOB_CHOICE_IT choice_it; 02443 choice_it.set_to_list(&choices); 02444 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 02445 choice_it.forward()) { 02446 if (choice_it.data()->rating() > best_rating) { 02447 best_rating = choice_it.data()->rating(); 02448 } 02449 } 02450 02451 threshold = tesseract_->matcher_good_threshold; 02452 02453 if (blob->outlines) 02454 tesseract_->AdaptToChar(blob, id, kUnknownFontinfoId, threshold, 02455 tesseract_->AdaptedTemplates); 02456 delete blob; 02457 } 02458 02459 02460 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) { 02461 PAGE_RES *page_res = new PAGE_RES(false, block_list, 02462 &(tesseract_->prev_word_best_choice_)); 02463 tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1); 02464 return page_res; 02465 } 02466 02467 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list, 02468 PAGE_RES* pass1_result) { 02469 if (!pass1_result) 02470 pass1_result = new PAGE_RES(false, block_list, 02471 &(tesseract_->prev_word_best_choice_)); 02472 tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2); 02473 return pass1_result; 02474 } 02475 02476 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) { 02477 int debug_level = 0; 02478 GetIntVariable("paragraph_debug_level", &debug_level); 02479 if (paragraph_models_ == NULL) 02480 paragraph_models_ = new GenericVector<ParagraphModel*>; 02481 MutableIterator *result_it = GetMutableIterator(); 02482 do { // Detect paragraphs for this block 02483 GenericVector<ParagraphModel *> models; 02484 ::tesseract::DetectParagraphs(debug_level, after_text_recognition, 02485 result_it, &models); 02486 *paragraph_models_ += models; 02487 } while (result_it->Next(RIL_BLOCK)); 02488 delete result_it; 02489 } 02490 02491 struct TESS_CHAR : ELIST_LINK { 02492 char *unicode_repr; 02493 int length; // of unicode_repr 02494 float cost; 02495 TBOX box; 02496 02497 TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) { 02498 length = (len == -1 ? strlen(repr) : len); 02499 unicode_repr = new char[length + 1]; 02500 strncpy(unicode_repr, repr, length); 02501 } 02502 02503 TESS_CHAR() { // Satisfies ELISTIZE. 02504 } 02505 ~TESS_CHAR() { 02506 delete [] unicode_repr; 02507 } 02508 }; 02509 02510 ELISTIZEH(TESS_CHAR) 02511 ELISTIZE(TESS_CHAR) 02512 02513 static void add_space(TESS_CHAR_IT* it) { 02514 TESS_CHAR *t = new TESS_CHAR(0, " "); 02515 it->add_after_then_move(t); 02516 } 02517 02518 02519 static float rating_to_cost(float rating) { 02520 rating = 100 + rating; 02521 // cuddled that to save from coverage profiler 02522 // (I have never seen ratings worse than -100, 02523 // but the check won't hurt) 02524 if (rating < 0) rating = 0; 02525 return rating; 02526 } 02527 02532 static void extract_result(TESS_CHAR_IT* out, 02533 PAGE_RES* page_res) { 02534 PAGE_RES_IT page_res_it(page_res); 02535 int word_count = 0; 02536 while (page_res_it.word() != NULL) { 02537 WERD_RES *word = page_res_it.word(); 02538 const char *str = word->best_choice->unichar_string().string(); 02539 const char *len = word->best_choice->unichar_lengths().string(); 02540 TBOX real_rect = word->word->bounding_box(); 02541 02542 if (word_count) 02543 add_space(out); 02544 int n = strlen(len); 02545 for (int i = 0; i < n; i++) { 02546 TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()), 02547 str, *len); 02548 tc->box = real_rect.intersection(word->box_word->BlobBox(i)); 02549 out->add_after_then_move(tc); 02550 str += *len; 02551 len++; 02552 } 02553 page_res_it.forward(); 02554 word_count++; 02555 } 02556 } 02557 02562 int TessBaseAPI::TesseractExtractResult(char** text, 02563 int** lengths, 02564 float** costs, 02565 int** x0, 02566 int** y0, 02567 int** x1, 02568 int** y1, 02569 PAGE_RES* page_res) { 02570 TESS_CHAR_LIST tess_chars; 02571 TESS_CHAR_IT tess_chars_it(&tess_chars); 02572 extract_result(&tess_chars_it, page_res); 02573 tess_chars_it.move_to_first(); 02574 int n = tess_chars.length(); 02575 int text_len = 0; 02576 *lengths = new int[n]; 02577 *costs = new float[n]; 02578 *x0 = new int[n]; 02579 *y0 = new int[n]; 02580 *x1 = new int[n]; 02581 *y1 = new int[n]; 02582 int i = 0; 02583 for (tess_chars_it.mark_cycle_pt(); 02584 !tess_chars_it.cycled_list(); 02585 tess_chars_it.forward(), i++) { 02586 TESS_CHAR *tc = tess_chars_it.data(); 02587 text_len += (*lengths)[i] = tc->length; 02588 (*costs)[i] = tc->cost; 02589 (*x0)[i] = tc->box.left(); 02590 (*y0)[i] = tc->box.bottom(); 02591 (*x1)[i] = tc->box.right(); 02592 (*y1)[i] = tc->box.top(); 02593 } 02594 char *p = *text = new char[text_len]; 02595 02596 tess_chars_it.move_to_first(); 02597 for (tess_chars_it.mark_cycle_pt(); 02598 !tess_chars_it.cycled_list(); 02599 tess_chars_it.forward()) { 02600 TESS_CHAR *tc = tess_chars_it.data(); 02601 strncpy(p, tc->unicode_repr, tc->length); 02602 p += tc->length; 02603 } 02604 return n; 02605 } 02606 02608 // The resulting features are returned in int_features, which must be 02609 // of size MAX_NUM_INT_FEATURES. The number of features is returned in 02610 // num_features (or 0 if there was a failure). 02611 // On return feature_outline_index is filled with an index of the outline 02612 // corresponding to each feature in int_features. 02613 // TODO(rays) Fix the caller to out outline_counts instead. 02614 void TessBaseAPI::GetFeaturesForBlob(TBLOB* blob, 02615 INT_FEATURE_STRUCT* int_features, 02616 int* num_features, 02617 int* feature_outline_index) { 02618 GenericVector<int> outline_counts; 02619 GenericVector<INT_FEATURE_STRUCT> bl_features; 02620 GenericVector<INT_FEATURE_STRUCT> cn_features; 02621 INT_FX_RESULT_STRUCT fx_info; 02622 tesseract_->ExtractFeatures(*blob, false, &bl_features, 02623 &cn_features, &fx_info, &outline_counts); 02624 if (cn_features.size() == 0 || cn_features.size() > MAX_NUM_INT_FEATURES) { 02625 *num_features = 0; 02626 return; // Feature extraction failed. 02627 } 02628 *num_features = cn_features.size(); 02629 memcpy(int_features, &cn_features[0], *num_features * sizeof(cn_features[0])); 02630 // TODO(rays) Pass outline_counts back and simplify the calling code. 02631 if (feature_outline_index != NULL) { 02632 int f = 0; 02633 for (int i = 0; i < outline_counts.size(); ++i) { 02634 while (f < outline_counts[i]) 02635 feature_outline_index[f++] = i; 02636 } 02637 } 02638 } 02639 02640 // This method returns the row to which a box of specified dimensions would 02641 // belong. If no good match is found, it returns NULL. 02642 ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks, 02643 int left, int top, int right, int bottom) { 02644 TBOX box(left, bottom, right, top); 02645 BLOCK_IT b_it(blocks); 02646 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 02647 BLOCK* block = b_it.data(); 02648 if (!box.major_overlap(block->bounding_box())) 02649 continue; 02650 ROW_IT r_it(block->row_list()); 02651 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { 02652 ROW* row = r_it.data(); 02653 if (!box.major_overlap(row->bounding_box())) 02654 continue; 02655 WERD_IT w_it(row->word_list()); 02656 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 02657 WERD* word = w_it.data(); 02658 if (box.major_overlap(word->bounding_box())) 02659 return row; 02660 } 02661 } 02662 } 02663 return NULL; 02664 } 02665 02667 void TessBaseAPI::RunAdaptiveClassifier(TBLOB* blob, 02668 int num_max_matches, 02669 int* unichar_ids, 02670 float* ratings, 02671 int* num_matches_returned) { 02672 BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST; 02673 tesseract_->AdaptiveClassifier(blob, choices); 02674 BLOB_CHOICE_IT choices_it(choices); 02675 int& index = *num_matches_returned; 02676 index = 0; 02677 for (choices_it.mark_cycle_pt(); 02678 !choices_it.cycled_list() && index < num_max_matches; 02679 choices_it.forward()) { 02680 BLOB_CHOICE* choice = choices_it.data(); 02681 unichar_ids[index] = choice->unichar_id(); 02682 ratings[index] = choice->rating(); 02683 ++index; 02684 } 02685 *num_matches_returned = index; 02686 delete choices; 02687 } 02688 02690 const char* TessBaseAPI::GetUnichar(int unichar_id) { 02691 return tesseract_->unicharset.id_to_unichar(unichar_id); 02692 } 02693 02695 const Dawg *TessBaseAPI::GetDawg(int i) const { 02696 if (tesseract_ == NULL || i >= NumDawgs()) return NULL; 02697 return tesseract_->getDict().GetDawg(i); 02698 } 02699 02701 int TessBaseAPI::NumDawgs() const { 02702 return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs(); 02703 } 02704 02705 #ifndef NO_CUBE_BUILD 02706 02707 CubeRecoContext *TessBaseAPI::GetCubeRecoContext() const { 02708 return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext(); 02709 } 02710 #endif // NO_CUBE_BUILD 02711 02713 STRING HOcrEscape(const char* text) { 02714 STRING ret; 02715 const char *ptr; 02716 for (ptr = text; *ptr; ptr++) { 02717 switch (*ptr) { 02718 case '<': ret += "<"; break; 02719 case '>': ret += ">"; break; 02720 case '&': ret += "&"; break; 02721 case '"': ret += """; break; 02722 case '\'': ret += "'"; break; 02723 default: ret += *ptr; 02724 } 02725 } 02726 return ret; 02727 } 02728 02729 } // namespace tesseract.