tesseract 3.04.01

api/baseapi.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        baseapi.cpp
00003  * Description: Simple API for calling tesseract.
00004  * Author:      Ray Smith
00005  * Created:     Fri Oct 06 15:35:01 PDT 2006
00006  *
00007  * (C) Copyright 2006, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // Include automatically generated configuration file if running autoconf.
00021 #ifdef HAVE_CONFIG_H
00022 #include "config_auto.h"
00023 #endif
00024 
00025 #ifdef __linux__
00026 #include <signal.h>
00027 #endif
00028 
00029 #if defined(_WIN32)
00030 #ifdef _MSC_VER
00031 #include "vcsversion.h"
00032 #include "mathfix.h"
00033 #elif MINGW
00034 // workaround for stdlib.h with -std=c++11 for _splitpath and _MAX_FNAME
00035 #undef __STRICT_ANSI__
00036 #endif  // _MSC_VER
00037 #include <stdlib.h>
00038 #include <windows.h>
00039 #include <fcntl.h>
00040 #include <io.h>
00041 #else
00042 #include <dirent.h>
00043 #include <libgen.h>
00044 #include <string.h>
00045 #endif  // _WIN32
00046 
00047 #include <iostream>
00048 #include <string>
00049 #include <iterator>
00050 #include <fstream>
00051 
00052 #include "allheaders.h"
00053 
00054 #include "baseapi.h"
00055 #include "blobclass.h"
00056 #include "resultiterator.h"
00057 #include "mutableiterator.h"
00058 #include "thresholder.h"
00059 #include "tesseractclass.h"
00060 #include "pageres.h"
00061 #include "paragraphs.h"
00062 #include "tessvars.h"
00063 #include "control.h"
00064 #include "dict.h"
00065 #include "pgedit.h"
00066 #include "paramsd.h"
00067 #include "output.h"
00068 #include "globaloc.h"
00069 #include "globals.h"
00070 #include "edgblob.h"
00071 #include "equationdetect.h"
00072 #include "tessbox.h"
00073 #include "makerow.h"
00074 #include "otsuthr.h"
00075 #include "osdetect.h"
00076 #include "params.h"
00077 #include "renderer.h"
00078 #include "strngs.h"
00079 #include "openclwrapper.h"
00080 
00081 BOOL_VAR(stream_filelist, FALSE, "Stream a filelist from stdin");
00082 
00083 namespace tesseract {
00084 
// --- Size and resolution limits -------------------------------------------
// Smallest rectangle side accepted by TesseractRect().
const int kMinRectSize = 10;
// Buffer size used when formatting an int as text.
const int kMaxIntSize = 22;
// Lower and upper resolution bounds (units not shown here; presumably ppi).
const int kMinCredibleResolution = 70;
const int kMaxCredibleResolution = 2400;

// --- Marker characters -----------------------------------------------------
const char kTesseractReject = '~';
const char kUNLVReject = '~';
const char kUNLVSuspect = '^';

// --- Default file names ----------------------------------------------------
const char* kInputFile = "noname.tif";
const char* kOldVarsFile = "failed_vars.txt";
00111 
00112 TessBaseAPI::TessBaseAPI()
00113   : tesseract_(NULL),
00114     osd_tesseract_(NULL),
00115     equ_detect_(NULL),
00116     // Thresholder is initialized to NULL here, but will be set before use by:
00117     // A constructor of a derived API,  SetThresholder(), or
00118     // created implicitly when used in InternalSetImage.
00119     thresholder_(NULL),
00120     paragraph_models_(NULL),
00121     block_list_(NULL),
00122     page_res_(NULL),
00123     input_file_(NULL),
00124     input_image_(NULL),
00125     output_file_(NULL),
00126     datapath_(NULL),
00127     language_(NULL),
00128     last_oem_requested_(OEM_DEFAULT),
00129     recognition_done_(false),
00130     truth_cb_(NULL),
00131     rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0),
00132     image_width_(0), image_height_(0) {
00133 }
00134 
00135 TessBaseAPI::~TessBaseAPI() {
00136   End();
00137 }
00138 
00142 const char* TessBaseAPI::Version() {
00143 #if defined(GIT_REV) && (defined(DEBUG) || defined(_DEBUG))
00144   return GIT_REV;
00145 #else
00146   return TESSERACT_VERSION_STR;
00147 #endif
00148 }
00149 
00157 #ifdef USE_OPENCL
00158 #if USE_DEVICE_SELECTION
00159 #include "opencl_device_selection.h"
00160 #endif
00161 #endif
00162 size_t TessBaseAPI::getOpenCLDevice(void **data) {
00163 #ifdef USE_OPENCL
00164 #if USE_DEVICE_SELECTION
00165   ds_device device = OpenclDevice::getDeviceSelection();
00166   if (device.type == DS_DEVICE_OPENCL_DEVICE) {
00167     *data = reinterpret_cast<void*>(new cl_device_id);
00168     memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id));
00169     return sizeof(cl_device_id);
00170   }
00171 #endif
00172 #endif
00173 
00174   *data = NULL;
00175   return 0;
00176 }
00177 
00182 void TessBaseAPI::CatchSignals() {
00183 #ifdef __linux__
00184   struct sigaction action;
00185   memset(&action, 0, sizeof(action));
00186   action.sa_handler = &signal_exit;
00187   action.sa_flags = SA_RESETHAND;
00188   sigaction(SIGSEGV, &action, NULL);
00189   sigaction(SIGFPE, &action, NULL);
00190   sigaction(SIGBUS, &action, NULL);
00191 #else
00192   // Warn API users that an implementation is needed.
00193   tprintf("CatchSignals has no non-linux implementation!\n");
00194 #endif
00195 }
00196 
00201 void TessBaseAPI::SetInputName(const char* name) {
00202   if (input_file_ == NULL)
00203     input_file_ = new STRING(name);
00204   else
00205     *input_file_ = name;
00206 }
00207 
00209 void TessBaseAPI::SetOutputName(const char* name) {
00210   if (output_file_ == NULL)
00211     output_file_ = new STRING(name);
00212   else
00213     *output_file_ = name;
00214 }
00215 
00216 bool TessBaseAPI::SetVariable(const char* name, const char* value) {
00217   if (tesseract_ == NULL) tesseract_ = new Tesseract;
00218   return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
00219                               tesseract_->params());
00220 }
00221 
00222 bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) {
00223   if (tesseract_ == NULL) tesseract_ = new Tesseract;
00224   return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY,
00225                               tesseract_->params());
00226 }
00227 
00228 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
00229   IntParam *p = ParamUtils::FindParam<IntParam>(
00230       name, GlobalParams()->int_params, tesseract_->params()->int_params);
00231   if (p == NULL) return false;
00232   *value = (inT32)(*p);
00233   return true;
00234 }
00235 
00236 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
00237   BoolParam *p = ParamUtils::FindParam<BoolParam>(
00238       name, GlobalParams()->bool_params, tesseract_->params()->bool_params);
00239   if (p == NULL) return false;
00240   *value = (BOOL8)(*p);
00241   return true;
00242 }
00243 
00244 const char *TessBaseAPI::GetStringVariable(const char *name) const {
00245   StringParam *p = ParamUtils::FindParam<StringParam>(
00246       name, GlobalParams()->string_params, tesseract_->params()->string_params);
00247   return (p != NULL) ? p->string() : NULL;
00248 }
00249 
00250 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
00251   DoubleParam *p = ParamUtils::FindParam<DoubleParam>(
00252       name, GlobalParams()->double_params, tesseract_->params()->double_params);
00253   if (p == NULL) return false;
00254   *value = (double)(*p);
00255   return true;
00256 }
00257 
00259 bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) {
00260   return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
00261 }
00262 
00264 void TessBaseAPI::PrintVariables(FILE *fp) const {
00265   ParamUtils::PrintParams(fp, tesseract_->params());
00266 }
00267 
00276 int TessBaseAPI::Init(const char* datapath, const char* language,
00277                       OcrEngineMode oem, char **configs, int configs_size,
00278                       const GenericVector<STRING> *vars_vec,
00279                       const GenericVector<STRING> *vars_values,
00280                       bool set_only_non_debug_params) {
00281   PERF_COUNT_START("TessBaseAPI::Init")
00282   // Default language is "eng".
00283   if (language == NULL) language = "eng";
00284   // If the datapath, OcrEngineMode or the language have changed - start again.
00285   // Note that the language_ field stores the last requested language that was
00286   // initialized successfully, while tesseract_->lang stores the language
00287   // actually used. They differ only if the requested language was NULL, in
00288   // which case tesseract_->lang is set to the Tesseract default ("eng").
00289   if (tesseract_ != NULL &&
00290       (datapath_ == NULL || language_ == NULL ||
00291        *datapath_ != datapath || last_oem_requested_ != oem ||
00292        (*language_ != language && tesseract_->lang != language))) {
00293     delete tesseract_;
00294     tesseract_ = NULL;
00295   }
00296   // PERF_COUNT_SUB("delete tesseract_")
00297 #ifdef USE_OPENCL
00298   OpenclDevice od;
00299   od.InitEnv();
00300 #endif
00301   PERF_COUNT_SUB("OD::InitEnv()")
00302   bool reset_classifier = true;
00303   if (tesseract_ == NULL) {
00304     reset_classifier = false;
00305     tesseract_ = new Tesseract;
00306     if (tesseract_->init_tesseract(
00307         datapath, output_file_ != NULL ? output_file_->string() : NULL,
00308         language, oem, configs, configs_size, vars_vec, vars_values,
00309         set_only_non_debug_params) != 0) {
00310       return -1;
00311     }
00312   }
00313   PERF_COUNT_SUB("update tesseract_")
00314   // Update datapath and language requested for the last valid initialization.
00315   if (datapath_ == NULL)
00316     datapath_ = new STRING(datapath);
00317   else
00318     *datapath_ = datapath;
00319   if ((strcmp(datapath_->string(), "") == 0) &&
00320       (strcmp(tesseract_->datadir.string(), "") != 0))
00321      *datapath_ = tesseract_->datadir;
00322 
00323   if (language_ == NULL)
00324     language_ = new STRING(language);
00325   else
00326     *language_ = language;
00327   last_oem_requested_ = oem;
00328   // PERF_COUNT_SUB("update last_oem_requested_")
00329   // For same language and datapath, just reset the adaptive classifier.
00330   if (reset_classifier) {
00331     tesseract_->ResetAdaptiveClassifier();
00332     PERF_COUNT_SUB("tesseract_->ResetAdaptiveClassifier()")
00333   }
00334   PERF_COUNT_END
00335   return 0;
00336 }
00337 
00346 const char* TessBaseAPI::GetInitLanguagesAsString() const {
00347   return (language_ == NULL || language_->string() == NULL) ?
00348       "" : language_->string();
00349 }
00350 
00356 void TessBaseAPI::GetLoadedLanguagesAsVector(
00357     GenericVector<STRING>* langs) const {
00358   langs->clear();
00359   if (tesseract_ != NULL) {
00360     langs->push_back(tesseract_->lang);
00361     int num_subs = tesseract_->num_sub_langs();
00362     for (int i = 0; i < num_subs; ++i)
00363       langs->push_back(tesseract_->get_sub_lang(i)->lang);
00364   }
00365 }
00366 
00370 void TessBaseAPI::GetAvailableLanguagesAsVector(
00371     GenericVector<STRING>* langs) const {
00372   langs->clear();
00373   if (tesseract_ != NULL) {
00374 #ifdef _WIN32
00375     STRING pattern = tesseract_->datadir + "/*." + kTrainedDataSuffix;
00376     char fname[_MAX_FNAME];
00377     WIN32_FIND_DATA data;
00378     BOOL result = TRUE;
00379     HANDLE handle = FindFirstFile(pattern.string(), &data);
00380     if (handle != INVALID_HANDLE_VALUE) {
00381       for (; result; result = FindNextFile(handle, &data)) {
00382         _splitpath(data.cFileName, NULL, NULL, fname, NULL);
00383         langs->push_back(STRING(fname));
00384       }
00385       FindClose(handle);
00386     }
00387 #else  // _WIN32
00388     DIR *dir;
00389     struct dirent *dirent;
00390     char *dot;
00391 
00392     STRING extension = STRING(".") + kTrainedDataSuffix;
00393 
00394     dir = opendir(tesseract_->datadir.string());
00395     if (dir != NULL) {
00396       while ((dirent = readdir(dir))) {
00397         // Skip '.', '..', and hidden files
00398         if (dirent->d_name[0] != '.') {
00399           if (strstr(dirent->d_name, extension.string()) != NULL) {
00400             dot = strrchr(dirent->d_name, '.');
00401             // This ensures that .traineddata is at the end of the file name
00402             if (strncmp(dot, extension.string(),
00403                         strlen(extension.string())) == 0) {
00404               *dot = '\0';
00405               langs->push_back(STRING(dirent->d_name));
00406             }
00407           }
00408         }
00409       }
00410       closedir(dir);
00411     }
00412 #endif
00413   }
00414 }
00415 
00422 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
00423   if (tesseract_ == NULL)
00424     tesseract_ = new Tesseract;
00425   else
00426     ParamUtils::ResetToDefaults(tesseract_->params());
00427   return tesseract_->init_tesseract_lm(datapath, NULL, language);
00428 }
00429 
00434 void TessBaseAPI::InitForAnalysePage() {
00435   if (tesseract_ == NULL) {
00436     tesseract_ = new Tesseract;
00437     tesseract_->InitAdaptiveClassifier(false);
00438   }
00439 }
00440 
00446 void TessBaseAPI::ReadConfigFile(const char* filename) {
00447   tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY);
00448 }
00449 
00451 void TessBaseAPI::ReadDebugConfigFile(const char* filename) {
00452   tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY);
00453 }
00454 
00460 void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
00461   if (tesseract_ == NULL)
00462     tesseract_ = new Tesseract;
00463   tesseract_->tessedit_pageseg_mode.set_value(mode);
00464 }
00465 
00467 PageSegMode TessBaseAPI::GetPageSegMode() const {
00468   if (tesseract_ == NULL)
00469     return PSM_SINGLE_BLOCK;
00470   return static_cast<PageSegMode>(
00471     static_cast<int>(tesseract_->tessedit_pageseg_mode));
00472 }
00473 
00487 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
00488                                  int bytes_per_pixel,
00489                                  int bytes_per_line,
00490                                  int left, int top,
00491                                  int width, int height) {
00492   if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize)
00493     return NULL;  // Nothing worth doing.
00494 
00495   // Since this original api didn't give the exact size of the image,
00496   // we have to invent a reasonable value.
00497   int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
00498   SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top,
00499            bytes_per_pixel, bytes_per_line);
00500   SetRectangle(left, top, width, height);
00501 
00502   return GetUTF8Text();
00503 }
00504 
00509 void TessBaseAPI::ClearAdaptiveClassifier() {
00510   if (tesseract_ == NULL)
00511     return;
00512   tesseract_->ResetAdaptiveClassifier();
00513   tesseract_->ResetDocumentDictionary();
00514 }
00515 
00525 void TessBaseAPI::SetImage(const unsigned char* imagedata,
00526                            int width, int height,
00527                            int bytes_per_pixel, int bytes_per_line) {
00528   if (InternalSetImage())
00529     thresholder_->SetImage(imagedata, width, height,
00530                            bytes_per_pixel, bytes_per_line);
00531 }
00532 
00533 void TessBaseAPI::SetSourceResolution(int ppi) {
00534   if (thresholder_)
00535     thresholder_->SetSourceYResolution(ppi);
00536   else
00537     tprintf("Please call SetImage before SetSourceResolution.\n");
00538 }
00539 
00550 void TessBaseAPI::SetImage(Pix* pix) {
00551   if (InternalSetImage())
00552     thresholder_->SetImage(pix);
00553   SetInputImage(pix);
00554 }
00555 
00561 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
00562   if (thresholder_ == NULL)
00563     return;
00564   thresholder_->SetRectangle(left, top, width, height);
00565   ClearResults();
00566 }
00567 
00572 Pix* TessBaseAPI::GetThresholdedImage() {
00573   if (tesseract_ == NULL || thresholder_ == NULL)
00574     return NULL;
00575   if (tesseract_->pix_binary() == NULL)
00576     Threshold(tesseract_->mutable_pix_binary());
00577   return pixClone(tesseract_->pix_binary());
00578 }
00579 
00585 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
00586   return GetComponentImages(RIL_BLOCK, false, pixa, NULL);
00587 }
00588 
00597 Boxa* TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding,
00598                                 Pixa** pixa, int** blockids, int** paraids) {
00599   return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding,
00600                             pixa, blockids, paraids);
00601 }
00602 
00611 Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) {
00612   return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
00613 }
00614 
00620 Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
00621   return GetComponentImages(RIL_WORD, true, pixa, NULL);
00622 }
00623 
00630 Boxa* TessBaseAPI::GetConnectedComponents(Pixa** pixa) {
00631   return GetComponentImages(RIL_SYMBOL, true, pixa, NULL);
00632 }
00633 
00642 Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level,
00643                                       bool text_only, bool raw_image,
00644                                       const int raw_padding,
00645                                       Pixa** pixa, int** blockids,
00646                                       int** paraids) {
00647   PageIterator* page_it = GetIterator();
00648   if (page_it == NULL)
00649     page_it = AnalyseLayout();
00650   if (page_it == NULL)
00651     return NULL;  // Failed.
00652 
00653   // Count the components to get a size for the arrays.
00654   int component_count = 0;
00655   int left, top, right, bottom;
00656 
00657   TessResultCallback<bool>* get_bbox = NULL;
00658   if (raw_image) {
00659     // Get bounding box in original raw image with padding.
00660     get_bbox = NewPermanentTessCallback(page_it, &PageIterator::BoundingBox,
00661                                         level, raw_padding,
00662                                         &left, &top, &right, &bottom);
00663   } else {
00664     // Get bounding box from binarized imaged. Note that this could be
00665     // differently scaled from the original image.
00666     get_bbox = NewPermanentTessCallback(page_it,
00667                                         &PageIterator::BoundingBoxInternal,
00668                                         level, &left, &top, &right, &bottom);
00669   }
00670   do {
00671     if (get_bbox->Run() &&
00672         (!text_only || PTIsTextType(page_it->BlockType())))
00673       ++component_count;
00674   } while (page_it->Next(level));
00675 
00676   Boxa* boxa = boxaCreate(component_count);
00677   if (pixa != NULL)
00678     *pixa = pixaCreate(component_count);
00679   if (blockids != NULL)
00680     *blockids = new int[component_count];
00681   if (paraids != NULL)
00682     *paraids = new int[component_count];
00683 
00684   int blockid = 0;
00685   int paraid = 0;
00686   int component_index = 0;
00687   page_it->Begin();
00688   do {
00689     if (get_bbox->Run() &&
00690         (!text_only || PTIsTextType(page_it->BlockType()))) {
00691       Box* lbox = boxCreate(left, top, right - left, bottom - top);
00692       boxaAddBox(boxa, lbox, L_INSERT);
00693       if (pixa != NULL) {
00694         Pix* pix = NULL;
00695         if (raw_image) {
00696           pix = page_it->GetImage(level, raw_padding, input_image_,
00697                                   &left, &top);
00698         } else {
00699           pix = page_it->GetBinaryImage(level);
00700         }
00701         pixaAddPix(*pixa, pix, L_INSERT);
00702         pixaAddBox(*pixa, lbox, L_CLONE);
00703       }
00704       if (paraids != NULL) {
00705         (*paraids)[component_index] = paraid;
00706         if (page_it->IsAtFinalElement(RIL_PARA, level))
00707           ++paraid;
00708       }
00709       if (blockids != NULL) {
00710         (*blockids)[component_index] = blockid;
00711         if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
00712           ++blockid;
00713           paraid = 0;
00714         }
00715       }
00716       ++component_index;
00717     }
00718   } while (page_it->Next(level));
00719   delete page_it;
00720   delete get_bbox;
00721   return boxa;
00722 }
00723 
00724 int TessBaseAPI::GetThresholdedImageScaleFactor() const {
00725   if (thresholder_ == NULL) {
00726     return 0;
00727   }
00728   return thresholder_->GetScaleFactor();
00729 }
00730 
00732 void TessBaseAPI::DumpPGM(const char* filename) {
00733   if (tesseract_ == NULL)
00734     return;
00735   FILE *fp = fopen(filename, "wb");
00736   Pix* pix = tesseract_->pix_binary();
00737   int width = pixGetWidth(pix);
00738   int height = pixGetHeight(pix);
00739   l_uint32* data = pixGetData(pix);
00740   fprintf(fp, "P5 %d %d 255\n", width, height);
00741   for (int y = 0; y < height; ++y, data += pixGetWpl(pix)) {
00742     for (int x = 0; x < width; ++x) {
00743       uinT8 b = GET_DATA_BIT(data, x) ? 0 : 255;
00744       fwrite(&b, 1, 1, fp);
00745     }
00746   }
00747   fclose(fp);
00748 }
00749 
00750 #ifndef NO_CUBE_BUILD
00751 
00757 int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
00758                 Boxa* boxa_words, Pixa* pixa_words,
00759                 const FCOORD& reskew, Pix* page_pix,
00760                 PAGE_RES* page_res) {
00761   int block_count = boxaGetCount(boxa_blocks);
00762   ASSERT_HOST(block_count == pixaGetCount(pixa_blocks));
00763   // Write each block to the current directory as junk_write_display.nnn.png.
00764   for (int i = 0; i < block_count; ++i) {
00765     Pix* pix = pixaGetPix(pixa_blocks, i, L_CLONE);
00766     pixDisplayWrite(pix, 1);
00767   }
00768   int word_count = boxaGetCount(boxa_words);
00769   ASSERT_HOST(word_count == pixaGetCount(pixa_words));
00770   int pr_word = 0;
00771   PAGE_RES_IT page_res_it(page_res);
00772   for (page_res_it.restart_page(); page_res_it.word () != NULL;
00773        page_res_it.forward(), ++pr_word) {
00774     WERD_RES *word = page_res_it.word();
00775     WERD_CHOICE* choice = word->best_choice;
00776     // Write the first 100 words to files names wordims/<wordstring>.tif.
00777     if (pr_word < 100) {
00778       STRING filename("wordims/");
00779       if (choice != NULL) {
00780         filename += choice->unichar_string();
00781       } else {
00782         char numbuf[32];
00783         filename += "unclassified";
00784         snprintf(numbuf, 32, "%03d", pr_word);
00785         filename += numbuf;
00786       }
00787       filename += ".tif";
00788       Pix* pix = pixaGetPix(pixa_words, pr_word, L_CLONE);
00789       pixWrite(filename.string(), pix, IFF_TIFF_G4);
00790     }
00791   }
00792   ASSERT_HOST(pr_word == word_count);
00793   return 0;
00794 }
00795 #endif  // NO_CUBE_BUILD
00796 
00812 PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
00813   if (FindLines() == 0) {
00814     if (block_list_->empty())
00815       return NULL;  // The page was empty.
00816     page_res_ = new PAGE_RES(merge_similar_words, block_list_, NULL);
00817     DetectParagraphs(false);
00818     return new PageIterator(
00819         page_res_, tesseract_, thresholder_->GetScaleFactor(),
00820         thresholder_->GetScaledYResolution(),
00821         rect_left_, rect_top_, rect_width_, rect_height_);
00822   }
00823   return NULL;
00824 }
00825 
00830 int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
00831   if (tesseract_ == NULL)
00832     return -1;
00833   if (FindLines() != 0)
00834     return -1;
00835   if (page_res_ != NULL)
00836     delete page_res_;
00837   if (block_list_->empty()) {
00838     page_res_ = new PAGE_RES(false, block_list_,
00839                              &tesseract_->prev_word_best_choice_);
00840     return 0; // Empty page.
00841   }
00842 
00843   tesseract_->SetBlackAndWhitelist();
00844   recognition_done_ = true;
00845   if (tesseract_->tessedit_resegment_from_line_boxes) {
00846     page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_);
00847   } else if (tesseract_->tessedit_resegment_from_boxes) {
00848     page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_);
00849   } else {
00850     // TODO(rays) LSTM here.
00851     page_res_ = new PAGE_RES(false,
00852                              block_list_, &tesseract_->prev_word_best_choice_);
00853   }
00854   if (tesseract_->tessedit_make_boxes_from_boxes) {
00855     tesseract_->CorrectClassifyWords(page_res_);
00856     return 0;
00857   }
00858 
00859   if (truth_cb_ != NULL) {
00860     tesseract_->wordrec_run_blamer.set_value(true);
00861     PageIterator *page_it = new PageIterator(
00862             page_res_, tesseract_, thresholder_->GetScaleFactor(),
00863             thresholder_->GetScaledYResolution(),
00864             rect_left_, rect_top_, rect_width_, rect_height_);
00865     truth_cb_->Run(tesseract_->getDict().getUnicharset(),
00866                    image_height_, page_it, this->tesseract()->pix_grey());
00867     delete page_it;
00868   }
00869 
00870   int result = 0;
00871   if (tesseract_->interactive_display_mode) {
00872     #ifndef GRAPHICS_DISABLED
00873     tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_);
00874     #endif  // GRAPHICS_DISABLED
00875     // The page_res is invalid after an interactive session, so cleanup
00876     // in a way that lets us continue to the next page without crashing.
00877     delete page_res_;
00878     page_res_ = NULL;
00879     return -1;
00880   } else if (tesseract_->tessedit_train_from_boxes) {
00881     STRING fontname;
00882     ExtractFontName(*output_file_, &fontname);
00883     tesseract_->ApplyBoxTraining(fontname, page_res_);
00884   } else if (tesseract_->tessedit_ambigs_training) {
00885     FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
00886     // OCR the page segmented into words by tesseract.
00887     tesseract_->recog_training_segmented(
00888         *input_file_, page_res_, monitor, training_output_file);
00889     fclose(training_output_file);
00890   } else {
00891     // Now run the main recognition.
00892     bool wait_for_text = true;
00893     GetBoolVariable("paragraph_text_based", &wait_for_text);
00894     if (!wait_for_text) DetectParagraphs(false);
00895     if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) {
00896       if (wait_for_text) DetectParagraphs(true);
00897     } else {
00898       result = -1;
00899     }
00900   }
00901   return result;
00902 }
00903 
00905 int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) {
00906   if (tesseract_ == NULL)
00907     return -1;
00908   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
00909     tprintf("Please call SetImage before attempting recognition.");
00910     return -1;
00911   }
00912   if (page_res_ != NULL)
00913     ClearResults();
00914   if (FindLines() != 0)
00915     return -1;
00916   // Additional conditions under which chopper test cannot be run
00917   if (tesseract_->interactive_display_mode) return -1;
00918 
00919   recognition_done_ = true;
00920 
00921   page_res_ = new PAGE_RES(false, block_list_,
00922                            &(tesseract_->prev_word_best_choice_));
00923 
00924   PAGE_RES_IT page_res_it(page_res_);
00925 
00926   while (page_res_it.word() != NULL) {
00927     WERD_RES *word_res = page_res_it.word();
00928     GenericVector<TBOX> boxes;
00929     tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block,
00930                                   page_res_it.row()->row, word_res);
00931     page_res_it.forward();
00932   }
00933   return 0;
00934 }
00935 
00936 void TessBaseAPI::SetInputImage(Pix *pix) {
00937   if (input_image_)
00938     pixDestroy(&input_image_);
00939   input_image_ = NULL;
00940   if (pix)
00941     input_image_ = pixCopy(NULL, pix);
00942 }
00943 
00944 Pix* TessBaseAPI::GetInputImage() {
00945   return input_image_;
00946 }
00947 
00948 const char * TessBaseAPI::GetInputName() {
00949   if (input_file_)
00950     return input_file_->c_str();
00951   return NULL;
00952 }
00953 
00954 const char *  TessBaseAPI::GetDatapath() {
00955   return tesseract_->datadir.c_str();
00956 }
00957 
00958 int TessBaseAPI::GetSourceYResolution() {
00959   return thresholder_->GetSourceYResolution();
00960 }
00961 
00962 // If flist exists, get data from there. Otherwise get data from buf.
00963 // Seems convoluted, but is the easiest way I know of to meet multiple
00964 // goals. Support streaming from stdin, and also work on platforms
00965 // lacking fmemopen.
00966 bool TessBaseAPI::ProcessPagesFileList(FILE *flist,
00967                                        STRING *buf,
00968                                        const char* retry_config,
00969                                        int timeout_millisec,
00970                                        TessResultRenderer* renderer,
00971                                        int tessedit_page_number) {
00972   if (!flist && !buf) return false;
00973   int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
00974   char pagename[MAX_PATH];
00975 
00976   GenericVector<STRING> lines;
00977   if (!flist) {
00978     buf->split('\n', &lines);
00979     if (lines.empty()) return false;
00980   }
00981 
00982   // Skip to the requested page number.
00983   for (int i = 0; i < page; i++) {
00984     if (flist) {
00985       if (fgets(pagename, sizeof(pagename), flist) == NULL) break;
00986     }
00987   }
00988 
00989   // Begin producing output
00990   const char* kUnknownTitle = "";
00991   if (renderer && !renderer->BeginDocument(kUnknownTitle)) {
00992     return false;
00993   }
00994 
00995   // Loop over all pages - or just the requested one
00996   while (true) {
00997     if (flist) {
00998       if (fgets(pagename, sizeof(pagename), flist) == NULL) break;
00999     } else {
01000       if (page >= lines.size()) break;
01001       snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
01002     }
01003     chomp_string(pagename);
01004     Pix *pix = pixRead(pagename);
01005     if (pix == NULL) {
01006       tprintf("Image file %s cannot be read!\n", pagename);
01007       return false;
01008     }
01009     tprintf("Page %d : %s\n", page, pagename);
01010     bool r = ProcessPage(pix, page, pagename, retry_config,
01011                          timeout_millisec, renderer);
01012     pixDestroy(&pix);
01013     if (!r) return false;
01014     if (tessedit_page_number >= 0) break;
01015     ++page;
01016   }
01017 
01018   // Finish producing output
01019   if (renderer && !renderer->EndDocument()) {
01020     return false;
01021   }
01022   return true;
01023 }
01024 
01025 bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
01026                                             size_t size,
01027                                             const char* filename,
01028                                             const char* retry_config,
01029                                             int timeout_millisec,
01030                                             TessResultRenderer* renderer,
01031                                             int tessedit_page_number) {
01032 #ifndef ANDROID_BUILD
01033   Pix *pix = NULL;
01034 #ifdef USE_OPENCL
01035   OpenclDevice od;
01036 #endif  // USE_OPENCL
01037   int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
01038   for (; ; ++page) {
01039     if (tessedit_page_number >= 0)
01040       page = tessedit_page_number;
01041 #ifdef USE_OPENCL
01042     if ( od.selectedDeviceIsOpenCL() ) {
01043       // FIXME(jbreiden) Not implemented.
01044       pix = od.pixReadMemTiffCl(data, size, page);
01045     } else {
01046 #endif  // USE_OPENCL
01047       pix = pixReadMemTiff(data, size, page);
01048 #ifdef USE_OPENCL
01049     }
01050 #endif  // USE_OPENCL
01051     if (pix == NULL) break;
01052     tprintf("Page %d\n", page + 1);
01053     char page_str[kMaxIntSize];
01054     snprintf(page_str, kMaxIntSize - 1, "%d", page);
01055     SetVariable("applybox_page", page_str);
01056     bool r = ProcessPage(pix, page, filename, retry_config,
01057                            timeout_millisec, renderer);
01058     pixDestroy(&pix);
01059     if (!r) return false;
01060     if (tessedit_page_number >= 0) break;
01061   }
01062   return true;
01063 #else
01064   return false;
01065 #endif
01066 }
01067 
01068 // Master ProcessPages calls ProcessPagesInternal and then does any post-
01069 // processing required due to being in a training mode.
01070 bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
01071                                int timeout_millisec,
01072                                TessResultRenderer* renderer) {
01073   bool result =
01074       ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
01075   if (result) {
01076     if (tesseract_->tessedit_train_from_boxes &&
01077         !tesseract_->WriteTRFile(*output_file_)) {
01078       tprintf("Write of TR file failed: %s\n", output_file_->string());
01079       return false;
01080     }
01081   }
01082   return result;
01083 }
01084 
01085 // In the ideal scenario, Tesseract will start working on data as soon
01086 // as it can. For example, if you steam a filelist through stdin, we
01087 // should start the OCR process as soon as the first filename is
01088 // available. This is particularly useful when hooking Tesseract up to
01089 // slow hardware such as a book scanning machine.
01090 //
01091 // Unfortunately there are tradeoffs. You can't seek on stdin. That
01092 // makes automatic detection of datatype (TIFF? filelist? PNG?)
01093 // impractical.  So we support a command line flag to explicitly
01094 // identify the scenario that really matters: filelists on
01095 // stdin. We'll still do our best if the user likes pipes.  That means
01096 // piling up any data coming into stdin into a memory buffer.
01097 bool TessBaseAPI::ProcessPagesInternal(const char* filename,
01098                                        const char* retry_config,
01099                                        int timeout_millisec,
01100                                        TessResultRenderer* renderer) {
01101 #ifndef ANDROID_BUILD
01102   PERF_COUNT_START("ProcessPages")
01103   bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
01104   if (stdInput) {
01105 #ifdef WIN32
01106     if (_setmode(_fileno(stdin), _O_BINARY) == -1)
01107       tprintf("ERROR: cin to binary: %s", strerror(errno));
01108 #endif  // WIN32
01109   }
01110 
01111   if (stream_filelist) {
01112     return ProcessPagesFileList(stdin, NULL, retry_config,
01113                                 timeout_millisec, renderer,
01114                                 tesseract_->tessedit_page_number);
01115   }
01116 
01117   // At this point we are officially in autodection territory.
01118   // That means we are going to buffer stdin so that it is
01119   // seekable. To keep code simple we will also buffer data
01120   // coming from a file.
01121   std::string buf;
01122   if (stdInput) {
01123     buf.assign((std::istreambuf_iterator<char>(std::cin)),
01124                (std::istreambuf_iterator<char>()));
01125   } else {
01126     std::ifstream ifs(filename, std::ios::binary);
01127     if (ifs) {
01128       buf.assign((std::istreambuf_iterator<char>(ifs)),
01129                  (std::istreambuf_iterator<char>()));
01130     } else {
01131       tprintf("ERROR: Can not open input file %s\n", filename);
01132       return false;
01133     }
01134   }
01135 
01136   // Here is our autodetection
01137   int format;
01138   const l_uint8 * data = reinterpret_cast<const l_uint8 *>(buf.c_str());
01139   findFileFormatBuffer(data, &format);
01140 
01141   // Maybe we have a filelist
01142   if (format == IFF_UNKNOWN) {
01143     STRING s(buf.c_str());
01144     return ProcessPagesFileList(NULL, &s, retry_config,
01145                                 timeout_millisec, renderer,
01146                                 tesseract_->tessedit_page_number);
01147   }
01148 
01149   // Maybe we have a TIFF which is potentially multipage
01150   bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS ||
01151                format == IFF_TIFF_RLE || format == IFF_TIFF_G3 ||
01152                format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
01153                format == IFF_TIFF_ZIP);
01154 
01155   // Fail early if we can, before producing any output
01156   Pix *pix = NULL;
01157   if (!tiff) {
01158     pix = pixReadMem(data, buf.size());
01159     if (pix == NULL) {
01160       return false;
01161     }
01162   }
01163 
01164   // Begin the output
01165   const char* kUnknownTitle = "";
01166   if (renderer && !renderer->BeginDocument(kUnknownTitle)) {
01167     pixDestroy(&pix);
01168     return false;
01169   }
01170 
01171   // Produce output
01172   bool r = false;
01173   if (tiff) {
01174     r = ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config,
01175                                   timeout_millisec, renderer,
01176                                   tesseract_->tessedit_page_number);
01177   } else {
01178     r = ProcessPage(pix, 0, filename, retry_config,
01179                     timeout_millisec, renderer);
01180     pixDestroy(&pix);
01181   }
01182 
01183   // End the output
01184   if (!r || (renderer && !renderer->EndDocument())) {
01185     return false;
01186   }
01187   PERF_COUNT_END
01188   return true;
01189 #else
01190   return false;
01191 #endif
01192 }
01193 
01194 bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
01195                               const char* retry_config, int timeout_millisec,
01196                               TessResultRenderer* renderer) {
01197   PERF_COUNT_START("ProcessPage")
01198   SetInputName(filename);
01199   SetImage(pix);
01200   bool failed = false;
01201 
01202   if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
01203     // Disabled character recognition
01204     PageIterator* it = AnalyseLayout();
01205 
01206     if (it == NULL) {
01207       failed = true;
01208     } else {
01209       delete it;
01210     }
01211   } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {
01212     failed = FindLines() != 0;
01213   } else if (timeout_millisec > 0) {
01214     // Running with a timeout.
01215     ETEXT_DESC monitor;
01216     monitor.cancel = NULL;
01217     monitor.cancel_this = NULL;
01218     monitor.set_deadline_msecs(timeout_millisec);
01219 
01220     // Now run the main recognition.
01221     failed = Recognize(&monitor) < 0;
01222   } else {
01223     // Normal layout and character recognition with no timeout.
01224     failed = Recognize(NULL) < 0;
01225   }
01226 
01227   if (tesseract_->tessedit_write_images) {
01228 #ifndef ANDROID_BUILD
01229     Pix* page_pix = GetThresholdedImage();
01230     pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
01231 #endif  // ANDROID_BUILD
01232   }
01233 
01234   if (failed && retry_config != NULL && retry_config[0] != '\0') {
01235     // Save current config variables before switching modes.
01236     FILE* fp = fopen(kOldVarsFile, "wb");
01237     PrintVariables(fp);
01238     fclose(fp);
01239     // Switch to alternate mode for retry.
01240     ReadConfigFile(retry_config);
01241     SetImage(pix);
01242     Recognize(NULL);
01243     // Restore saved config variables.
01244     ReadConfigFile(kOldVarsFile);
01245   }
01246 
01247   if (renderer && !failed) {
01248     failed = !renderer->AddImage(this);
01249   }
01250 
01251   PERF_COUNT_END
01252   return !failed;
01253 }
01254 
01259 LTRResultIterator* TessBaseAPI::GetLTRIterator() {
01260   if (tesseract_ == NULL || page_res_ == NULL)
01261     return NULL;
01262   return new LTRResultIterator(
01263       page_res_, tesseract_,
01264       thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
01265       rect_left_, rect_top_, rect_width_, rect_height_);
01266 }
01267 
01276 ResultIterator* TessBaseAPI::GetIterator() {
01277   if (tesseract_ == NULL || page_res_ == NULL)
01278     return NULL;
01279   return ResultIterator::StartOfParagraph(LTRResultIterator(
01280       page_res_, tesseract_,
01281       thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
01282       rect_left_, rect_top_, rect_width_, rect_height_));
01283 }
01284 
01293 MutableIterator* TessBaseAPI::GetMutableIterator() {
01294   if (tesseract_ == NULL || page_res_ == NULL)
01295     return NULL;
01296   return new MutableIterator(page_res_, tesseract_,
01297                              thresholder_->GetScaleFactor(),
01298                              thresholder_->GetScaledYResolution(),
01299                              rect_left_, rect_top_, rect_width_, rect_height_);
01300 }
01301 
01303 char* TessBaseAPI::GetUTF8Text() {
01304   if (tesseract_ == NULL ||
01305       (!recognition_done_ && Recognize(NULL) < 0))
01306     return NULL;
01307   STRING text("");
01308   ResultIterator *it = GetIterator();
01309   do {
01310     if (it->Empty(RIL_PARA)) continue;
01311     char *para_text = it->GetUTF8Text(RIL_PARA);
01312     text += para_text;
01313     delete []para_text;
01314   } while (it->Next(RIL_PARA));
01315   char* result = new char[text.length() + 1];
01316   strncpy(result, text.string(), text.length() + 1);
01317   delete it;
01318   return result;
01319 }
01320 
01324 static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
01325   tesseract::Orientation orientation;
01326   tesseract::WritingDirection writing_direction;
01327   tesseract::TextlineOrder textline_order;
01328   float deskew_angle;
01329   it->Orientation(&orientation, &writing_direction, &textline_order,
01330                   &deskew_angle);
01331   return orientation;
01332 }
01333 
01342 static void AddBaselineCoordsTohOCR(const PageIterator *it,
01343                                     PageIteratorLevel level,
01344                                     STRING* hocr_str) {
01345   tesseract::Orientation orientation = GetBlockTextOrientation(it);
01346   if (orientation != ORIENTATION_PAGE_UP) {
01347     hocr_str->add_str_int("; textangle ", 360 - orientation * 90);
01348     return;
01349   }
01350 
01351   int left, top, right, bottom;
01352   it->BoundingBox(level, &left, &top, &right, &bottom);
01353 
01354   // Try to get the baseline coordinates at this level.
01355   int x1, y1, x2, y2;
01356   if (!it->Baseline(level, &x1, &y1, &x2, &y2))
01357     return;
01358   // Following the description of this field of the hOCR spec, we convert the
01359   // baseline coordinates so that "the bottom left of the bounding box is the
01360   // origin".
01361   x1 -= left;
01362   x2 -= left;
01363   y1 -= bottom;
01364   y2 -= bottom;
01365 
01366   // Now fit a line through the points so we can extract coefficients for the
01367   // equation:  y = p1 x + p0
01368   double p1 = 0;
01369   double p0 = 0;
01370   if (x1 == x2) {
01371     // Problem computing the polynomial coefficients.
01372     return;
01373   }
01374   p1 = (y2 - y1) / static_cast<double>(x2 - x1);
01375   p0 = y1 - static_cast<double>(p1 * x1);
01376 
01377   hocr_str->add_str_double("; baseline ", round(p1 * 1000.0) / 1000.0);
01378   hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0);
01379 }
01380 
01381 static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, int num2) {
01382   unsigned long bufsize = base.length() + 2 * kMaxIntSize;
01383   char id_buffer[bufsize];
01384   if (num2 >= 0) {
01385     snprintf(id_buffer, bufsize - 1, "%s_%d_%d", base.c_str(), num1, num2);
01386   } else {
01387     snprintf(id_buffer, bufsize - 1, "%s_%d", base.c_str(), num1);
01388   }
01389   id_buffer[bufsize - 1] = '\0';
01390   *hocr_str += " id='";
01391   *hocr_str += id_buffer;
01392   *hocr_str += "'";
01393 }
01394 
01395 static void AddBoxTohOCR(const ResultIterator *it,
01396                          PageIteratorLevel level,
01397                          STRING* hocr_str) {
01398   int left, top, right, bottom;
01399   it->BoundingBox(level, &left, &top, &right, &bottom);
01400   // This is the only place we use double quotes instead of single quotes,
01401   // but it may too late to change for consistency
01402   hocr_str->add_str_int(" title=\"bbox ", left);
01403   hocr_str->add_str_int(" ", top);
01404   hocr_str->add_str_int(" ", right);
01405   hocr_str->add_str_int(" ", bottom);
01406   // Add baseline coordinates & heights for textlines only.
01407   if (level == RIL_TEXTLINE) {
01408     AddBaselineCoordsTohOCR(it, level, hocr_str);
01409     // add custom height measures
01410     float row_height, descenders, ascenders;  // row attributes
01411     it->RowAttributes(&row_height, &descenders, &ascenders);
01412     // TODO: Do we want to limit these to a single decimal place?
01413     hocr_str->add_str_double("; x_size ", row_height);
01414     hocr_str->add_str_double("; x_descenders ", descenders * -1);
01415     hocr_str->add_str_double("; x_ascenders ", ascenders);
01416   }
01417   *hocr_str += "\">";
01418 }
01419 
01428 char* TessBaseAPI::GetHOCRText(int page_number) {
01429   if (tesseract_ == NULL ||
01430       (page_res_ == NULL && Recognize(NULL) < 0))
01431     return NULL;
01432 
01433   int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
01434   int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
01435   bool font_info = false;
01436   GetBoolVariable("hocr_font_info", &font_info);
01437 
01438   STRING hocr_str("");
01439 
01440   if (input_file_ == NULL)
01441       SetInputName(NULL);
01442 
01443 #ifdef _WIN32
01444   // convert input name from ANSI encoding to utf-8
01445   int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
01446                                       NULL, 0);
01447   wchar_t *uni16_str = new WCHAR[str16_len];
01448   str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
01449                                   uni16_str, str16_len);
01450   int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL,
01451                                      0, NULL, NULL);
01452   char *utf8_str = new char[utf8_len];
01453   WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
01454                       utf8_len, NULL, NULL);
01455   *input_file_ = utf8_str;
01456   delete[] uni16_str;
01457   delete[] utf8_str;
01458 #endif
01459 
01460   hocr_str += "  <div class='ocr_page'";
01461   AddIdTohOCR(&hocr_str, "page", page_id, -1);
01462   hocr_str += " title='image \"";
01463   if (input_file_) {
01464     hocr_str += HOcrEscape(input_file_->string());
01465   } else {
01466     hocr_str += "unknown";
01467   }
01468   hocr_str.add_str_int("\"; bbox ", rect_left_);
01469   hocr_str.add_str_int(" ", rect_top_);
01470   hocr_str.add_str_int(" ", rect_width_);
01471   hocr_str.add_str_int(" ", rect_height_);
01472   hocr_str.add_str_int("; ppageno ", page_number);
01473   hocr_str += "'>\n";
01474 
01475   ResultIterator *res_it = GetIterator();
01476   while (!res_it->Empty(RIL_BLOCK)) {
01477     if (res_it->Empty(RIL_WORD)) {
01478       res_it->Next(RIL_WORD);
01479       continue;
01480     }
01481 
01482     // Open any new block/paragraph/textline.
01483     if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
01484       hocr_str += "   <div class='ocr_carea'";
01485       AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
01486       AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
01487     }
01488     if (res_it->IsAtBeginningOf(RIL_PARA)) {
01489       hocr_str += "\n    <p class='ocr_par'";
01490       if (res_it->ParagraphIsLtr()) {
01491         hocr_str += " dir='ltr'";
01492       } else {
01493         hocr_str += " dir='rtl'";
01494       }
01495       AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
01496       AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
01497     }
01498     if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
01499       hocr_str += "\n     <span class='ocr_line'";
01500       AddIdTohOCR(&hocr_str, "line", page_id, lcnt);
01501       AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
01502     }
01503 
01504     // Now, process the word...
01505     hocr_str += "<span class='ocrx_word'";
01506     AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
01507     int left, top, right, bottom;
01508     bool bold, italic, underlined, monospace, serif, smallcaps;
01509     int pointsize, font_id;
01510     const char *font_name;
01511     res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
01512     font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
01513                                            &monospace, &serif, &smallcaps,
01514                                            &pointsize, &font_id);
01515     hocr_str.add_str_int(" title='bbox ", left);
01516     hocr_str.add_str_int(" ", top);
01517     hocr_str.add_str_int(" ", right);
01518     hocr_str.add_str_int(" ", bottom);
01519     hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
01520     if (font_info) {
01521       if (font_name) {
01522         hocr_str += "; x_font ";
01523         hocr_str += HOcrEscape(font_name);
01524       }
01525       hocr_str.add_str_int("; x_fsize ", pointsize);
01526     }
01527     hocr_str += "'";
01528     if (res_it->WordRecognitionLanguage()) {
01529       hocr_str += " lang='";
01530       hocr_str += res_it->WordRecognitionLanguage();
01531       hocr_str += "'";
01532     }
01533     switch (res_it->WordDirection()) {
01534       case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
01535       case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
01536       default:  // Do nothing.
01537         break;
01538     }
01539     hocr_str += ">";
01540     bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
01541     bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
01542     bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
01543     if (bold) hocr_str += "<strong>";
01544     if (italic) hocr_str += "<em>";
01545     do {
01546       const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
01547       if (grapheme && grapheme[0] != 0) {
01548         hocr_str += HOcrEscape(grapheme);
01549       }
01550       delete []grapheme;
01551       res_it->Next(RIL_SYMBOL);
01552     } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
01553     if (italic) hocr_str += "</em>";
01554     if (bold) hocr_str += "</strong>";
01555     hocr_str += "</span> ";
01556     wcnt++;
01557     // Close any ending block/paragraph/textline.
01558     if (last_word_in_line) {
01559       hocr_str += "\n     </span>";
01560       lcnt++;
01561     }
01562     if (last_word_in_para) {
01563       hocr_str += "\n    </p>\n";
01564       pcnt++;
01565     }
01566     if (last_word_in_block) {
01567       hocr_str += "   </div>\n";
01568       bcnt++;
01569     }
01570   }
01571   hocr_str += "  </div>\n";
01572 
01573   char *ret = new char[hocr_str.length() + 1];
01574   strcpy(ret, hocr_str.string());
01575   delete res_it;
01576   return ret;
01577 }
01578 
01580 const int kNumbersPerBlob = 5;
01585 const int kBytesPerNumber = 5;
01591 const int kBytesPerBlob = kNumbersPerBlob * (kBytesPerNumber + 1) + 1;
01592 const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1;
01594 const int kBytesPer64BitNumber = 20;
01601 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 +
01602     UNICHAR_LEN;
01603 
01609 char* TessBaseAPI::GetBoxText(int page_number) {
01610   if (tesseract_ == NULL ||
01611       (!recognition_done_ && Recognize(NULL) < 0))
01612     return NULL;
01613   int blob_count;
01614   int utf8_length = TextLength(&blob_count);
01615   int total_length = blob_count * kBytesPerBoxFileLine + utf8_length +
01616       kMaxBytesPerLine;
01617   char* result = new char[total_length];
01618   strcpy(result, "\0");
01619   int output_length = 0;
01620   LTRResultIterator* it = GetLTRIterator();
01621   do {
01622     int left, top, right, bottom;
01623     if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
01624       char* text = it->GetUTF8Text(RIL_SYMBOL);
01625       // Tesseract uses space for recognition failure. Fix to a reject
01626       // character, kTesseractReject so we don't create illegal box files.
01627       for (int i = 0; text[i] != '\0'; ++i) {
01628         if (text[i] == ' ')
01629           text[i] = kTesseractReject;
01630       }
01631       snprintf(result + output_length, total_length - output_length,
01632                "%s %d %d %d %d %d\n",
01633                text, left, image_height_ - bottom,
01634                right, image_height_ - top, page_number);
01635       output_length += strlen(result + output_length);
01636       delete [] text;
01637       // Just in case...
01638       if (output_length + kMaxBytesPerLine > total_length)
01639         break;
01640     }
01641   } while (it->Next(RIL_SYMBOL));
01642   delete it;
01643   return result;
01644 }
01645 
// Parallel, zero-terminated tables: a code point found at index j of
// kUniChs is replaced by the code point at index j of kLatinChs.
const int kUniChs[] = {
  0x20ac,  // euro sign
  0x201c,  // left double quotation mark
  0x201d,  // right double quotation mark
  0x2018,  // left single quotation mark
  0x2019,  // right single quotation mark
  0x2022,  // bullet
  0x2014,  // em dash
  0        // terminator
};
const int kLatinChs[] = {
  0x00a2,  // cent sign
  0x0022,  // quotation mark
  0x0022,  // quotation mark
  0x0027,  // apostrophe
  0x0027,  // apostrophe
  0x00b7,  // middle dot
  0x002d,  // hyphen-minus
  0        // terminator
};
01658 
01664 char* TessBaseAPI::GetUNLVText() {
01665   if (tesseract_ == NULL ||
01666       (!recognition_done_ && Recognize(NULL) < 0))
01667     return NULL;
01668   bool tilde_crunch_written = false;
01669   bool last_char_was_newline = true;
01670   bool last_char_was_tilde = false;
01671 
01672   int total_length = TextLength(NULL);
01673   PAGE_RES_IT   page_res_it(page_res_);
01674   char* result = new char[total_length];
01675   char* ptr = result;
01676   for (page_res_it.restart_page(); page_res_it.word () != NULL;
01677        page_res_it.forward()) {
01678     WERD_RES *word = page_res_it.word();
01679     // Process the current word.
01680     if (word->unlv_crunch_mode != CR_NONE) {
01681       if (word->unlv_crunch_mode != CR_DELETE &&
01682           (!tilde_crunch_written ||
01683            (word->unlv_crunch_mode == CR_KEEP_SPACE &&
01684             word->word->space() > 0 &&
01685             !word->word->flag(W_FUZZY_NON) &&
01686             !word->word->flag(W_FUZZY_SP)))) {
01687         if (!word->word->flag(W_BOL) &&
01688             word->word->space() > 0 &&
01689             !word->word->flag(W_FUZZY_NON) &&
01690             !word->word->flag(W_FUZZY_SP)) {
01691           /* Write a space to separate from preceding good text */
01692           *ptr++ = ' ';
01693           last_char_was_tilde = false;
01694         }
01695         if (!last_char_was_tilde) {
01696           // Write a reject char.
01697           last_char_was_tilde = true;
01698           *ptr++ = kUNLVReject;
01699           tilde_crunch_written = true;
01700           last_char_was_newline = false;
01701         }
01702       }
01703     } else {
01704       // NORMAL PROCESSING of non tilde crunched words.
01705       tilde_crunch_written = false;
01706       tesseract_->set_unlv_suspects(word);
01707       const char* wordstr = word->best_choice->unichar_string().string();
01708       const STRING& lengths = word->best_choice->unichar_lengths();
01709       int length = lengths.length();
01710       int i = 0;
01711       int offset = 0;
01712 
01713       if (last_char_was_tilde &&
01714           word->word->space() == 0 && wordstr[offset] == ' ') {
01715         // Prevent adjacent tilde across words - we know that adjacent tildes
01716         // within words have been removed.
01717         // Skip the first character.
01718         offset = lengths[i++];
01719       }
01720       if (i < length && wordstr[offset] != 0) {
01721         if (!last_char_was_newline)
01722           *ptr++ = ' ';
01723         else
01724           last_char_was_newline = false;
01725         for (; i < length; offset += lengths[i++]) {
01726           if (wordstr[offset] == ' ' ||
01727               wordstr[offset] == kTesseractReject) {
01728             *ptr++ = kUNLVReject;
01729             last_char_was_tilde = true;
01730           } else {
01731             if (word->reject_map[i].rejected())
01732               *ptr++ = kUNLVSuspect;
01733             UNICHAR ch(wordstr + offset, lengths[i]);
01734             int uni_ch = ch.first_uni();
01735             for (int j = 0; kUniChs[j] != 0; ++j) {
01736               if (kUniChs[j] == uni_ch) {
01737                 uni_ch = kLatinChs[j];
01738                 break;
01739               }
01740             }
01741             if (uni_ch <= 0xff) {
01742               *ptr++ = static_cast<char>(uni_ch);
01743               last_char_was_tilde = false;
01744             } else {
01745               *ptr++ = kUNLVReject;
01746               last_char_was_tilde = true;
01747             }
01748           }
01749         }
01750       }
01751     }
01752     if (word->word->flag(W_EOL) && !last_char_was_newline) {
01753       /* Add a new line output */
01754       *ptr++ = '\n';
01755       tilde_crunch_written = false;
01756       last_char_was_newline = true;
01757       last_char_was_tilde = false;
01758     }
01759   }
01760   *ptr++ = '\n';
01761   *ptr = '\0';
01762   return result;
01763 }
01764 
01770 char* TessBaseAPI::GetOsdText(int page_number) {
01771   OSResults osr;
01772 
01773   bool osd = DetectOS(&osr);
01774   if (!osd) {
01775      return NULL;
01776   }
01777 
01778   int orient_id = osr.best_result.orientation_id;
01779   int script_id = osr.get_best_script(orient_id);
01780   float orient_conf = osr.best_result.oconfidence;
01781   float script_conf = osr.best_result.sconfidence;
01782   const char* script_name = 
01783       osr.unicharset->get_script_from_script_id(script_id);
01784 
01785   // clockwise orientation of the input image, in degrees
01786   int orient_deg = orient_id * 90; 
01787 
01788   // clockwise rotation needed to make the page upright
01789   int rotate =  OrientationIdToValue(orient_id);
01790 
01791   char* osd_buf = new char[255];
01792   snprintf(osd_buf, 255,
01793           "Page number: %d\n"
01794           "Orientation in degrees: %d\n"
01795           "Rotate: %d\n"
01796           "Orientation confidence: %.2f\n"
01797           "Script: %s\n"
01798           "Script confidence: %.2f\n",
01799           page_number,
01800           orient_deg, rotate, orient_conf,
01801           script_name, script_conf);
01802 
01803   return osd_buf;
01804 }
01805 
01807 int TessBaseAPI::MeanTextConf() {
01808   int* conf = AllWordConfidences();
01809   if (!conf) return 0;
01810   int sum = 0;
01811   int *pt = conf;
01812   while (*pt >= 0) sum += *pt++;
01813   if (pt != conf) sum /= pt - conf;
01814   delete [] conf;
01815   return sum;
01816 }
01817 
01819 int* TessBaseAPI::AllWordConfidences() {
01820   if (tesseract_ == NULL ||
01821       (!recognition_done_ && Recognize(NULL) < 0))
01822     return NULL;
01823   int n_word = 0;
01824   PAGE_RES_IT res_it(page_res_);
01825   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
01826     n_word++;
01827 
01828   int* conf = new int[n_word+1];
01829   n_word = 0;
01830   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
01831     WERD_RES *word = res_it.word();
01832     WERD_CHOICE* choice = word->best_choice;
01833     int w_conf = static_cast<int>(100 + 5 * choice->certainty());
01834                  // This is the eq for converting Tesseract confidence to 1..100
01835     if (w_conf < 0) w_conf = 0;
01836     if (w_conf > 100) w_conf = 100;
01837     conf[n_word++] = w_conf;
01838   }
01839   conf[n_word] = -1;
01840   return conf;
01841 }
01842 
01853 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) {
01854   int debug = 0;
01855   GetIntVariable("applybox_debug", &debug);
01856   bool success = true;
01857   PageSegMode current_psm = GetPageSegMode();
01858   SetPageSegMode(mode);
01859   SetVariable("classify_enable_learning", "0");
01860   char* text = GetUTF8Text();
01861   if (debug) {
01862     tprintf("Trying to adapt \"%s\" to \"%s\"\n", text, wordstr);
01863   }
01864   if (text != NULL) {
01865     PAGE_RES_IT it(page_res_);
01866     WERD_RES* word_res = it.word();
01867     if (word_res != NULL) {
01868       word_res->word->set_text(wordstr);
01869     } else {
01870       success = false;
01871     }
01872     // Check to see if text matches wordstr.
01873     int w = 0;
01874     int t = 0;
01875     for (t = 0; text[t] != '\0'; ++t) {
01876       if (text[t] == '\n' || text[t] == ' ')
01877         continue;
01878       while (wordstr[w] != '\0' && wordstr[w] == ' ')
01879         ++w;
01880       if (text[t] != wordstr[w])
01881         break;
01882       ++w;
01883     }
01884     if (text[t] != '\0' || wordstr[w] != '\0') {
01885       // No match.
01886       delete page_res_;
01887       GenericVector<TBOX> boxes;
01888       page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_);
01889       tesseract_->ReSegmentByClassification(page_res_);
01890       tesseract_->TidyUp(page_res_);
01891       PAGE_RES_IT pr_it(page_res_);
01892       if (pr_it.word() == NULL)
01893         success = false;
01894       else
01895         word_res = pr_it.word();
01896     } else {
01897       word_res->BestChoiceToCorrectText();
01898     }
01899     if (success) {
01900       tesseract_->EnableLearning = true;
01901       tesseract_->LearnWord(NULL, word_res);
01902     }
01903     delete [] text;
01904   } else {
01905     success = false;
01906   }
01907   SetPageSegMode(current_psm);
01908   return success;
01909 }
01910 
01917 void TessBaseAPI::Clear() {
01918   if (thresholder_ != NULL)
01919     thresholder_->Clear();
01920   ClearResults();
01921   SetInputImage(NULL);
01922 }
01923 
01930 void TessBaseAPI::End() {
01931   if (thresholder_ != NULL) {
01932     delete thresholder_;
01933     thresholder_ = NULL;
01934   }
01935   if (page_res_ != NULL) {
01936     delete page_res_;
01937     page_res_ = NULL;
01938   }
01939   if (block_list_ != NULL) {
01940     delete block_list_;
01941     block_list_ = NULL;
01942   }
01943   if (paragraph_models_ != NULL) {
01944     paragraph_models_->delete_data_pointers();
01945     delete paragraph_models_;
01946     paragraph_models_ = NULL;
01947   }
01948   if (tesseract_ != NULL) {
01949     delete tesseract_;
01950     if (osd_tesseract_ == tesseract_)
01951       osd_tesseract_ = NULL;
01952     tesseract_ = NULL;
01953   }
01954   if (osd_tesseract_ != NULL) {
01955     delete osd_tesseract_;
01956     osd_tesseract_ = NULL;
01957   }
01958   if (equ_detect_ != NULL) {
01959     delete equ_detect_;
01960     equ_detect_ = NULL;
01961   }
01962   if (input_file_ != NULL) {
01963     delete input_file_;
01964     input_file_ = NULL;
01965   }
01966   if (input_image_ != NULL) {
01967     pixDestroy(&input_image_);
01968     input_image_ = NULL;
01969   }
01970   if (output_file_ != NULL) {
01971     delete output_file_;
01972     output_file_ = NULL;
01973   }
01974   if (datapath_ != NULL) {
01975     delete datapath_;
01976     datapath_ = NULL;
01977   }
01978   if (language_ != NULL) {
01979     delete language_;
01980     language_ = NULL;
01981   }
01982 }
01983 
01984 // Clear any library-level memory caches.
01985 // There are a variety of expensive-to-load constant data structures (mostly
01986 // language dictionaries) that are cached globally -- surviving the Init()
01987 // and End() of individual TessBaseAPI's.  This function allows the clearing
01988 // of these caches.
01989 void TessBaseAPI::ClearPersistentCache() {
01990   Dict::GlobalDawgCache()->DeleteUnusedDawgs();
01991 }
01992 
01997 int TessBaseAPI::IsValidWord(const char *word) {
01998   return tesseract_->getDict().valid_word(word);
01999 }
02000 // Returns true if utf8_character is defined in the UniCharset.
02001 bool TessBaseAPI::IsValidCharacter(const char *utf8_character) {
02002     return tesseract_->unicharset.contains_unichar(utf8_character);
02003 }
02004 
02005 
02006 // TODO(rays) Obsolete this function and replace with a more aptly named
02007 // function that returns image coordinates rather than tesseract coordinates.
02008 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
02009   PageIterator* it = AnalyseLayout();
02010   if (it == NULL) {
02011     return false;
02012   }
02013   int x1, x2, y1, y2;
02014   it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
02015   // Calculate offset and slope (NOTE: Kind of ugly)
02016   if (x2 <= x1) x2 = x1 + 1;
02017   // Convert the point pair to slope/offset of the baseline (in image coords.)
02018   *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
02019   *out_offset = static_cast<int>(y1 - *out_slope * x1);
02020   // Get the y-coord of the baseline at the left and right edges of the
02021   // textline's bounding box.
02022   int left, top, right, bottom;
02023   if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
02024     delete it;
02025     return false;
02026   }
02027   int left_y = IntCastRounded(*out_slope * left + *out_offset);
02028   int right_y = IntCastRounded(*out_slope * right + *out_offset);
02029   // Shift the baseline down so it passes through the nearest bottom-corner
02030   // of the textline's bounding box. This is the difference between the y
02031   // at the lowest (max) edge of the box and the actual box bottom.
02032   *out_offset += bottom - MAX(left_y, right_y);
02033   // Switch back to bottom-up tesseract coordinates. Requires negation of
02034   // the slope and height - offset for the offset.
02035   *out_slope = -*out_slope;
02036   *out_offset = rect_height_ - *out_offset;
02037   delete it;
02038 
02039   return true;
02040 }
02041 
02043 void TessBaseAPI::SetDictFunc(DictFunc f) {
02044   if (tesseract_ != NULL) {
02045     tesseract_->getDict().letter_is_okay_ = f;
02046   }
02047 }
02048 
02057 void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) {
02058   if (tesseract_ != NULL) {
02059     tesseract_->getDict().probability_in_context_ = f;
02060     // Set it for the sublangs too.
02061     int num_subs = tesseract_->num_sub_langs();
02062     for (int i = 0; i < num_subs; ++i) {
02063       tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f;
02064     }
02065   }
02066 }
02067 
02069 void TessBaseAPI::SetFillLatticeFunc(FillLatticeFunc f) {
02070   if (tesseract_ != NULL) tesseract_->fill_lattice_ = f;
02071 }
02072 
02074 bool TessBaseAPI::InternalSetImage() {
02075   if (tesseract_ == NULL) {
02076     tprintf("Please call Init before attempting to set an image.");
02077     return false;
02078   }
02079   if (thresholder_ == NULL)
02080     thresholder_ = new ImageThresholder;
02081   ClearResults();
02082   return true;
02083 }
02084 
02091 void TessBaseAPI::Threshold(Pix** pix) {
02092   ASSERT_HOST(pix != NULL);
02093   if (*pix != NULL)
02094     pixDestroy(pix);
02095   // Zero resolution messes up the algorithms, so make sure it is credible.
02096   int y_res = thresholder_->GetScaledYResolution();
02097   if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
02098     // Use the minimum default resolution, as it is safer to under-estimate
02099     // than over-estimate resolution.
02100     thresholder_->SetSourceYResolution(kMinCredibleResolution);
02101   }
02102   PageSegMode pageseg_mode =
02103       static_cast<PageSegMode>(
02104           static_cast<int>(tesseract_->tessedit_pageseg_mode));
02105   thresholder_->ThresholdToPix(pageseg_mode, pix);
02106   thresholder_->GetImageSizes(&rect_left_, &rect_top_,
02107                               &rect_width_, &rect_height_,
02108                               &image_width_, &image_height_);
02109   if (!thresholder_->IsBinary()) {
02110     tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds());
02111     tesseract_->set_pix_grey(thresholder_->GetPixRectGrey());
02112   } else {
02113     tesseract_->set_pix_thresholds(NULL);
02114     tesseract_->set_pix_grey(NULL);
02115   }
02116   // Set the internal resolution that is used for layout parameters from the
02117   // estimated resolution, rather than the image resolution, which may be
02118   // fabricated, but we will use the image resolution, if there is one, to
02119   // report output point sizes.
02120   int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
02121                                   kMinCredibleResolution,
02122                                   kMaxCredibleResolution);
02123   if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
02124     tprintf("Estimated resolution %d out of range! Corrected to %d\n",
02125             thresholder_->GetScaledEstimatedResolution(), estimated_res);
02126   }
02127   tesseract_->set_source_resolution(estimated_res);
02128   SavePixForCrash(estimated_res, *pix);
02129 }
02130 
02132 int TessBaseAPI::FindLines() {
02133   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
02134     tprintf("Please call SetImage before attempting recognition.");
02135     return -1;
02136   }
02137   if (recognition_done_)
02138     ClearResults();
02139   if (!block_list_->empty()) {
02140     return 0;
02141   }
02142   if (tesseract_ == NULL) {
02143     tesseract_ = new Tesseract;
02144     tesseract_->InitAdaptiveClassifier(false);
02145   }
02146   if (tesseract_->pix_binary() == NULL)
02147     Threshold(tesseract_->mutable_pix_binary());
02148   if (tesseract_->ImageWidth() > MAX_INT16 ||
02149       tesseract_->ImageHeight() > MAX_INT16) {
02150     tprintf("Image too large: (%d, %d)\n",
02151             tesseract_->ImageWidth(), tesseract_->ImageHeight());
02152     return -1;
02153   }
02154 
02155   tesseract_->PrepareForPageseg();
02156 
02157   if (tesseract_->textord_equation_detect) {
02158     if (equ_detect_ == NULL && datapath_ != NULL) {
02159       equ_detect_ = new EquationDetect(datapath_->string(), NULL);
02160     }
02161     tesseract_->SetEquationDetect(equ_detect_);
02162   }
02163 
02164   Tesseract* osd_tess = osd_tesseract_;
02165   OSResults osr;
02166   if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == NULL) {
02167     if (strcmp(language_->string(), "osd") == 0) {
02168       osd_tess = tesseract_;
02169     } else {
02170       osd_tesseract_ = new Tesseract;
02171       if (osd_tesseract_->init_tesseract(
02172           datapath_->string(), NULL, "osd", OEM_TESSERACT_ONLY,
02173           NULL, 0, NULL, NULL, false) == 0) {
02174         osd_tess = osd_tesseract_;
02175         osd_tesseract_->set_source_resolution(
02176             thresholder_->GetSourceYResolution());
02177       } else {
02178         tprintf("Warning: Auto orientation and script detection requested,"
02179                 " but osd language failed to load\n");
02180         delete osd_tesseract_;
02181         osd_tesseract_ = NULL;
02182       }
02183     }
02184   }
02185 
02186   if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0)
02187     return -1;
02188   // If Devanagari is being recognized, we use different images for page seg
02189   // and for OCR.
02190   tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
02191   return 0;
02192 }
02193 
02195 void TessBaseAPI::ClearResults() {
02196   if (tesseract_ != NULL) {
02197     tesseract_->Clear();
02198   }
02199   if (page_res_ != NULL) {
02200     delete page_res_;
02201     page_res_ = NULL;
02202   }
02203   recognition_done_ = false;
02204   if (block_list_ == NULL)
02205     block_list_ = new BLOCK_LIST;
02206   else
02207     block_list_->clear();
02208   if (paragraph_models_ != NULL) {
02209     paragraph_models_->delete_data_pointers();
02210     delete paragraph_models_;
02211     paragraph_models_ = NULL;
02212   }
02213   SavePixForCrash(0, NULL);
02214 }
02215 
02223 int TessBaseAPI::TextLength(int* blob_count) {
02224   if (tesseract_ == NULL || page_res_ == NULL)
02225     return 0;
02226 
02227   PAGE_RES_IT   page_res_it(page_res_);
02228   int total_length = 2;
02229   int total_blobs = 0;
02230   // Iterate over the data structures to extract the recognition result.
02231   for (page_res_it.restart_page(); page_res_it.word () != NULL;
02232        page_res_it.forward()) {
02233     WERD_RES *word = page_res_it.word();
02234     WERD_CHOICE* choice = word->best_choice;
02235     if (choice != NULL) {
02236       total_blobs += choice->length() + 2;
02237       total_length += choice->unichar_string().length() + 2;
02238       for (int i = 0; i < word->reject_map.length(); ++i) {
02239         if (word->reject_map[i].rejected())
02240           ++total_length;
02241       }
02242     }
02243   }
02244   if (blob_count != NULL)
02245     *blob_count = total_blobs;
02246   return total_length;
02247 }
02248 
02253 bool TessBaseAPI::DetectOS(OSResults* osr) {
02254   if (tesseract_ == NULL)
02255     return false;
02256   ClearResults();
02257   if (tesseract_->pix_binary() == NULL)
02258     Threshold(tesseract_->mutable_pix_binary());
02259   if (input_file_ == NULL)
02260     input_file_ = new STRING(kInputFile);
02261   return orientation_and_script_detection(*input_file_, osr, tesseract_);
02262 }
02263 
02264 void TessBaseAPI::set_min_orientation_margin(double margin) {
02265   tesseract_->min_orientation_margin.set_value(margin);
02266 }
02267 
02282 void TessBaseAPI::GetBlockTextOrientations(int** block_orientation,
02283                                            bool** vertical_writing) {
02284   delete[] *block_orientation;
02285   *block_orientation = NULL;
02286   delete[] *vertical_writing;
02287   *vertical_writing = NULL;
02288   BLOCK_IT block_it(block_list_);
02289 
02290   block_it.move_to_first();
02291   int num_blocks = 0;
02292   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
02293     if (!block_it.data()->poly_block()->IsText()) {
02294       continue;
02295     }
02296     ++num_blocks;
02297   }
02298   if (!num_blocks) {
02299     tprintf("WARNING: Found no blocks\n");
02300     return;
02301   }
02302   *block_orientation = new int[num_blocks];
02303   *vertical_writing = new bool[num_blocks];
02304   block_it.move_to_first();
02305   int i = 0;
02306   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
02307        block_it.forward()) {
02308     if (!block_it.data()->poly_block()->IsText()) {
02309       continue;
02310     }
02311     FCOORD re_rotation = block_it.data()->re_rotation();
02312     float re_theta = re_rotation.angle();
02313     FCOORD classify_rotation = block_it.data()->classify_rotation();
02314     float classify_theta = classify_rotation.angle();
02315     double rot_theta = - (re_theta - classify_theta) * 2.0 / PI;
02316     if (rot_theta < 0) rot_theta += 4;
02317     int num_rotations = static_cast<int>(rot_theta + 0.5);
02318     (*block_orientation)[i] = num_rotations;
02319     // The classify_rotation is non-zero only if the text has vertical
02320     // writing direction.
02321     (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
02322     ++i;
02323   }
02324 }
02325 
02326 // ____________________________________________________________________________
02327 // Ocropus add-ons.
02328 
02330 BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
02331   FindLines();
02332   BLOCK_LIST* result = block_list_;
02333   block_list_ = NULL;
02334   return result;
02335 }
02336 
02342 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
02343   delete block_list;
02344 }
02345 
02346 
02347 ROW *TessBaseAPI::MakeTessOCRRow(float baseline,
02348                                  float xheight,
02349                                  float descender,
02350                                  float ascender) {
02351   inT32 xstarts[] = {-32000};
02352   double quad_coeffs[] = {0, 0, baseline};
02353   return new ROW(1,
02354                  xstarts,
02355                  quad_coeffs,
02356                  xheight,
02357                  ascender - (baseline + xheight),
02358                  descender - baseline,
02359                  0,
02360                  0);
02361 }
02362 
02364 TBLOB *TessBaseAPI::MakeTBLOB(Pix *pix) {
02365   int width = pixGetWidth(pix);
02366   int height = pixGetHeight(pix);
02367   BLOCK block("a character", TRUE, 0, 0, 0, 0, width, height);
02368 
02369   // Create C_BLOBs from the page
02370   extract_edges(pix, &block);
02371 
02372   // Merge all C_BLOBs
02373   C_BLOB_LIST *list = block.blob_list();
02374   C_BLOB_IT c_blob_it(list);
02375   if (c_blob_it.empty())
02376     return NULL;
02377   // Move all the outlines to the first blob.
02378   C_OUTLINE_IT ol_it(c_blob_it.data()->out_list());
02379   for (c_blob_it.forward();
02380        !c_blob_it.at_first();
02381        c_blob_it.forward()) {
02382       C_BLOB *c_blob = c_blob_it.data();
02383       ol_it.add_list_after(c_blob->out_list());
02384   }
02385   // Convert the first blob to the output TBLOB.
02386   return TBLOB::PolygonalCopy(false, c_blob_it.data());
02387 }
02388 
02394 void TessBaseAPI::NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode) {
02395   TBOX box = tblob->bounding_box();
02396   float x_center = (box.left() + box.right()) / 2.0f;
02397   float baseline = row->base_line(x_center);
02398   float scale = kBlnXHeight / row->x_height();
02399   tblob->Normalize(NULL, NULL, NULL, x_center, baseline, scale, scale,
02400                    0.0f, static_cast<float>(kBlnBaselineOffset), false, NULL);
02401 }
02402 
02407 TBLOB *make_tesseract_blob(float baseline, float xheight,
02408                            float descender, float ascender,
02409                            bool numeric_mode, Pix* pix) {
02410   TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix);
02411 
02412   // Normalize TBLOB
02413   ROW *row =
02414       TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
02415   TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode);
02416   delete row;
02417   return tblob;
02418 }
02419 
02425 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
02426                                    int length,
02427                                    float baseline,
02428                                    float xheight,
02429                                    float descender,
02430                                    float ascender) {
02431   UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
02432   TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender,
02433                                     tesseract_->classify_bln_numeric_mode,
02434                                     tesseract_->pix_binary());
02435   float threshold;
02436   float best_rating = -100;
02437 
02438 
02439   // Classify to get a raw choice.
02440   BLOB_CHOICE_LIST choices;
02441   tesseract_->AdaptiveClassifier(blob, &choices);
02442   BLOB_CHOICE_IT choice_it;
02443   choice_it.set_to_list(&choices);
02444   for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
02445        choice_it.forward()) {
02446     if (choice_it.data()->rating() > best_rating) {
02447       best_rating = choice_it.data()->rating();
02448     }
02449   }
02450 
02451   threshold = tesseract_->matcher_good_threshold;
02452 
02453   if (blob->outlines)
02454     tesseract_->AdaptToChar(blob, id, kUnknownFontinfoId, threshold,
02455                             tesseract_->AdaptedTemplates);
02456   delete blob;
02457 }
02458 
02459 
02460 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
02461   PAGE_RES *page_res = new PAGE_RES(false, block_list,
02462                                     &(tesseract_->prev_word_best_choice_));
02463   tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1);
02464   return page_res;
02465 }
02466 
02467 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
02468                                         PAGE_RES* pass1_result) {
02469   if (!pass1_result)
02470     pass1_result = new PAGE_RES(false, block_list,
02471                                 &(tesseract_->prev_word_best_choice_));
02472   tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2);
02473   return pass1_result;
02474 }
02475 
02476 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
02477   int debug_level = 0;
02478   GetIntVariable("paragraph_debug_level", &debug_level);
02479   if (paragraph_models_ == NULL)
02480     paragraph_models_ = new GenericVector<ParagraphModel*>;
02481   MutableIterator *result_it = GetMutableIterator();
02482   do {  // Detect paragraphs for this block
02483     GenericVector<ParagraphModel *> models;
02484     ::tesseract::DetectParagraphs(debug_level, after_text_recognition,
02485                                   result_it, &models);
02486     *paragraph_models_ += models;
02487   } while (result_it->Next(RIL_BLOCK));
02488   delete result_it;
02489 }
02490 
02491 struct TESS_CHAR : ELIST_LINK {
02492   char *unicode_repr;
02493   int length;  // of unicode_repr
02494   float cost;
02495   TBOX box;
02496 
02497   TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
02498     length = (len == -1 ? strlen(repr) : len);
02499     unicode_repr = new char[length + 1];
02500     strncpy(unicode_repr, repr, length);
02501   }
02502 
02503   TESS_CHAR() {  // Satisfies ELISTIZE.
02504   }
02505   ~TESS_CHAR() {
02506     delete [] unicode_repr;
02507   }
02508 };
02509 
02510 ELISTIZEH(TESS_CHAR)
02511 ELISTIZE(TESS_CHAR)
02512 
02513 static void add_space(TESS_CHAR_IT* it) {
02514   TESS_CHAR *t = new TESS_CHAR(0, " ");
02515   it->add_after_then_move(t);
02516 }
02517 
02518 
// Maps a rating to a cost by adding 100 and clamping negative results to 0.
// (Ratings worse than -100 have never been observed, but the clamp is cheap
// insurance.)
static float rating_to_cost(float rating) {
  const float shifted = 100 + rating;
  return shifted < 0 ? 0.0f : shifted;
}
02527 
02532 static void extract_result(TESS_CHAR_IT* out,
02533                            PAGE_RES* page_res) {
02534   PAGE_RES_IT page_res_it(page_res);
02535   int word_count = 0;
02536   while (page_res_it.word() != NULL) {
02537     WERD_RES *word = page_res_it.word();
02538     const char *str = word->best_choice->unichar_string().string();
02539     const char *len = word->best_choice->unichar_lengths().string();
02540     TBOX real_rect = word->word->bounding_box();
02541 
02542     if (word_count)
02543       add_space(out);
02544     int n = strlen(len);
02545     for (int i = 0; i < n; i++) {
02546       TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
02547                                     str, *len);
02548       tc->box = real_rect.intersection(word->box_word->BlobBox(i));
02549       out->add_after_then_move(tc);
02550        str += *len;
02551       len++;
02552     }
02553     page_res_it.forward();
02554     word_count++;
02555   }
02556 }
02557 
02562 int TessBaseAPI::TesseractExtractResult(char** text,
02563                                         int** lengths,
02564                                         float** costs,
02565                                         int** x0,
02566                                         int** y0,
02567                                         int** x1,
02568                                         int** y1,
02569                                         PAGE_RES* page_res) {
02570   TESS_CHAR_LIST tess_chars;
02571   TESS_CHAR_IT tess_chars_it(&tess_chars);
02572   extract_result(&tess_chars_it, page_res);
02573   tess_chars_it.move_to_first();
02574   int n = tess_chars.length();
02575   int text_len = 0;
02576   *lengths = new int[n];
02577   *costs = new float[n];
02578   *x0 = new int[n];
02579   *y0 = new int[n];
02580   *x1 = new int[n];
02581   *y1 = new int[n];
02582   int i = 0;
02583   for (tess_chars_it.mark_cycle_pt();
02584        !tess_chars_it.cycled_list();
02585        tess_chars_it.forward(), i++) {
02586     TESS_CHAR *tc = tess_chars_it.data();
02587     text_len += (*lengths)[i] = tc->length;
02588     (*costs)[i] = tc->cost;
02589     (*x0)[i] = tc->box.left();
02590     (*y0)[i] = tc->box.bottom();
02591     (*x1)[i] = tc->box.right();
02592     (*y1)[i] = tc->box.top();
02593   }
02594   char *p = *text = new char[text_len];
02595 
02596   tess_chars_it.move_to_first();
02597   for (tess_chars_it.mark_cycle_pt();
02598         !tess_chars_it.cycled_list();
02599        tess_chars_it.forward()) {
02600     TESS_CHAR *tc = tess_chars_it.data();
02601     strncpy(p, tc->unicode_repr, tc->length);
02602     p += tc->length;
02603   }
02604   return n;
02605 }
02606 
02608 // The resulting features are returned in int_features, which must be
02609 // of size MAX_NUM_INT_FEATURES. The number of features is returned in
02610 // num_features (or 0 if there was a failure).
02611 // On return feature_outline_index is filled with an index of the outline
02612 // corresponding to each feature in int_features.
02613 // TODO(rays) Fix the caller to out outline_counts instead.
02614 void TessBaseAPI::GetFeaturesForBlob(TBLOB* blob,
02615                                      INT_FEATURE_STRUCT* int_features,
02616                                      int* num_features,
02617                                      int* feature_outline_index) {
02618   GenericVector<int> outline_counts;
02619   GenericVector<INT_FEATURE_STRUCT> bl_features;
02620   GenericVector<INT_FEATURE_STRUCT> cn_features;
02621   INT_FX_RESULT_STRUCT fx_info;
02622   tesseract_->ExtractFeatures(*blob, false, &bl_features,
02623                               &cn_features, &fx_info, &outline_counts);
02624   if (cn_features.size() == 0 || cn_features.size() > MAX_NUM_INT_FEATURES) {
02625     *num_features = 0;
02626     return;  // Feature extraction failed.
02627   }
02628   *num_features = cn_features.size();
02629   memcpy(int_features, &cn_features[0], *num_features * sizeof(cn_features[0]));
02630   // TODO(rays) Pass outline_counts back and simplify the calling code.
02631   if (feature_outline_index != NULL) {
02632     int f = 0;
02633     for (int i = 0; i < outline_counts.size(); ++i) {
02634       while (f < outline_counts[i])
02635         feature_outline_index[f++] = i;
02636     }
02637   }
02638 }
02639 
02640 // This method returns the row to which a box of specified dimensions would
02641 // belong. If no good match is found, it returns NULL.
02642 ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks,
02643                                 int left, int top, int right, int bottom) {
02644   TBOX box(left, bottom, right, top);
02645   BLOCK_IT b_it(blocks);
02646   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
02647     BLOCK* block = b_it.data();
02648     if (!box.major_overlap(block->bounding_box()))
02649       continue;
02650     ROW_IT r_it(block->row_list());
02651     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
02652       ROW* row = r_it.data();
02653       if (!box.major_overlap(row->bounding_box()))
02654         continue;
02655       WERD_IT w_it(row->word_list());
02656       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
02657         WERD* word = w_it.data();
02658         if (box.major_overlap(word->bounding_box()))
02659           return row;
02660       }
02661     }
02662   }
02663   return NULL;
02664 }
02665 
02667 void TessBaseAPI::RunAdaptiveClassifier(TBLOB* blob,
02668                                         int num_max_matches,
02669                                         int* unichar_ids,
02670                                         float* ratings,
02671                                         int* num_matches_returned) {
02672   BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
02673   tesseract_->AdaptiveClassifier(blob, choices);
02674   BLOB_CHOICE_IT choices_it(choices);
02675   int& index = *num_matches_returned;
02676   index = 0;
02677   for (choices_it.mark_cycle_pt();
02678        !choices_it.cycled_list() && index < num_max_matches;
02679        choices_it.forward()) {
02680     BLOB_CHOICE* choice = choices_it.data();
02681     unichar_ids[index] = choice->unichar_id();
02682     ratings[index] = choice->rating();
02683     ++index;
02684   }
02685   *num_matches_returned = index;
02686   delete choices;
02687 }
02688 
02690 const char* TessBaseAPI::GetUnichar(int unichar_id) {
02691   return tesseract_->unicharset.id_to_unichar(unichar_id);
02692 }
02693 
02695 const Dawg *TessBaseAPI::GetDawg(int i) const {
02696   if (tesseract_ == NULL || i >= NumDawgs()) return NULL;
02697   return tesseract_->getDict().GetDawg(i);
02698 }
02699 
02701 int TessBaseAPI::NumDawgs() const {
02702   return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
02703 }
02704 
02705 #ifndef NO_CUBE_BUILD
02706 
02707 CubeRecoContext *TessBaseAPI::GetCubeRecoContext() const {
02708   return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext();
02709 }
02710 #endif  // NO_CUBE_BUILD
02711 
02713 STRING HOcrEscape(const char* text) {
02714   STRING ret;
02715   const char *ptr;
02716   for (ptr = text; *ptr; ptr++) {
02717     switch (*ptr) {
02718       case '<': ret += "&lt;"; break;
02719       case '>': ret += "&gt;"; break;
02720       case '&': ret += "&amp;"; break;
02721       case '"': ret += "&quot;"; break;
02722       case '\'': ret += "&#39;"; break;
02723       default: ret += *ptr;
02724     }
02725   }
02726   return ret;
02727 }
02728 
02729 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines