tesseract 3.04.01

ccmain/output.cpp

Go to the documentation of this file.
00001 /******************************************************************
00002  * File:        output.cpp  (Formerly output.c)
00003  * Description: Output pass
00004  * Author:                                      Phil Cheatle
00005  * Created:                                     Thu Aug  4 10:56:08 BST 1994
00006  *
00007  * (C) Copyright 1994, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include <string.h>
00025 #include <ctype.h>
00026 #ifdef __UNIX__
00027 #include          <assert.h>
00028 #include          <unistd.h>
00029 #include          <errno.h>
00030 #endif
00031 #include "helpers.h"
00032 #include "tessvars.h"
00033 #include "control.h"
00034 #include "reject.h"
00035 #include "docqual.h"
00036 #include "output.h"
00037 #include "globals.h"
00038 #include "tesseractclass.h"
00039 
00040 #define EPAPER_EXT      ".ep"
00041 #define PAGE_YSIZE      3508
00042 #define CTRL_INSET      '\024'   //dc4=text inset
00043 #define CTRL_FONT       '\016'   //so=font change
00044 #define CTRL_DEFAULT      '\017' //si=default font
00045 #define CTRL_SHIFT      '\022'   //dc2=x shift
00046 #define CTRL_TAB        '\011'   //tab
00047 #define CTRL_NEWLINE      '\012' //newline
00048 #define CTRL_HARDLINE   '\015'   //cr
00049 
00050 /**********************************************************************
00051  * pixels_to_pts
00052  *
00053  * Convert an integer number of pixels to the nearest integer
00054  * number of points.
00055  **********************************************************************/
00056 
00057 inT32 pixels_to_pts(               //convert coords
00058                     inT32 pixels,
00059                     inT32 pix_res  //resolution
00060                    ) {
00061   float pts;                     //converted value
00062 
00063   pts = pixels * 72.0 / pix_res;
00064   return (inT32) (pts + 0.5);    //round it
00065 }
00066 
00067 namespace tesseract {
00068 void Tesseract::output_pass(  //Tess output pass //send to api
00069                             PAGE_RES_IT &page_res_it,
00070                             const TBOX *target_word_box) {
00071   BLOCK_RES *block_of_last_word;
00072   BOOL8 force_eol;               //During output
00073   BLOCK *nextblock;              //block of next word
00074   WERD *nextword;                //next word
00075 
00076   page_res_it.restart_page ();
00077   block_of_last_word = NULL;
00078   while (page_res_it.word () != NULL) {
00079     check_debug_pt (page_res_it.word (), 120);
00080 
00081         if (target_word_box)
00082         {
00083 
00084                 TBOX current_word_box=page_res_it.word ()->word->bounding_box();
00085                 FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
00086                 if (!target_word_box->contains(center_pt))
00087                 {
00088                         page_res_it.forward ();
00089                         continue;
00090                 }
00091 
00092         }
00093     if (tessedit_write_block_separators &&
00094     block_of_last_word != page_res_it.block ()) {
00095       block_of_last_word = page_res_it.block ();
00096     }
00097 
00098     force_eol = (tessedit_write_block_separators &&
00099       (page_res_it.block () != page_res_it.next_block ())) ||
00100       (page_res_it.next_word () == NULL);
00101 
00102     if (page_res_it.next_word () != NULL)
00103       nextword = page_res_it.next_word ()->word;
00104     else
00105       nextword = NULL;
00106     if (page_res_it.next_block () != NULL)
00107       nextblock = page_res_it.next_block ()->block;
00108     else
00109       nextblock = NULL;
00110                                  //regardless of tilde crunching
00111     write_results(page_res_it,
00112                   determine_newline_type(page_res_it.word()->word,
00113                                          page_res_it.block()->block,
00114                                          nextword, nextblock), force_eol);
00115     page_res_it.forward();
00116   }
00117 }
00118 
00119 
00120 /*************************************************************************
00121  * write_results()
00122  *
00123  * All recognition and rejection has now been done. Generate the following:
00124  *   .txt file     - giving the final best choices with NO highlighting
00125  *   .raw file     - giving the tesseract top choice output for each word
00126  *   .map file     - showing how the .txt file has been rejected in the .ep file
00127  *   epchoice list - a list of one element per word, containing the text for the
00128  *                   epaper. Reject strings are inserted.
00129  *   inset list    - a list of bounding boxes of reject insets - indexed by the
00130  *                   reject strings in the epchoice text.
00131  *************************************************************************/
00132 void Tesseract::write_results(PAGE_RES_IT &page_res_it,
00133                               char newline_type,  // type of newline
00134                               BOOL8 force_eol) {  // override tilde crunch?
00135   WERD_RES *word = page_res_it.word();
00136   const UNICHARSET &uchset = *word->uch_set;
00137   int i;
00138   BOOL8 need_reject = FALSE;
00139   UNICHAR_ID space = uchset.unichar_to_id(" ");
00140 
00141   if ((word->unlv_crunch_mode != CR_NONE ||
00142        word->best_choice->length() == 0) &&
00143       !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
00144     if ((word->unlv_crunch_mode != CR_DELETE) &&
00145         (!stats_.tilde_crunch_written ||
00146          ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
00147           (word->word->space () > 0) &&
00148           !word->word->flag (W_FUZZY_NON) &&
00149           !word->word->flag (W_FUZZY_SP)))) {
00150       if (!word->word->flag (W_BOL) &&
00151           (word->word->space () > 0) &&
00152           !word->word->flag (W_FUZZY_NON) &&
00153           !word->word->flag (W_FUZZY_SP)) {
00154         stats_.last_char_was_tilde = false;
00155       }
00156       need_reject = TRUE;
00157     }
00158     if ((need_reject && !stats_.last_char_was_tilde) ||
00159         (force_eol && stats_.write_results_empty_block)) {
00160       /* Write a reject char - mark as rejected unless zero_rejection mode */
00161       stats_.last_char_was_tilde = TRUE;
00162       stats_.tilde_crunch_written = true;
00163       stats_.last_char_was_newline = false;
00164       stats_.write_results_empty_block = false;
00165     }
00166 
00167     if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
00168       stats_.tilde_crunch_written = false;
00169       stats_.last_char_was_newline = true;
00170       stats_.last_char_was_tilde = false;
00171     }
00172 
00173     if (force_eol)
00174       stats_.write_results_empty_block = true;
00175     return;
00176   }
00177 
00178   /* NORMAL PROCESSING of non tilde crunched words */
00179 
00180   stats_.tilde_crunch_written = false;
00181   if (newline_type)
00182     stats_.last_char_was_newline = true;
00183   else
00184     stats_.last_char_was_newline = false;
00185   stats_.write_results_empty_block = force_eol;  // about to write a real word
00186 
00187   if (unlv_tilde_crunching &&
00188       stats_.last_char_was_tilde &&
00189       (word->word->space() == 0) &&
00190       !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
00191       (word->best_choice->unichar_id(0) == space)) {
00192     /* Prevent adjacent tilde across words - we know that adjacent tildes within
00193        words have been removed */
00194     word->MergeAdjacentBlobs(0);
00195   }
00196   if (newline_type ||
00197     (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
00198     stats_.last_char_was_tilde = false;
00199   else {
00200     if (word->reject_map.length () > 0) {
00201       if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
00202         stats_.last_char_was_tilde = true;
00203       else
00204         stats_.last_char_was_tilde = false;
00205     }
00206     else if (word->word->space () > 0)
00207       stats_.last_char_was_tilde = false;
00208     /* else it is unchanged as there are no output chars */
00209   }
00210 
00211   ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
00212 
00213   set_unlv_suspects(word);
00214   check_debug_pt (word, 120);
00215   if (tessedit_rejection_debug) {
00216     tprintf ("Dict word: \"%s\": %d\n",
00217              word->best_choice->debug_string().string(),
00218              dict_word(*(word->best_choice)));
00219   }
00220   if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
00221     if (tessedit_zero_rejection) {
00222       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
00223       for (i = 0; i < word->best_choice->length(); ++i) {
00224         if (word->reject_map[i].rejected())
00225           word->reject_map[i].setrej_minimal_rej_accept();
00226       }
00227     }
00228     if (tessedit_minimal_rejection) {
00229       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
00230       for (i = 0; i < word->best_choice->length(); ++i) {
00231         if ((word->best_choice->unichar_id(i) != space) &&
00232             word->reject_map[i].rejected())
00233           word->reject_map[i].setrej_minimal_rej_accept();
00234       }
00235     }
00236   }
00237 }
00238 }  // namespace tesseract
00239 
00240 /**********************************************************************
00241  * determine_newline_type
00242  *
00243  * Find whether we have a wrapping or hard newline.
00244  * Return FALSE if not at end of line.
00245  **********************************************************************/
00246 
00247 char determine_newline_type(                   //test line ends
00248                             WERD *word,        //word to do
00249                             BLOCK *block,      //current block
00250                             WERD *next_word,   //next word
00251                             BLOCK *next_block  //block of next word
00252                            ) {
00253   inT16 end_gap;                 //to right edge
00254   inT16 width;                   //of next word
00255   TBOX word_box;                  //bounding
00256   TBOX next_box;                  //next word
00257   TBOX block_box;                 //block bounding
00258 
00259   if (!word->flag (W_EOL))
00260     return FALSE;                //not end of line
00261   if (next_word == NULL || next_block == NULL || block != next_block)
00262     return CTRL_NEWLINE;
00263   if (next_word->space () > 0)
00264     return CTRL_HARDLINE;        //it is tabbed
00265   word_box = word->bounding_box ();
00266   next_box = next_word->bounding_box ();
00267   block_box = block->bounding_box ();
00268                                  //gap to eol
00269   end_gap = block_box.right () - word_box.right ();
00270   end_gap -= (inT32) block->space ();
00271   width = next_box.right () - next_box.left ();
00272   //      tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
00273   //              block_box.right(),word_box.right(),end_gap,
00274   //              next_box.right(),next_box.left(),width,
00275   //              end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
00276   return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
00277 }
00278 
00279 /*************************************************************************
00280  * get_rep_char()
00281  * Return the first accepted character from the repetition string. This is the
00282  * character which is repeated - as determined earlier by fix_rep_char()
00283  *************************************************************************/
00284 namespace tesseract {
00285 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) {  // what char is repeated?
00286   int i;
00287   for (i = 0; ((i < word->reject_map.length()) &&
00288                (word->reject_map[i].rejected())); ++i);
00289 
00290   if (i < word->reject_map.length()) {
00291     return word->best_choice->unichar_id(i);
00292   } else {
00293     return word->uch_set->unichar_to_id(unrecognised_char.string());
00294   }
00295 }
00296 
00297 /*************************************************************************
00298  * SUSPECT LEVELS
00299  *
00300  * 0 - don't reject ANYTHING
00301  * 1,2 - partial rejection
00302  * 3 - BEST
00303  *
00304  * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
00305  * tessedit_minimal_rejection.
00306  *************************************************************************/
00307 void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
00308   int len = word_res->reject_map.length();
00309   const WERD_CHOICE &word = *(word_res->best_choice);
00310   const UNICHARSET &uchset = *word.unicharset();
00311   int i;
00312   float rating_per_ch;
00313 
00314   if (suspect_level == 0) {
00315     for (i = 0; i < len; i++) {
00316       if (word_res->reject_map[i].rejected())
00317         word_res->reject_map[i].setrej_minimal_rej_accept();
00318     }
00319     return;
00320   }
00321 
00322   if (suspect_level >= 3)
00323     return;                      //Use defaults
00324 
00325   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
00326 
00327   if (safe_dict_word(word_res) &&
00328       (count_alphas(word) > suspect_short_words)) {
00329     /* Unreject alphas in dictionary words */
00330     for (i = 0; i < len; ++i) {
00331       if (word_res->reject_map[i].rejected() &&
00332           uchset.get_isalpha(word.unichar_id(i)))
00333         word_res->reject_map[i].setrej_minimal_rej_accept();
00334     }
00335   }
00336 
00337   rating_per_ch = word.rating() / word_res->reject_map.length();
00338 
00339   if (rating_per_ch >= suspect_rating_per_ch)
00340     return;                      //Don't touch bad ratings
00341 
00342   if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
00343     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
00344     for (i = 0; i < len; ++i) {
00345       if (word_res->reject_map[i].rejected() &&
00346           (!uchset.eq(word.unichar_id(i), " ")))
00347         word_res->reject_map[i].setrej_minimal_rej_accept();
00348     }
00349   }
00350 
00351   for (i = 0; i < len; i++) {
00352     if (word_res->reject_map[i].rejected()) {
00353       if (word_res->reject_map[i].flag(R_DOC_REJ))
00354         word_res->reject_map[i].setrej_minimal_rej_accept();
00355       if (word_res->reject_map[i].flag(R_BLOCK_REJ))
00356         word_res->reject_map[i].setrej_minimal_rej_accept();
00357       if (word_res->reject_map[i].flag(R_ROW_REJ))
00358         word_res->reject_map[i].setrej_minimal_rej_accept();
00359     }
00360   }
00361 
00362   if (suspect_level == 2)
00363     return;
00364 
00365   if (!suspect_constrain_1Il ||
00366       (word_res->reject_map.length() <= suspect_short_words)) {
00367     for (i = 0; i < len; i++) {
00368       if (word_res->reject_map[i].rejected()) {
00369         if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
00370           word_res->reject_map[i].flag(R_POSTNN_1IL)))
00371           word_res->reject_map[i].setrej_minimal_rej_accept();
00372 
00373         if (!suspect_constrain_1Il &&
00374           word_res->reject_map[i].flag(R_MM_REJECT))
00375           word_res->reject_map[i].setrej_minimal_rej_accept();
00376       }
00377     }
00378   }
00379 
00380   if (acceptable_word_string(*word_res->uch_set,
00381                              word.unichar_string().string(),
00382                              word.unichar_lengths().string()) !=
00383                                  AC_UNACCEPTABLE ||
00384       acceptable_number_string(word.unichar_string().string(),
00385                                word.unichar_lengths().string())) {
00386     if (word_res->reject_map.length() > suspect_short_words) {
00387       for (i = 0; i < len; i++) {
00388         if (word_res->reject_map[i].rejected() &&
00389           (!word_res->reject_map[i].perm_rejected() ||
00390            word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
00391            word_res->reject_map[i].flag (R_POSTNN_1IL) ||
00392            word_res->reject_map[i].flag (R_MM_REJECT))) {
00393           word_res->reject_map[i].setrej_minimal_rej_accept();
00394         }
00395       }
00396     }
00397   }
00398 }
00399 
00400 inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
00401   int count = 0;
00402   for (int i = 0; i < word.length(); ++i) {
00403     if (word.unicharset()->get_isalpha(word.unichar_id(i)))
00404       count++;
00405   }
00406   return count;
00407 }
00408 
00409 
00410 inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) {
00411   int count = 0;
00412   for (int i = 0; i < word.length(); ++i) {
00413     if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
00414         word.unicharset()->get_isdigit(word.unichar_id(i)))
00415       count++;
00416   }
00417   return count;
00418 }
00419 
00420 
00421 BOOL8 Tesseract::acceptable_number_string(const char *s,
00422                                           const char *lengths) {
00423   BOOL8 prev_digit = FALSE;
00424 
00425   if (*lengths == 1 && *s == '(')
00426     s++;
00427 
00428   if (*lengths == 1 &&
00429       ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
00430     s++;
00431 
00432   for (; *s != '\0'; s += *(lengths++)) {
00433     if (unicharset.get_isdigit(s, *lengths))
00434       prev_digit = TRUE;
00435     else if (prev_digit &&
00436              (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
00437       prev_digit = FALSE;
00438     else if (prev_digit && *lengths == 1 &&
00439              (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
00440       return TRUE;
00441     else if (prev_digit &&
00442              *lengths == 1 && (*s == '%') &&
00443              (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
00444              (*(s + *lengths + *(lengths + 1)) == '\0'))
00445       return TRUE;
00446     else
00447       return FALSE;
00448   }
00449   return TRUE;
00450 }
00451 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines