// tesseract 3.04.01
00001 /****************************************************************** 00002 * File: output.cpp (Formerly output.c) 00003 * Description: Output pass 00004 * Author: Phil Cheatle 00005 * Created: Thu Aug 4 10:56:08 BST 1994 00006 * 00007 * (C) Copyright 1994, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include <string.h> 00025 #include <ctype.h> 00026 #ifdef __UNIX__ 00027 #include <assert.h> 00028 #include <unistd.h> 00029 #include <errno.h> 00030 #endif 00031 #include "helpers.h" 00032 #include "tessvars.h" 00033 #include "control.h" 00034 #include "reject.h" 00035 #include "docqual.h" 00036 #include "output.h" 00037 #include "globals.h" 00038 #include "tesseractclass.h" 00039 00040 #define EPAPER_EXT ".ep" 00041 #define PAGE_YSIZE 3508 00042 #define CTRL_INSET '\024' //dc4=text inset 00043 #define CTRL_FONT '\016' //so=font change 00044 #define CTRL_DEFAULT '\017' //si=default font 00045 #define CTRL_SHIFT '\022' //dc2=x shift 00046 #define CTRL_TAB '\011' //tab 00047 #define CTRL_NEWLINE '\012' //newline 00048 #define CTRL_HARDLINE '\015' //cr 00049 00050 /********************************************************************** 00051 * pixels_to_pts 00052 * 00053 * Convert an integer number of pixels 
to the nearest integer 00054 * number of points. 00055 **********************************************************************/ 00056 00057 inT32 pixels_to_pts( //convert coords 00058 inT32 pixels, 00059 inT32 pix_res //resolution 00060 ) { 00061 float pts; //converted value 00062 00063 pts = pixels * 72.0 / pix_res; 00064 return (inT32) (pts + 0.5); //round it 00065 } 00066 00067 namespace tesseract { 00068 void Tesseract::output_pass( //Tess output pass //send to api 00069 PAGE_RES_IT &page_res_it, 00070 const TBOX *target_word_box) { 00071 BLOCK_RES *block_of_last_word; 00072 BOOL8 force_eol; //During output 00073 BLOCK *nextblock; //block of next word 00074 WERD *nextword; //next word 00075 00076 page_res_it.restart_page (); 00077 block_of_last_word = NULL; 00078 while (page_res_it.word () != NULL) { 00079 check_debug_pt (page_res_it.word (), 120); 00080 00081 if (target_word_box) 00082 { 00083 00084 TBOX current_word_box=page_res_it.word ()->word->bounding_box(); 00085 FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2); 00086 if (!target_word_box->contains(center_pt)) 00087 { 00088 page_res_it.forward (); 00089 continue; 00090 } 00091 00092 } 00093 if (tessedit_write_block_separators && 00094 block_of_last_word != page_res_it.block ()) { 00095 block_of_last_word = page_res_it.block (); 00096 } 00097 00098 force_eol = (tessedit_write_block_separators && 00099 (page_res_it.block () != page_res_it.next_block ())) || 00100 (page_res_it.next_word () == NULL); 00101 00102 if (page_res_it.next_word () != NULL) 00103 nextword = page_res_it.next_word ()->word; 00104 else 00105 nextword = NULL; 00106 if (page_res_it.next_block () != NULL) 00107 nextblock = page_res_it.next_block ()->block; 00108 else 00109 nextblock = NULL; 00110 //regardless of tilde crunching 00111 write_results(page_res_it, 00112 determine_newline_type(page_res_it.word()->word, 00113 page_res_it.block()->block, 00114 nextword, 
nextblock), force_eol); 00115 page_res_it.forward(); 00116 } 00117 } 00118 00119 00120 /************************************************************************* 00121 * write_results() 00122 * 00123 * All recognition and rejection has now been done. Generate the following: 00124 * .txt file - giving the final best choices with NO highlighting 00125 * .raw file - giving the tesseract top choice output for each word 00126 * .map file - showing how the .txt file has been rejected in the .ep file 00127 * epchoice list - a list of one element per word, containing the text for the 00128 * epaper. Reject strings are inserted. 00129 * inset list - a list of bounding boxes of reject insets - indexed by the 00130 * reject strings in the epchoice text. 00131 *************************************************************************/ 00132 void Tesseract::write_results(PAGE_RES_IT &page_res_it, 00133 char newline_type, // type of newline 00134 BOOL8 force_eol) { // override tilde crunch? 00135 WERD_RES *word = page_res_it.word(); 00136 const UNICHARSET &uchset = *word->uch_set; 00137 int i; 00138 BOOL8 need_reject = FALSE; 00139 UNICHAR_ID space = uchset.unichar_to_id(" "); 00140 00141 if ((word->unlv_crunch_mode != CR_NONE || 00142 word->best_choice->length() == 0) && 00143 !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { 00144 if ((word->unlv_crunch_mode != CR_DELETE) && 00145 (!stats_.tilde_crunch_written || 00146 ((word->unlv_crunch_mode == CR_KEEP_SPACE) && 00147 (word->word->space () > 0) && 00148 !word->word->flag (W_FUZZY_NON) && 00149 !word->word->flag (W_FUZZY_SP)))) { 00150 if (!word->word->flag (W_BOL) && 00151 (word->word->space () > 0) && 00152 !word->word->flag (W_FUZZY_NON) && 00153 !word->word->flag (W_FUZZY_SP)) { 00154 stats_.last_char_was_tilde = false; 00155 } 00156 need_reject = TRUE; 00157 } 00158 if ((need_reject && !stats_.last_char_was_tilde) || 00159 (force_eol && stats_.write_results_empty_block)) { 00160 /* Write a reject char - mark 
as rejected unless zero_rejection mode */ 00161 stats_.last_char_was_tilde = TRUE; 00162 stats_.tilde_crunch_written = true; 00163 stats_.last_char_was_newline = false; 00164 stats_.write_results_empty_block = false; 00165 } 00166 00167 if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) { 00168 stats_.tilde_crunch_written = false; 00169 stats_.last_char_was_newline = true; 00170 stats_.last_char_was_tilde = false; 00171 } 00172 00173 if (force_eol) 00174 stats_.write_results_empty_block = true; 00175 return; 00176 } 00177 00178 /* NORMAL PROCESSING of non tilde crunched words */ 00179 00180 stats_.tilde_crunch_written = false; 00181 if (newline_type) 00182 stats_.last_char_was_newline = true; 00183 else 00184 stats_.last_char_was_newline = false; 00185 stats_.write_results_empty_block = force_eol; // about to write a real word 00186 00187 if (unlv_tilde_crunching && 00188 stats_.last_char_was_tilde && 00189 (word->word->space() == 0) && 00190 !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && 00191 (word->best_choice->unichar_id(0) == space)) { 00192 /* Prevent adjacent tilde across words - we know that adjacent tildes within 00193 words have been removed */ 00194 word->MergeAdjacentBlobs(0); 00195 } 00196 if (newline_type || 00197 (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) 00198 stats_.last_char_was_tilde = false; 00199 else { 00200 if (word->reject_map.length () > 0) { 00201 if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) 00202 stats_.last_char_was_tilde = true; 00203 else 00204 stats_.last_char_was_tilde = false; 00205 } 00206 else if (word->word->space () > 0) 00207 stats_.last_char_was_tilde = false; 00208 /* else it is unchanged as there are no output chars */ 00209 } 00210 00211 ASSERT_HOST (word->best_choice->length() == word->reject_map.length()); 00212 00213 set_unlv_suspects(word); 00214 check_debug_pt (word, 120); 00215 if (tessedit_rejection_debug) { 00216 tprintf 
("Dict word: \"%s\": %d\n", 00217 word->best_choice->debug_string().string(), 00218 dict_word(*(word->best_choice))); 00219 } 00220 if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) { 00221 if (tessedit_zero_rejection) { 00222 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ 00223 for (i = 0; i < word->best_choice->length(); ++i) { 00224 if (word->reject_map[i].rejected()) 00225 word->reject_map[i].setrej_minimal_rej_accept(); 00226 } 00227 } 00228 if (tessedit_minimal_rejection) { 00229 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ 00230 for (i = 0; i < word->best_choice->length(); ++i) { 00231 if ((word->best_choice->unichar_id(i) != space) && 00232 word->reject_map[i].rejected()) 00233 word->reject_map[i].setrej_minimal_rej_accept(); 00234 } 00235 } 00236 } 00237 } 00238 } // namespace tesseract 00239 00240 /********************************************************************** 00241 * determine_newline_type 00242 * 00243 * Find whether we have a wrapping or hard newline. 00244 * Return FALSE if not at end of line. 
00245 **********************************************************************/ 00246 00247 char determine_newline_type( //test line ends 00248 WERD *word, //word to do 00249 BLOCK *block, //current block 00250 WERD *next_word, //next word 00251 BLOCK *next_block //block of next word 00252 ) { 00253 inT16 end_gap; //to right edge 00254 inT16 width; //of next word 00255 TBOX word_box; //bounding 00256 TBOX next_box; //next word 00257 TBOX block_box; //block bounding 00258 00259 if (!word->flag (W_EOL)) 00260 return FALSE; //not end of line 00261 if (next_word == NULL || next_block == NULL || block != next_block) 00262 return CTRL_NEWLINE; 00263 if (next_word->space () > 0) 00264 return CTRL_HARDLINE; //it is tabbed 00265 word_box = word->bounding_box (); 00266 next_box = next_word->bounding_box (); 00267 block_box = block->bounding_box (); 00268 //gap to eol 00269 end_gap = block_box.right () - word_box.right (); 00270 end_gap -= (inT32) block->space (); 00271 width = next_box.right () - next_box.left (); 00272 // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n", 00273 // block_box.right(),word_box.right(),end_gap, 00274 // next_box.right(),next_box.left(),width, 00275 // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE); 00276 return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE; 00277 } 00278 00279 /************************************************************************* 00280 * get_rep_char() 00281 * Return the first accepted character from the repetition string. This is the 00282 * character which is repeated - as determined earlier by fix_rep_char() 00283 *************************************************************************/ 00284 namespace tesseract { 00285 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? 
00286 int i; 00287 for (i = 0; ((i < word->reject_map.length()) && 00288 (word->reject_map[i].rejected())); ++i); 00289 00290 if (i < word->reject_map.length()) { 00291 return word->best_choice->unichar_id(i); 00292 } else { 00293 return word->uch_set->unichar_to_id(unrecognised_char.string()); 00294 } 00295 } 00296 00297 /************************************************************************* 00298 * SUSPECT LEVELS 00299 * 00300 * 0 - don't reject ANYTHING 00301 * 1,2 - partial rejection 00302 * 3 - BEST 00303 * 00304 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and 00305 * tessedit_minimal_rejection. 00306 *************************************************************************/ 00307 void Tesseract::set_unlv_suspects(WERD_RES *word_res) { 00308 int len = word_res->reject_map.length(); 00309 const WERD_CHOICE &word = *(word_res->best_choice); 00310 const UNICHARSET &uchset = *word.unicharset(); 00311 int i; 00312 float rating_per_ch; 00313 00314 if (suspect_level == 0) { 00315 for (i = 0; i < len; i++) { 00316 if (word_res->reject_map[i].rejected()) 00317 word_res->reject_map[i].setrej_minimal_rej_accept(); 00318 } 00319 return; 00320 } 00321 00322 if (suspect_level >= 3) 00323 return; //Use defaults 00324 00325 /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ 00326 00327 if (safe_dict_word(word_res) && 00328 (count_alphas(word) > suspect_short_words)) { 00329 /* Unreject alphas in dictionary words */ 00330 for (i = 0; i < len; ++i) { 00331 if (word_res->reject_map[i].rejected() && 00332 uchset.get_isalpha(word.unichar_id(i))) 00333 word_res->reject_map[i].setrej_minimal_rej_accept(); 00334 } 00335 } 00336 00337 rating_per_ch = word.rating() / word_res->reject_map.length(); 00338 00339 if (rating_per_ch >= suspect_rating_per_ch) 00340 return; //Don't touch bad ratings 00341 00342 if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { 00343 /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ 
00344 for (i = 0; i < len; ++i) { 00345 if (word_res->reject_map[i].rejected() && 00346 (!uchset.eq(word.unichar_id(i), " "))) 00347 word_res->reject_map[i].setrej_minimal_rej_accept(); 00348 } 00349 } 00350 00351 for (i = 0; i < len; i++) { 00352 if (word_res->reject_map[i].rejected()) { 00353 if (word_res->reject_map[i].flag(R_DOC_REJ)) 00354 word_res->reject_map[i].setrej_minimal_rej_accept(); 00355 if (word_res->reject_map[i].flag(R_BLOCK_REJ)) 00356 word_res->reject_map[i].setrej_minimal_rej_accept(); 00357 if (word_res->reject_map[i].flag(R_ROW_REJ)) 00358 word_res->reject_map[i].setrej_minimal_rej_accept(); 00359 } 00360 } 00361 00362 if (suspect_level == 2) 00363 return; 00364 00365 if (!suspect_constrain_1Il || 00366 (word_res->reject_map.length() <= suspect_short_words)) { 00367 for (i = 0; i < len; i++) { 00368 if (word_res->reject_map[i].rejected()) { 00369 if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) || 00370 word_res->reject_map[i].flag(R_POSTNN_1IL))) 00371 word_res->reject_map[i].setrej_minimal_rej_accept(); 00372 00373 if (!suspect_constrain_1Il && 00374 word_res->reject_map[i].flag(R_MM_REJECT)) 00375 word_res->reject_map[i].setrej_minimal_rej_accept(); 00376 } 00377 } 00378 } 00379 00380 if (acceptable_word_string(*word_res->uch_set, 00381 word.unichar_string().string(), 00382 word.unichar_lengths().string()) != 00383 AC_UNACCEPTABLE || 00384 acceptable_number_string(word.unichar_string().string(), 00385 word.unichar_lengths().string())) { 00386 if (word_res->reject_map.length() > suspect_short_words) { 00387 for (i = 0; i < len; i++) { 00388 if (word_res->reject_map[i].rejected() && 00389 (!word_res->reject_map[i].perm_rejected() || 00390 word_res->reject_map[i].flag (R_1IL_CONFLICT) || 00391 word_res->reject_map[i].flag (R_POSTNN_1IL) || 00392 word_res->reject_map[i].flag (R_MM_REJECT))) { 00393 word_res->reject_map[i].setrej_minimal_rej_accept(); 00394 } 00395 } 00396 } 00397 } 00398 } 00399 00400 inT16 Tesseract::count_alphas(const 
WERD_CHOICE &word) { 00401 int count = 0; 00402 for (int i = 0; i < word.length(); ++i) { 00403 if (word.unicharset()->get_isalpha(word.unichar_id(i))) 00404 count++; 00405 } 00406 return count; 00407 } 00408 00409 00410 inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) { 00411 int count = 0; 00412 for (int i = 0; i < word.length(); ++i) { 00413 if (word.unicharset()->get_isalpha(word.unichar_id(i)) || 00414 word.unicharset()->get_isdigit(word.unichar_id(i))) 00415 count++; 00416 } 00417 return count; 00418 } 00419 00420 00421 BOOL8 Tesseract::acceptable_number_string(const char *s, 00422 const char *lengths) { 00423 BOOL8 prev_digit = FALSE; 00424 00425 if (*lengths == 1 && *s == '(') 00426 s++; 00427 00428 if (*lengths == 1 && 00429 ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) 00430 s++; 00431 00432 for (; *s != '\0'; s += *(lengths++)) { 00433 if (unicharset.get_isdigit(s, *lengths)) 00434 prev_digit = TRUE; 00435 else if (prev_digit && 00436 (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) 00437 prev_digit = FALSE; 00438 else if (prev_digit && *lengths == 1 && 00439 (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')'))) 00440 return TRUE; 00441 else if (prev_digit && 00442 *lengths == 1 && (*s == '%') && 00443 (*(lengths + 1) == 1 && *(s + *lengths) == ')') && 00444 (*(s + *lengths + *(lengths + 1)) == '\0')) 00445 return TRUE; 00446 else 00447 return FALSE; 00448 } 00449 return TRUE; 00450 } 00451 } // namespace tesseract