|
tesseract 3.04.01
|
00001 /****************************************************************** 00002 * File: docqual.cpp (Formerly docqual.c) 00003 * Description: Document Quality Metrics 00004 * Author: Phil Cheatle 00005 * Created: Mon May 9 11:27:28 BST 1994 00006 * 00007 * (C) Copyright 1994, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include <ctype.h> 00025 #include "docqual.h" 00026 #include "reject.h" 00027 #include "tesscallback.h" 00028 #include "tessvars.h" 00029 #include "globals.h" 00030 #include "tesseractclass.h" 00031 00032 namespace tesseract{ 00033 00034 // A little class to provide the callbacks as we have no pre-bound args. 00035 struct DocQualCallbacks { 00036 explicit DocQualCallbacks(WERD_RES* word0) 00037 : word(word0), match_count(0), accepted_match_count(0) {} 00038 00039 void CountMatchingBlobs(int index) { 00040 ++match_count; 00041 } 00042 00043 void CountAcceptedBlobs(int index) { 00044 if (word->reject_map[index].accepted()) 00045 ++accepted_match_count; 00046 ++match_count; 00047 } 00048 00049 void AcceptIfGoodQuality(int index) { 00050 if (word->reject_map[index].accept_if_good_quality()) 00051 word->reject_map[index].setrej_quality_accept(); 00052 } 00053 00054 WERD_RES* word; 00055 inT16 match_count; 00056 inT16 accepted_match_count; 00057 }; 00058 00059 /************************************************************************* 00060 * word_blob_quality() 00061 * How many blobs in the box_word are identical to those of the inword? 00062 * ASSUME blobs in both initial word and box_word are in ascending order of 00063 * left hand blob edge. 00064 *************************************************************************/ 00065 inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) { 00066 if (word->bln_boxes == NULL || 00067 word->rebuild_word == NULL || word->rebuild_word->blobs.empty()) 00068 return 0; 00069 00070 DocQualCallbacks cb(word); 00071 word->bln_boxes->ProcessMatchedBlobs( 00072 *word->rebuild_word, 00073 NewPermanentTessCallback(&cb, &DocQualCallbacks::CountMatchingBlobs)); 00074 return cb.match_count; 00075 } 00076 00077 inT16 Tesseract::word_outline_errs(WERD_RES *word) { 00078 inT16 i = 0; 00079 inT16 err_count = 0; 00080 00081 if (word->rebuild_word != NULL) { 00082 for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) { 00083 TBLOB* blob = word->rebuild_word->blobs[b]; 00084 err_count += count_outline_errs(word->best_choice->unichar_string()[i], 00085 blob->NumOutlines()); 00086 i++; 00087 } 00088 } 00089 return err_count; 00090 } 00091 00092 /************************************************************************* 00093 * word_char_quality() 00094 * Combination of blob quality and outline quality - how many good chars are 00095 * there? - I.e chars which pass the blob AND outline tests. 00096 *************************************************************************/ 00097 void Tesseract::word_char_quality(WERD_RES *word, 00098 ROW *row, 00099 inT16 *match_count, 00100 inT16 *accepted_match_count) { 00101 if (word->bln_boxes == NULL || 00102 word->rebuild_word == NULL || word->rebuild_word->blobs.empty()) 00103 return; 00104 00105 DocQualCallbacks cb(word); 00106 word->bln_boxes->ProcessMatchedBlobs( 00107 *word->rebuild_word, 00108 NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs)); 00109 *match_count = cb.match_count; 00110 *accepted_match_count = cb.accepted_match_count; 00111 } 00112 00113 /************************************************************************* 00114 * unrej_good_chs() 00115 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks 00116 *************************************************************************/ 00117 void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) { 00118 if (word->bln_boxes == NULL || 00119 word->rebuild_word == NULL || word->rebuild_word->blobs.empty()) 00120 return; 00121 00122 DocQualCallbacks cb(word); 00123 word->bln_boxes->ProcessMatchedBlobs( 00124 *word->rebuild_word, 00125 NewPermanentTessCallback(&cb, &DocQualCallbacks::AcceptIfGoodQuality)); 00126 } 00127 00128 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) { 00129 int expected_outline_count; 00130 00131 if (STRING (outlines_odd).contains (c)) 00132 return 0; //Don't use this char 00133 else if (STRING (outlines_2).contains (c)) 00134 expected_outline_count = 2; 00135 else 00136 expected_outline_count = 1; 00137 return abs (outline_count - expected_outline_count); 00138 } 00139 00140 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, 00141 BOOL8 good_quality_doc) { 00142 if ((tessedit_good_quality_unrej && good_quality_doc)) 00143 unrej_good_quality_words(page_res_it); 00144 doc_and_block_rejection(page_res_it, good_quality_doc); 00145 if (unlv_tilde_crunching) { 00146 tilde_crunch(page_res_it); 00147 tilde_delete(page_res_it); 00148 } 00149 } 00150 00151 00152 /************************************************************************* 00153 * unrej_good_quality_words() 00154 * Accept potential rejects in words which pass the following checks: 00155 * - Contains a potential reject 00156 * - Word looks like a sensible alpha word. 00157 * - Word segmentation is the same as the original image 00158 * - All characters have the expected number of outlines 00159 * NOTE - the rejection counts are recalculated after unrejection 00160 * - CAN'T do it in a single pass without a bit of fiddling 00161 * - keep it simple but inefficient 00162 *************************************************************************/ 00163 void Tesseract::unrej_good_quality_words( //unreject potential 00164 PAGE_RES_IT &page_res_it) { 00165 WERD_RES *word; 00166 ROW_RES *current_row; 00167 BLOCK_RES *current_block; 00168 int i; 00169 00170 page_res_it.restart_page (); 00171 while (page_res_it.word () != NULL) { 00172 check_debug_pt (page_res_it.word (), 100); 00173 if (bland_unrej) { 00174 word = page_res_it.word (); 00175 for (i = 0; i < word->reject_map.length (); i++) { 00176 if (word->reject_map[i].accept_if_good_quality ()) 00177 word->reject_map[i].setrej_quality_accept (); 00178 } 00179 page_res_it.forward (); 00180 } 00181 else if ((page_res_it.row ()->char_count > 0) && 00182 ((page_res_it.row ()->rej_count / 00183 (float) page_res_it.row ()->char_count) <= 00184 quality_rowrej_pc)) { 00185 word = page_res_it.word (); 00186 if (word->reject_map.quality_recoverable_rejects() && 00187 (tessedit_unrej_any_wd || 00188 acceptable_word_string(*word->uch_set, 00189 word->best_choice->unichar_string().string(), 00190 word->best_choice->unichar_lengths().string()) 00191 != AC_UNACCEPTABLE)) { 00192 unrej_good_chs(word, page_res_it.row ()->row); 00193 } 00194 page_res_it.forward (); 00195 } 00196 else { 00197 /* Skip to end of dodgy row */ 00198 current_row = page_res_it.row (); 00199 while ((page_res_it.word () != NULL) && 00200 (page_res_it.row () == current_row)) 00201 page_res_it.forward (); 00202 } 00203 check_debug_pt (page_res_it.word (), 110); 00204 } 00205 page_res_it.restart_page (); 00206 page_res_it.page_res->char_count = 0; 00207 page_res_it.page_res->rej_count = 0; 00208 current_block = NULL; 00209 current_row = NULL; 00210 while (page_res_it.word () != NULL) { 00211 if (current_block != page_res_it.block ()) { 00212 current_block = page_res_it.block (); 00213 current_block->char_count = 0; 00214 current_block->rej_count = 0; 00215 } 00216 if (current_row != page_res_it.row ()) { 00217 current_row = page_res_it.row (); 00218 current_row->char_count = 0; 00219 current_row->rej_count = 0; 00220 current_row->whole_word_rej_count = 0; 00221 } 00222 page_res_it.rej_stat_word (); 00223 page_res_it.forward (); 00224 } 00225 } 00226 00227 00228 /************************************************************************* 00229 * doc_and_block_rejection() 00230 * 00231 * If the page has too many rejects - reject all of it. 00232 * If any block has too many rejects - reject all words in the block 00233 *************************************************************************/ 00234 00235 void Tesseract::doc_and_block_rejection( //reject big chunks 00236 PAGE_RES_IT &page_res_it, 00237 BOOL8 good_quality_doc) { 00238 inT16 block_no = 0; 00239 inT16 row_no = 0; 00240 BLOCK_RES *current_block; 00241 ROW_RES *current_row; 00242 00243 BOOL8 rej_word; 00244 BOOL8 prev_word_rejected; 00245 inT16 char_quality = 0; 00246 inT16 accepted_char_quality; 00247 00248 if (page_res_it.page_res->rej_count * 100.0 / 00249 page_res_it.page_res->char_count > tessedit_reject_doc_percent) { 00250 reject_whole_page(page_res_it); 00251 if (tessedit_debug_doc_rejection) { 00252 tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", 00253 page_res_it.page_res->char_count, 00254 page_res_it.page_res->rej_count); 00255 } 00256 } else { 00257 if (tessedit_debug_doc_rejection) { 00258 tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", 00259 page_res_it.page_res->char_count, 00260 page_res_it.page_res->rej_count); 00261 } 00262 00263 /* Walk blocks testing for block rejection */ 00264 00265 page_res_it.restart_page(); 00266 WERD_RES* word; 00267 while ((word = page_res_it.word()) != NULL) { 00268 current_block = page_res_it.block(); 00269 block_no = current_block->block->index(); 00270 if (current_block->char_count > 0 && 00271 (current_block->rej_count * 100.0 / current_block->char_count) > 00272 tessedit_reject_block_percent) { 00273 if (tessedit_debug_block_rejection) { 00274 tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", 00275 block_no, current_block->char_count, 00276 current_block->rej_count); 00277 } 00278 prev_word_rejected = FALSE; 00279 while ((word = page_res_it.word()) != NULL && 00280 (page_res_it.block() == current_block)) { 00281 if (tessedit_preserve_blk_rej_perfect_wds) { 00282 rej_word = word->reject_map.reject_count() > 0 || 00283 word->reject_map.length () < tessedit_preserve_min_wd_len; 00284 if (rej_word && tessedit_dont_blkrej_good_wds && 00285 word->reject_map.length() >= tessedit_preserve_min_wd_len && 00286 acceptable_word_string( 00287 *word->uch_set, 00288 word->best_choice->unichar_string().string(), 00289 word->best_choice->unichar_lengths().string()) != 00290 AC_UNACCEPTABLE) { 00291 word_char_quality(word, page_res_it.row()->row, 00292 &char_quality, 00293 &accepted_char_quality); 00294 rej_word = char_quality != word->reject_map.length(); 00295 } 00296 } else { 00297 rej_word = TRUE; 00298 } 00299 if (rej_word) { 00300 /* 00301 Reject spacing if both current and prev words are rejected. 00302 NOTE - this is NOT restricted to FUZZY spaces. - When tried this 00303 generated more space errors. 00304 */ 00305 if (tessedit_use_reject_spaces && 00306 prev_word_rejected && 00307 page_res_it.prev_row() == page_res_it.row() && 00308 word->word->space() == 1) 00309 word->reject_spaces = TRUE; 00310 word->reject_map.rej_word_block_rej(); 00311 } 00312 prev_word_rejected = rej_word; 00313 page_res_it.forward(); 00314 } 00315 } else { 00316 if (tessedit_debug_block_rejection) { 00317 tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", 00318 block_no, page_res_it.block()->char_count, 00319 page_res_it.block()->rej_count); 00320 } 00321 00322 /* Walk rows in block testing for row rejection */ 00323 row_no = 0; 00324 while (page_res_it.word() != NULL && 00325 page_res_it.block() == current_block) { 00326 current_row = page_res_it.row(); 00327 row_no++; 00328 /* Reject whole row if: 00329 fraction of chars on row which are rejected exceed a limit AND 00330 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a 00331 limit 00332 */ 00333 if (current_row->char_count > 0 && 00334 (current_row->rej_count * 100.0 / current_row->char_count) > 00335 tessedit_reject_row_percent && 00336 (current_row->whole_word_rej_count * 100.0 / 00337 current_row->rej_count) < 00338 tessedit_whole_wd_rej_row_percent) { 00339 if (tessedit_debug_block_rejection) { 00340 tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", 00341 row_no, current_row->char_count, 00342 current_row->rej_count); 00343 } 00344 prev_word_rejected = FALSE; 00345 while ((word = page_res_it.word()) != NULL && 00346 page_res_it.row () == current_row) { 00347 /* Preserve words on good docs unless they are mostly rejected*/ 00348 if (!tessedit_row_rej_good_docs && good_quality_doc) { 00349 rej_word = word->reject_map.reject_count() / 00350 static_cast<float>(word->reject_map.length()) > 00351 tessedit_good_doc_still_rowrej_wd; 00352 } else if (tessedit_preserve_row_rej_perfect_wds) { 00353 /* Preserve perfect words anyway */ 00354 rej_word = word->reject_map.reject_count() > 0 || 00355 word->reject_map.length () < tessedit_preserve_min_wd_len; 00356 if (rej_word && tessedit_dont_rowrej_good_wds && 00357 word->reject_map.length() >= tessedit_preserve_min_wd_len && 00358 acceptable_word_string(*word->uch_set, 00359 word->best_choice->unichar_string().string(), 00360 word->best_choice->unichar_lengths().string()) != 00361 AC_UNACCEPTABLE) { 00362 word_char_quality(word, page_res_it.row()->row, 00363 &char_quality, 00364 &accepted_char_quality); 00365 rej_word = char_quality != word->reject_map.length(); 00366 } 00367 } else { 00368 rej_word = TRUE; 00369 } 00370 if (rej_word) { 00371 /* 00372 Reject spacing if both current and prev words are rejected. 00373 NOTE - this is NOT restricted to FUZZY spaces. - When tried 00374 this generated more space errors. 00375 */ 00376 if (tessedit_use_reject_spaces && 00377 prev_word_rejected && 00378 page_res_it.prev_row() == page_res_it.row() && 00379 word->word->space () == 1) 00380 word->reject_spaces = TRUE; 00381 word->reject_map.rej_word_row_rej(); 00382 } 00383 prev_word_rejected = rej_word; 00384 page_res_it.forward(); 00385 } 00386 } else { 00387 if (tessedit_debug_block_rejection) { 00388 tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", 00389 row_no, current_row->char_count, current_row->rej_count); 00390 } 00391 while (page_res_it.word() != NULL && 00392 page_res_it.row() == current_row) 00393 page_res_it.forward(); 00394 } 00395 } 00396 } 00397 } 00398 } 00399 } 00400 00401 } // namespace tesseract 00402 00403 00404 /************************************************************************* 00405 * reject_whole_page() 00406 * Don't believe any of it - set the reject map to 00..00 in all words 00407 * 00408 *************************************************************************/ 00409 00410 void reject_whole_page(PAGE_RES_IT &page_res_it) { 00411 page_res_it.restart_page (); 00412 while (page_res_it.word () != NULL) { 00413 page_res_it.word ()->reject_map.rej_word_doc_rej (); 00414 page_res_it.forward (); 00415 } 00416 //whole page is rejected 00417 page_res_it.page_res->rejected = TRUE; 00418 } 00419 00420 namespace tesseract { 00421 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) { 00422 WERD_RES *word; 00423 GARBAGE_LEVEL garbage_level; 00424 PAGE_RES_IT copy_it; 00425 BOOL8 prev_potential_marked = FALSE; 00426 BOOL8 found_terrible_word = FALSE; 00427 BOOL8 ok_dict_word; 00428 00429 page_res_it.restart_page(); 00430 while (page_res_it.word() != NULL) { 00431 POLY_BLOCK* pb = page_res_it.block()->block->poly_block(); 00432 if (pb != NULL && !pb->IsText()) { 00433 page_res_it.forward(); 00434 continue; 00435 } 00436 word = page_res_it.word(); 00437 00438 if (crunch_early_convert_bad_unlv_chs) 00439 convert_bad_unlv_chs(word); 00440 00441 if (crunch_early_merge_tess_fails) 00442 word->merge_tess_fails(); 00443 00444 if (word->reject_map.accept_count () != 0) { 00445 found_terrible_word = FALSE; 00446 //Forget earlier potential crunches 00447 prev_potential_marked = FALSE; 00448 } 00449 else { 00450 ok_dict_word = safe_dict_word(word); 00451 garbage_level = garbage_word (word, ok_dict_word); 00452 00453 if ((garbage_level != G_NEVER_CRUNCH) && 00454 (terrible_word_crunch (word, garbage_level))) { 00455 if (crunch_debug > 0) { 00456 tprintf ("T CRUNCHING: \"%s\"\n", 00457 word->best_choice->unichar_string().string()); 00458 } 00459 word->unlv_crunch_mode = CR_KEEP_SPACE; 00460 if (prev_potential_marked) { 00461 while (copy_it.word () != word) { 00462 if (crunch_debug > 0) { 00463 tprintf ("P1 CRUNCHING: \"%s\"\n", 00464 copy_it.word()->best_choice->unichar_string().string()); 00465 } 00466 copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE; 00467 copy_it.forward (); 00468 } 00469 prev_potential_marked = FALSE; 00470 } 00471 found_terrible_word = TRUE; 00472 } 00473 else if ((garbage_level != G_NEVER_CRUNCH) && 00474 (potential_word_crunch (word, 00475 garbage_level, ok_dict_word))) { 00476 if (found_terrible_word) { 00477 if (crunch_debug > 0) { 00478 tprintf ("P2 CRUNCHING: \"%s\"\n", 00479 word->best_choice->unichar_string().string()); 00480 } 00481 word->unlv_crunch_mode = CR_KEEP_SPACE; 00482 } 00483 else if (!prev_potential_marked) { 00484 copy_it = page_res_it; 00485 prev_potential_marked = TRUE; 00486 if (crunch_debug > 1) { 00487 tprintf ("P3 CRUNCHING: \"%s\"\n", 00488 word->best_choice->unichar_string().string()); 00489 } 00490 } 00491 } 00492 else { 00493 found_terrible_word = FALSE; 00494 //Forget earlier potential crunches 00495 prev_potential_marked = FALSE; 00496 if (crunch_debug > 2) { 00497 tprintf ("NO CRUNCH: \"%s\"\n", 00498 word->best_choice->unichar_string().string()); 00499 } 00500 } 00501 } 00502 page_res_it.forward (); 00503 } 00504 } 00505 00506 00507 BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word, 00508 GARBAGE_LEVEL garbage_level) { 00509 float rating_per_ch; 00510 int adjusted_len; 00511 int crunch_mode = 0; 00512 00513 if ((word->best_choice->unichar_string().length () == 0) || 00514 (strspn (word->best_choice->unichar_string().string(), " ") == 00515 word->best_choice->unichar_string().length ())) 00516 crunch_mode = 1; 00517 else { 00518 adjusted_len = word->reject_map.length (); 00519 if (adjusted_len > crunch_rating_max) 00520 adjusted_len = crunch_rating_max; 00521 rating_per_ch = word->best_choice->rating () / adjusted_len; 00522 00523 if (rating_per_ch > crunch_terrible_rating) 00524 crunch_mode = 2; 00525 else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) 00526 crunch_mode = 3; 00527 else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) && 00528 (garbage_level != G_OK)) 00529 crunch_mode = 4; 00530 else if ((rating_per_ch > crunch_poor_garbage_rate) && 00531 (garbage_level != G_OK)) 00532 crunch_mode = 5; 00533 } 00534 if (crunch_mode > 0) { 00535 if (crunch_debug > 2) { 00536 tprintf ("Terrible_word_crunch (%d) on \"%s\"\n", 00537 crunch_mode, word->best_choice->unichar_string().string()); 00538 } 00539 return TRUE; 00540 } 00541 else 00542 return FALSE; 00543 } 00544 00545 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word, 00546 GARBAGE_LEVEL garbage_level, 00547 BOOL8 ok_dict_word) { 00548 float rating_per_ch; 00549 int adjusted_len; 00550 const char *str = word->best_choice->unichar_string().string(); 00551 const char *lengths = word->best_choice->unichar_lengths().string(); 00552 BOOL8 word_crunchable; 00553 int poor_indicator_count = 0; 00554 00555 word_crunchable = !crunch_leave_accept_strings || 00556 word->reject_map.length() < 3 || 00557 (acceptable_word_string(*word->uch_set, 00558 str, lengths) == AC_UNACCEPTABLE && 00559 !ok_dict_word); 00560 00561 adjusted_len = word->reject_map.length(); 00562 if (adjusted_len > 10) 00563 adjusted_len = 10; 00564 rating_per_ch = word->best_choice->rating() / adjusted_len; 00565 00566 if (rating_per_ch > crunch_pot_poor_rate) { 00567 if (crunch_debug > 2) { 00568 tprintf("Potential poor rating on \"%s\"\n", 00569 word->best_choice->unichar_string().string()); 00570 } 00571 poor_indicator_count++; 00572 } 00573 00574 if (word_crunchable && 00575 word->best_choice->certainty() < crunch_pot_poor_cert) { 00576 if (crunch_debug > 2) { 00577 tprintf("Potential poor cert on \"%s\"\n", 00578 word->best_choice->unichar_string().string()); 00579 } 00580 poor_indicator_count++; 00581 } 00582 00583 if (garbage_level != G_OK) { 00584 if (crunch_debug > 2) { 00585 tprintf("Potential garbage on \"%s\"\n", 00586 word->best_choice->unichar_string().string()); 00587 } 00588 poor_indicator_count++; 00589 } 00590 return poor_indicator_count >= crunch_pot_indicators; 00591 } 00592 00593 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { 00594 WERD_RES *word; 00595 PAGE_RES_IT copy_it; 00596 BOOL8 deleting_from_bol = FALSE; 00597 BOOL8 marked_delete_point = FALSE; 00598 inT16 debug_delete_mode; 00599 CRUNCH_MODE delete_mode; 00600 inT16 x_debug_delete_mode; 00601 CRUNCH_MODE x_delete_mode; 00602 00603 page_res_it.restart_page(); 00604 while (page_res_it.word() != NULL) { 00605 word = page_res_it.word(); 00606 00607 delete_mode = word_deletable (word, debug_delete_mode); 00608 if (delete_mode != CR_NONE) { 00609 if (word->word->flag (W_BOL) || deleting_from_bol) { 00610 if (crunch_debug > 0) { 00611 tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n", 00612 debug_delete_mode, 00613 word->best_choice->unichar_string().string()); 00614 } 00615 word->unlv_crunch_mode = delete_mode; 00616 deleting_from_bol = TRUE; 00617 } else if (word->word->flag(W_EOL)) { 00618 if (marked_delete_point) { 00619 while (copy_it.word() != word) { 00620 x_delete_mode = word_deletable (copy_it.word (), 00621 x_debug_delete_mode); 00622 if (crunch_debug > 0) { 00623 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", 00624 x_debug_delete_mode, 00625 copy_it.word()->best_choice->unichar_string().string()); 00626 } 00627 copy_it.word ()->unlv_crunch_mode = x_delete_mode; 00628 copy_it.forward (); 00629 } 00630 } 00631 if (crunch_debug > 0) { 00632 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", 00633 debug_delete_mode, 00634 word->best_choice->unichar_string().string()); 00635 } 00636 word->unlv_crunch_mode = delete_mode; 00637 deleting_from_bol = FALSE; 00638 marked_delete_point = FALSE; 00639 } 00640 else { 00641 if (!marked_delete_point) { 00642 copy_it = page_res_it; 00643 marked_delete_point = TRUE; 00644 } 00645 } 00646 } 00647 else { 00648 deleting_from_bol = FALSE; 00649 //Forget earlier potential crunches 00650 marked_delete_point = FALSE; 00651 } 00652 /* 00653 The following step has been left till now as the tess fails are used to 00654 determine if the word is deletable. 00655 */ 00656 if (!crunch_early_merge_tess_fails) 00657 word->merge_tess_fails(); 00658 page_res_it.forward (); 00659 } 00660 } 00661 00662 00663 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { 00664 int i; 00665 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); 00666 UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" "); 00667 UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~"); 00668 UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^"); 00669 for (i = 0; i < word_res->reject_map.length(); ++i) { 00670 if (word_res->best_choice->unichar_id(i) == unichar_tilde) { 00671 word_res->best_choice->set_unichar_id(unichar_dash, i); 00672 if (word_res->reject_map[i].accepted ()) 00673 word_res->reject_map[i].setrej_unlv_rej (); 00674 } 00675 if (word_res->best_choice->unichar_id(i) == unichar_pow) { 00676 word_res->best_choice->set_unichar_id(unichar_space, i); 00677 if (word_res->reject_map[i].accepted ()) 00678 word_res->reject_map[i].setrej_unlv_rej (); 00679 } 00680 } 00681 } 00682 00683 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { 00684 enum STATES 00685 { 00686 JUNK, 00687 FIRST_UPPER, 00688 FIRST_LOWER, 00689 FIRST_NUM, 00690 SUBSEQUENT_UPPER, 00691 SUBSEQUENT_LOWER, 00692 SUBSEQUENT_NUM 00693 }; 00694 const char *str = word->best_choice->unichar_string().string(); 00695 const char *lengths = word->best_choice->unichar_lengths().string(); 00696 STATES state = JUNK; 00697 int len = 0; 00698 int isolated_digits = 0; 00699 int isolated_alphas = 0; 00700 int bad_char_count = 0; 00701 int tess_rejs = 0; 00702 int dodgy_chars = 0; 00703 int ok_chars; 00704 UNICHAR_ID last_char = -1; 00705 int alpha_repetition_count = 0; 00706 int longest_alpha_repetition_count = 0; 00707 int longest_lower_run_len = 0; 00708 int lower_string_count = 0; 00709 int longest_upper_run_len = 0; 00710 int upper_string_count = 0; 00711 int total_alpha_count = 0; 00712 int total_digit_count = 0; 00713 00714 for (; *str != '\0'; str += *(lengths++)) { 00715 len++; 00716 if (word->uch_set->get_isupper (str, *lengths)) { 00717 total_alpha_count++; 00718 switch (state) { 00719 case SUBSEQUENT_UPPER: 00720 case FIRST_UPPER: 00721 state = SUBSEQUENT_UPPER; 00722 upper_string_count++; 00723 if (longest_upper_run_len < upper_string_count) 00724 longest_upper_run_len = upper_string_count; 00725 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { 00726 alpha_repetition_count++; 00727 if (longest_alpha_repetition_count < alpha_repetition_count) { 00728 longest_alpha_repetition_count = alpha_repetition_count; 00729 } 00730 } 00731 else { 00732 last_char = word->uch_set->unichar_to_id(str, *lengths); 00733 alpha_repetition_count = 1; 00734 } 00735 break; 00736 case FIRST_NUM: 00737 isolated_digits++; 00738 default: 00739 state = FIRST_UPPER; 00740 last_char = word->uch_set->unichar_to_id(str, *lengths); 00741 alpha_repetition_count = 1; 00742 upper_string_count = 1; 00743 break; 00744 } 00745 } 00746 else if (word->uch_set->get_islower (str, *lengths)) { 00747 total_alpha_count++; 00748 switch (state) { 00749 case SUBSEQUENT_LOWER: 00750 case FIRST_LOWER: 00751 state = SUBSEQUENT_LOWER; 00752 lower_string_count++; 00753 if (longest_lower_run_len < lower_string_count) 00754 longest_lower_run_len = lower_string_count; 00755 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { 00756 alpha_repetition_count++; 00757 if (longest_alpha_repetition_count < alpha_repetition_count) { 00758 longest_alpha_repetition_count = alpha_repetition_count; 00759 } 00760 } 00761 else { 00762 last_char = word->uch_set->unichar_to_id(str, *lengths); 00763 alpha_repetition_count = 1; 00764 } 00765 break; 00766 case FIRST_NUM: 00767 isolated_digits++; 00768 default: 00769 state = FIRST_LOWER; 00770 last_char = word->uch_set->unichar_to_id(str, *lengths); 00771 alpha_repetition_count = 1; 00772 lower_string_count = 1; 00773 break; 00774 } 00775 } 00776 else if (word->uch_set->get_isdigit (str, *lengths)) { 00777 total_digit_count++; 00778 switch (state) { 00779 case FIRST_NUM: 00780 state = SUBSEQUENT_NUM; 00781 case SUBSEQUENT_NUM: 00782 break; 00783 case FIRST_UPPER: 00784 case FIRST_LOWER: 00785 isolated_alphas++; 00786 default: 00787 state = FIRST_NUM; 00788 break; 00789 } 00790 } 00791 else { 00792 if (*lengths == 1 && *str == ' ') 00793 tess_rejs++; 00794 else 00795 bad_char_count++; 00796 switch (state) { 00797 case FIRST_NUM: 00798 isolated_digits++; 00799 break; 00800 case FIRST_UPPER: 00801 case FIRST_LOWER: 00802 isolated_alphas++; 00803 default: 00804 break; 00805 } 00806 state = JUNK; 00807 } 00808 } 00809 00810 switch (state) { 00811 case FIRST_NUM: 00812 isolated_digits++; 00813 break; 00814 case FIRST_UPPER: 00815 case FIRST_LOWER: 00816 isolated_alphas++; 00817 default: 00818 break; 00819 } 00820 00821 if (crunch_include_numerals) { 00822 total_alpha_count += total_digit_count - isolated_digits; 00823 } 00824 00825 if (crunch_leave_ok_strings && len >= 4 && 00826 2 * (total_alpha_count - isolated_alphas) > len && 00827 longest_alpha_repetition_count < crunch_long_repetitions) { 00828 if ((crunch_accept_ok && 00829 acceptable_word_string(*word->uch_set, str, lengths) != 00830 AC_UNACCEPTABLE) || 00831 longest_lower_run_len > crunch_leave_lc_strings || 00832 longest_upper_run_len > crunch_leave_uc_strings) 00833 return G_NEVER_CRUNCH; 00834 } 00835 if (word->reject_map.length() > 1 && 00836 strpbrk(str, " ") == NULL && 00837 (word->best_choice->permuter() == SYSTEM_DAWG_PERM || 00838 word->best_choice->permuter() == FREQ_DAWG_PERM || 00839 word->best_choice->permuter() == USER_DAWG_PERM || 00840 word->best_choice->permuter() == NUMBER_PERM || 00841 acceptable_word_string(*word->uch_set, str, lengths) != 00842 AC_UNACCEPTABLE || ok_dict_word)) 00843 return G_OK; 00844 00845 ok_chars = len - bad_char_count - isolated_digits - 00846 isolated_alphas - tess_rejs; 00847 00848 if (crunch_debug > 3) { 00849 tprintf("garbage_word: \"%s\"\n", 00850 word->best_choice->unichar_string().string()); 00851 tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", 00852 len, 00853 bad_char_count, isolated_digits, isolated_alphas, tess_rejs); 00854 } 00855 if (bad_char_count == 0 && 00856 tess_rejs == 0 && 00857 (len > isolated_digits + isolated_alphas || len <= 2)) 00858 return G_OK; 00859 00860 if (tess_rejs > ok_chars || 00861 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) 00862 return G_TERRIBLE; 00863 00864 if (len > 4) { 00865 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + 00866 isolated_alphas; 00867 if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5) 00868 return G_DODGY; 00869 else 00870 return G_OK; 00871 } else { 00872 dodgy_chars = 2 * tess_rejs + bad_char_count; 00873 if ((len == 4 && dodgy_chars > 2) || 00874 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) 00875 return G_DODGY; 00876 else 00877 return G_OK; 00878 } 00879 } 00880 00881 00882 /************************************************************************* 00883 * word_deletable() 00884 * DELETE WERDS AT ENDS OF ROWS IF 00885 * Word is crunched && 00886 * ( string length = 0 OR 00887 * > 50% of chars are "|" (before merging) OR 00888 * certainty < -10 OR 00889 * rating /char > 60 OR 00890 * TOP of word is more than 0.5 xht BELOW baseline OR 00891 * BOTTOM of word is more than 0.5 xht ABOVE xht OR 00892 * length of word < 3xht OR 00893 * height of word < 0.7 xht OR 00894 * height of word > 3.0 xht OR 00895 * >75% of the outline BBs have longest dimension < 0.5xht 00896 *************************************************************************/ 00897 00898 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) { 00899 int word_len = word->reject_map.length (); 00900 float rating_per_ch; 00901 TBOX box; //BB of word 00902 00903 if (word->unlv_crunch_mode == CR_NONE) { 00904 delete_mode = 0; 00905 return CR_NONE; 00906 } 00907 00908 if (word_len == 0) { 00909 delete_mode = 1; 00910 return CR_DELETE; 00911 } 00912 00913 if (word->rebuild_word != NULL) { 00914 // Cube leaves rebuild_word NULL. 00915 box = word->rebuild_word->bounding_box(); 00916 if (box.height () < crunch_del_min_ht * kBlnXHeight) { 00917 delete_mode = 4; 00918 return CR_DELETE; 00919 } 00920 00921 if (noise_outlines(word->rebuild_word)) { 00922 delete_mode = 5; 00923 return CR_DELETE; 00924 } 00925 } 00926 00927 if ((failure_count (word) * 1.5) > word_len) { 00928 delete_mode = 2; 00929 return CR_LOOSE_SPACE; 00930 } 00931 00932 if (word->best_choice->certainty () < crunch_del_cert) { 00933 delete_mode = 7; 00934 return CR_LOOSE_SPACE; 00935 } 00936 00937 rating_per_ch = word->best_choice->rating () / word_len; 00938 00939 if (rating_per_ch > crunch_del_rating) { 00940 delete_mode = 8; 00941 return CR_LOOSE_SPACE; 00942 } 00943 00944 if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) { 00945 delete_mode = 9; 00946 return CR_LOOSE_SPACE; 00947 } 00948 00949 if (box.bottom () > 00950 kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) { 00951 delete_mode = 10; 00952 return CR_LOOSE_SPACE; 00953 } 00954 00955 if (box.height () > crunch_del_max_ht * kBlnXHeight) { 00956 delete_mode = 11; 00957 return CR_LOOSE_SPACE; 00958 } 00959 00960 if (box.width () < crunch_del_min_width * kBlnXHeight) { 00961 delete_mode = 3; 00962 return CR_LOOSE_SPACE; 00963 } 00964 00965 delete_mode = 0; 00966 return CR_NONE; 00967 } 00968 00969 inT16 Tesseract::failure_count(WERD_RES *word) { 00970 const char *str = word->best_choice->unichar_string().string(); 00971 int tess_rejs = 0; 00972 00973 for (; *str != '\0'; str++) { 00974 if (*str == ' ') 00975 tess_rejs++; 00976 } 00977 return tess_rejs; 00978 } 00979 00980 00981 BOOL8 Tesseract::noise_outlines(TWERD *word) { 00982 TBOX box; // BB of outline 00983 inT16 outline_count = 0; 00984 inT16 small_outline_count = 0; 00985 inT16 max_dimension; 00986 float small_limit = kBlnXHeight * crunch_small_outlines_size; 00987 00988 for (int b = 0; b < word->NumBlobs(); ++b) { 00989 TBLOB* blob = word->blobs[b]; 00990 for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) { 00991 outline_count++; 00992 box = ol->bounding_box(); 00993 if (box.height() > box.width()) 00994 max_dimension = box.height(); 00995 else 00996 max_dimension = box.width(); 00997 if (max_dimension < small_limit) 00998 small_outline_count++; 00999 } 01000 } 01001 return small_outline_count >= outline_count; 01002 } 01003 01004 } // namespace tesseract