|
tesseract 3.04.01
|
00001 /****************************************************************** 00002 * File: fixspace.cpp (Formerly fixspace.c) 00003 * Description: Implements a pass over the page res, exploring the alternative 00004 * spacing possibilities, trying to use context to improve the 00005 * word spacing 00006 * Author: Phil Cheatle 00007 * Created: Thu Oct 21 11:38:43 BST 1993 00008 * 00009 * (C) Copyright 1993, Hewlett-Packard Ltd. 00010 ** Licensed under the Apache License, Version 2.0 (the "License"); 00011 ** you may not use this file except in compliance with the License. 00012 ** You may obtain a copy of the License at 00013 ** http://www.apache.org/licenses/LICENSE-2.0 00014 ** Unless required by applicable law or agreed to in writing, software 00015 ** distributed under the License is distributed on an "AS IS" BASIS, 00016 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00017 ** See the License for the specific language governing permissions and 00018 ** limitations under the License. 00019 * 00020 **********************************************************************/ 00021 00022 #include <ctype.h> 00023 #include "reject.h" 00024 #include "statistc.h" 00025 #include "control.h" 00026 #include "fixspace.h" 00027 #include "genblob.h" 00028 #include "tessvars.h" 00029 #include "tessbox.h" 00030 #include "globals.h" 00031 #include "tesseractclass.h" 00032 00033 #define PERFECT_WERDS 999 00034 #define MAXSPACING 128 /*max expected spacing in pix */ 00035 00036 namespace tesseract { 00037 00048 void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, 00049 inT32 word_count, 00050 PAGE_RES *page_res) { 00051 BLOCK_RES_IT block_res_it; 00052 ROW_RES_IT row_res_it; 00053 WERD_RES_IT word_res_it_from; 00054 WERD_RES_IT word_res_it_to; 00055 WERD_RES *word_res; 00056 WERD_RES_LIST fuzzy_space_words; 00057 inT16 new_length; 00058 BOOL8 prevent_null_wd_fixsp; // DON'T process blobless wds 00059 inT32 word_index; // current word 00060 00061 block_res_it.set_to_list(&page_res->block_res_list); 00062 word_index = 0; 00063 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); 00064 block_res_it.forward()) { 00065 row_res_it.set_to_list(&block_res_it.data()->row_res_list); 00066 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); 00067 row_res_it.forward()) { 00068 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list); 00069 while (!word_res_it_from.at_last()) { 00070 word_res = word_res_it_from.data(); 00071 while (!word_res_it_from.at_last() && 00072 !(word_res->combination || 00073 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) || 00074 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) { 00075 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, 00076 block_res_it.data()->block); 00077 word_res = word_res_it_from.forward(); 00078 word_index++; 00079 if (monitor != NULL) { 00080 monitor->ocr_alive = TRUE; 00081 monitor->progress = 90 + 5 * word_index / word_count; 00082 if (monitor->deadline_exceeded() || 00083 (monitor->cancel != NULL && 00084 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) 00085 return; 00086 } 00087 } 00088 00089 if (!word_res_it_from.at_last()) { 00090 word_res_it_to = word_res_it_from; 00091 prevent_null_wd_fixsp = 00092 word_res->word->cblob_list()->empty(); 00093 if (check_debug_pt(word_res, 60)) 00094 debug_fix_space_level.set_value(10); 00095 word_res_it_to.forward(); 00096 word_index++; 00097 if (monitor != NULL) { 00098 monitor->ocr_alive = TRUE; 00099 monitor->progress = 90 + 5 * word_index / word_count; 00100 if (monitor->deadline_exceeded() || 00101 (monitor->cancel != NULL && 00102 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) 00103 return; 00104 } 00105 while (!word_res_it_to.at_last () && 00106 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) || 00107 word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) { 00108 if (check_debug_pt(word_res, 60)) 00109 debug_fix_space_level.set_value(10); 00110 if (word_res->word->cblob_list()->empty()) 00111 prevent_null_wd_fixsp = TRUE; 00112 word_res = word_res_it_to.forward(); 00113 } 00114 if (check_debug_pt(word_res, 60)) 00115 debug_fix_space_level.set_value(10); 00116 if (word_res->word->cblob_list()->empty()) 00117 prevent_null_wd_fixsp = TRUE; 00118 if (prevent_null_wd_fixsp) { 00119 word_res_it_from = word_res_it_to; 00120 } else { 00121 fuzzy_space_words.assign_to_sublist(&word_res_it_from, 00122 &word_res_it_to); 00123 fix_fuzzy_space_list(fuzzy_space_words, 00124 row_res_it.data()->row, 00125 block_res_it.data()->block); 00126 new_length = fuzzy_space_words.length(); 00127 word_res_it_from.add_list_before(&fuzzy_space_words); 00128 for (; 00129 !word_res_it_from.at_last() && new_length > 0; 00130 new_length--) { 00131 word_res_it_from.forward(); 00132 } 00133 } 00134 if (test_pt) 00135 debug_fix_space_level.set_value(0); 00136 } 00137 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, 00138 block_res_it.data()->block); 00139 // Last word in row 00140 } 00141 } 00142 } 00143 } 00144 00145 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, 00146 ROW *row, 00147 BLOCK* block) { 00148 inT16 best_score; 00149 WERD_RES_LIST current_perm; 00150 inT16 current_score; 00151 BOOL8 improved = FALSE; 00152 00153 best_score = eval_word_spacing(best_perm); // default score 00154 dump_words(best_perm, best_score, 1, improved); 00155 00156 if (best_score != PERFECT_WERDS) 00157 initialise_search(best_perm, current_perm); 00158 00159 while ((best_score != PERFECT_WERDS) && !current_perm.empty()) { 00160 match_current_words(current_perm, row, block); 00161 current_score = eval_word_spacing(current_perm); 00162 dump_words(current_perm, current_score, 2, improved); 00163 if (current_score > best_score) { 00164 best_perm.clear(); 00165 best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy); 00166 best_score = current_score; 00167 improved = TRUE; 00168 } 00169 if (current_score < PERFECT_WERDS) 00170 transform_to_next_perm(current_perm); 00171 } 00172 dump_words(best_perm, best_score, 3, improved); 00173 } 00174 00175 } // namespace tesseract 00176 00177 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) { 00178 WERD_RES_IT src_it(&src_list); 00179 WERD_RES_IT new_it(&new_list); 00180 WERD_RES *src_wd; 00181 WERD_RES *new_wd; 00182 00183 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { 00184 src_wd = src_it.data(); 00185 if (!src_wd->combination) { 00186 new_wd = WERD_RES::deep_copy(src_wd); 00187 new_wd->combination = FALSE; 00188 new_wd->part_of_combo = FALSE; 00189 new_it.add_after_then_move(new_wd); 00190 } 00191 } 00192 } 00193 00194 00195 namespace tesseract { 00196 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, 00197 BLOCK* block) { 00198 WERD_RES_IT word_it(&words); 00199 WERD_RES *word; 00200 // Since we are not using PAGE_RES to iterate over words, we need to update 00201 // prev_word_best_choice_ before calling classify_word_pass2(). 00202 prev_word_best_choice_ = NULL; 00203 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00204 word = word_it.data(); 00205 if ((!word->part_of_combo) && (word->box_word == NULL)) { 00206 WordData word_data(block, row, word); 00207 SetupWordPassN(2, &word_data); 00208 classify_word_and_language(2, NULL, &word_data); 00209 } 00210 prev_word_best_choice_ = word->best_choice; 00211 } 00212 } 00213 00214 00240 inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) { 00241 WERD_RES_IT word_res_it(&word_res_list); 00242 inT16 total_score = 0; 00243 inT16 word_count = 0; 00244 inT16 done_word_count = 0; 00245 inT16 word_len; 00246 inT16 i; 00247 inT16 offset; 00248 WERD_RES *word; // current word 00249 inT16 prev_word_score = 0; 00250 BOOL8 prev_word_done = FALSE; 00251 BOOL8 prev_char_1 = FALSE; // prev ch a "1/I/l"? 00252 BOOL8 prev_char_digit = FALSE; // prev ch 2..9 or 0 00253 BOOL8 current_char_1 = FALSE; 00254 BOOL8 current_word_ok_so_far; 00255 STRING punct_chars = "!\"`',.:;"; 00256 BOOL8 prev_char_punct = FALSE; 00257 BOOL8 current_char_punct = FALSE; 00258 BOOL8 word_done = FALSE; 00259 00260 do { 00261 word = word_res_it.data(); 00262 word_done = fixspace_thinks_word_done(word); 00263 word_count++; 00264 if (word->tess_failed) { 00265 total_score += prev_word_score; 00266 if (prev_word_done) 00267 done_word_count++; 00268 prev_word_score = 0; 00269 prev_char_1 = FALSE; 00270 prev_char_digit = FALSE; 00271 prev_word_done = FALSE; 00272 } else { 00273 /* 00274 Can we add the prev word score and potentially count this word? 00275 Yes IF it didn't end in a 1 when the first char of this word is a digit 00276 AND it didn't end in a digit when the first char of this word is a 1 00277 */ 00278 word_len = word->reject_map.length(); 00279 current_word_ok_so_far = FALSE; 00280 if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) || 00281 (prev_char_digit && ( 00282 (word_done && 00283 word->best_choice->unichar_lengths().string()[0] == 1 && 00284 word->best_choice->unichar_string()[0] == '1') || 00285 (!word_done && STRING(conflict_set_I_l_1).contains( 00286 word->best_choice->unichar_string()[0])))))) { 00287 total_score += prev_word_score; 00288 if (prev_word_done) 00289 done_word_count++; 00290 current_word_ok_so_far = word_done; 00291 } 00292 00293 if (current_word_ok_so_far) { 00294 prev_word_done = TRUE; 00295 prev_word_score = word_len; 00296 } else { 00297 prev_word_done = FALSE; 00298 prev_word_score = 0; 00299 } 00300 00301 /* Add 1 to total score for every joined 1 regardless of context and 00302 rejtn */ 00303 for (i = 0, prev_char_1 = FALSE; i < word_len; i++) { 00304 current_char_1 = word->best_choice->unichar_string()[i] == '1'; 00305 if (prev_char_1 || (current_char_1 && (i > 0))) 00306 total_score++; 00307 prev_char_1 = current_char_1; 00308 } 00309 00310 /* Add 1 to total score for every joined punctuation regardless of context 00311 and rejtn */ 00312 if (tessedit_prefer_joined_punct) { 00313 for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len; 00314 offset += word->best_choice->unichar_lengths()[i++]) { 00315 current_char_punct = 00316 punct_chars.contains(word->best_choice->unichar_string()[offset]); 00317 if (prev_char_punct || (current_char_punct && i > 0)) 00318 total_score++; 00319 prev_char_punct = current_char_punct; 00320 } 00321 } 00322 prev_char_digit = digit_or_numeric_punct(word, word_len - 1); 00323 for (i = 0, offset = 0; i < word_len - 1; 00324 offset += word->best_choice->unichar_lengths()[i++]); 00325 prev_char_1 = 00326 ((word_done && (word->best_choice->unichar_string()[offset] == '1')) 00327 || (!word_done && STRING(conflict_set_I_l_1).contains( 00328 word->best_choice->unichar_string()[offset]))); 00329 } 00330 /* Find next word */ 00331 do { 00332 word_res_it.forward(); 00333 } while (word_res_it.data()->part_of_combo); 00334 } while (!word_res_it.at_first()); 00335 total_score += prev_word_score; 00336 if (prev_word_done) 00337 done_word_count++; 00338 if (done_word_count == word_count) 00339 return PERFECT_WERDS; 00340 else 00341 return total_score; 00342 } 00343 00344 BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) { 00345 int i; 00346 int offset; 00347 00348 for (i = 0, offset = 0; i < char_position; 00349 offset += word->best_choice->unichar_lengths()[i++]); 00350 return ( 00351 word->uch_set->get_isdigit( 00352 word->best_choice->unichar_string().string() + offset, 00353 word->best_choice->unichar_lengths()[i]) || 00354 (word->best_choice->permuter() == NUMBER_PERM && 00355 STRING(numeric_punctuation).contains( 00356 word->best_choice->unichar_string().string()[offset]))); 00357 } 00358 00359 } // namespace tesseract 00360 00361 00373 void transform_to_next_perm(WERD_RES_LIST &words) { 00374 WERD_RES_IT word_it(&words); 00375 WERD_RES_IT prev_word_it(&words); 00376 WERD_RES *word; 00377 WERD_RES *prev_word; 00378 WERD_RES *combo; 00379 WERD *copy_word; 00380 inT16 prev_right = -MAX_INT16; 00381 TBOX box; 00382 inT16 gap; 00383 inT16 min_gap = MAX_INT16; 00384 00385 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00386 word = word_it.data(); 00387 if (!word->part_of_combo) { 00388 box = word->word->bounding_box(); 00389 if (prev_right > -MAX_INT16) { 00390 gap = box.left() - prev_right; 00391 if (gap < min_gap) 00392 min_gap = gap; 00393 } 00394 prev_right = box.right(); 00395 } 00396 } 00397 if (min_gap < MAX_INT16) { 00398 prev_right = -MAX_INT16; // back to start 00399 word_it.set_to_list(&words); 00400 // Note: we can't use cycle_pt due to inserted combos at start of list. 00401 for (; (prev_right == -MAX_INT16) || !word_it.at_first(); 00402 word_it.forward()) { 00403 word = word_it.data(); 00404 if (!word->part_of_combo) { 00405 box = word->word->bounding_box(); 00406 if (prev_right > -MAX_INT16) { 00407 gap = box.left() - prev_right; 00408 if (gap <= min_gap) { 00409 prev_word = prev_word_it.data(); 00410 if (prev_word->combination) { 00411 combo = prev_word; 00412 } else { 00413 /* Make a new combination and insert before 00414 * the first word being joined. */ 00415 copy_word = new WERD; 00416 *copy_word = *(prev_word->word); 00417 // deep copy 00418 combo = new WERD_RES(copy_word); 00419 combo->combination = TRUE; 00420 combo->x_height = prev_word->x_height; 00421 prev_word->part_of_combo = TRUE; 00422 prev_word_it.add_before_then_move(combo); 00423 } 00424 combo->word->set_flag(W_EOL, word->word->flag(W_EOL)); 00425 if (word->combination) { 00426 combo->word->join_on(word->word); 00427 // Move blobs to combo 00428 // old combo no longer needed 00429 delete word_it.extract(); 00430 } else { 00431 // Copy current wd to combo 00432 combo->copy_on(word); 00433 word->part_of_combo = TRUE; 00434 } 00435 combo->done = FALSE; 00436 combo->ClearResults(); 00437 } else { 00438 prev_word_it = word_it; // catch up 00439 } 00440 } 00441 prev_right = box.right(); 00442 } 00443 } 00444 } else { 00445 words.clear(); // signal termination 00446 } 00447 } 00448 00449 namespace tesseract { 00450 void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score, 00451 inT16 mode, BOOL8 improved) { 00452 WERD_RES_IT word_res_it(&perm); 00453 00454 if (debug_fix_space_level > 0) { 00455 if (mode == 1) { 00456 stats_.dump_words_str = ""; 00457 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); 00458 word_res_it.forward()) { 00459 if (!word_res_it.data()->part_of_combo) { 00460 stats_.dump_words_str += 00461 word_res_it.data()->best_choice->unichar_string(); 00462 stats_.dump_words_str += ' '; 00463 } 00464 } 00465 } 00466 00467 if (debug_fix_space_level > 1) { 00468 switch (mode) { 00469 case 1: 00470 tprintf("EXTRACTED (%d): \"", score); 00471 break; 00472 case 2: 00473 tprintf("TESTED (%d): \"", score); 00474 break; 00475 case 3: 00476 tprintf("RETURNED (%d): \"", score); 00477 break; 00478 } 00479 00480 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); 00481 word_res_it.forward()) { 00482 if (!word_res_it.data()->part_of_combo) { 00483 tprintf("%s/%1d ", 00484 word_res_it.data()->best_choice->unichar_string().string(), 00485 (int)word_res_it.data()->best_choice->permuter()); 00486 } 00487 } 00488 tprintf("\"\n"); 00489 } else if (improved) { 00490 tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string()); 00491 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); 00492 word_res_it.forward()) { 00493 if (!word_res_it.data()->part_of_combo) { 00494 tprintf("%s/%1d ", 00495 word_res_it.data()->best_choice->unichar_string().string(), 00496 (int)word_res_it.data()->best_choice->permuter()); 00497 } 00498 } 00499 tprintf("\"\n"); 00500 } 00501 } 00502 } 00503 00504 BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) { 00505 if (word->done) 00506 return TRUE; 00507 00508 /* 00509 Use all the standard pass 2 conditions for mode 5 in set_done() in 00510 reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T 00511 CARE WHETHER WE HAVE of/at on/an etc. 00512 */ 00513 if (fixsp_done_mode > 0 && 00514 (word->tess_accepted || 00515 (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) || 00516 fixsp_done_mode == 3) && 00517 (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) && 00518 ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) || 00519 (word->best_choice->permuter() == FREQ_DAWG_PERM) || 00520 (word->best_choice->permuter() == USER_DAWG_PERM) || 00521 (word->best_choice->permuter() == NUMBER_PERM))) { 00522 return TRUE; 00523 } else { 00524 return FALSE; 00525 } 00526 } 00527 00528 00536 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, 00537 BLOCK* block) { 00538 WERD_RES *word_res; 00539 WERD_RES_LIST sub_word_list; 00540 WERD_RES_IT sub_word_list_it(&sub_word_list); 00541 inT16 blob_index; 00542 inT16 new_length; 00543 float junk; 00544 00545 word_res = word_res_it.data(); 00546 if (word_res->word->flag(W_REP_CHAR) || 00547 word_res->combination || 00548 word_res->part_of_combo || 00549 !word_res->word->flag(W_DONT_CHOP)) 00550 return; 00551 00552 blob_index = worst_noise_blob(word_res, &junk); 00553 if (blob_index < 0) 00554 return; 00555 00556 if (debug_fix_space_level > 1) { 00557 tprintf("FP fixspace working on \"%s\"\n", 00558 word_res->best_choice->unichar_string().string()); 00559 } 00560 word_res->word->rej_cblob_list()->sort(c_blob_comparator); 00561 sub_word_list_it.add_after_stay_put(word_res_it.extract()); 00562 fix_noisy_space_list(sub_word_list, row, block); 00563 new_length = sub_word_list.length(); 00564 word_res_it.add_list_before(&sub_word_list); 00565 for (; !word_res_it.at_last() && new_length > 1; new_length--) { 00566 word_res_it.forward(); 00567 } 00568 } 00569 00570 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, 00571 BLOCK* block) { 00572 inT16 best_score; 00573 WERD_RES_IT best_perm_it(&best_perm); 00574 WERD_RES_LIST current_perm; 00575 WERD_RES_IT current_perm_it(¤t_perm); 00576 WERD_RES *old_word_res; 00577 inT16 current_score; 00578 BOOL8 improved = FALSE; 00579 00580 best_score = fp_eval_word_spacing(best_perm); // default score 00581 00582 dump_words(best_perm, best_score, 1, improved); 00583 00584 old_word_res = best_perm_it.data(); 00585 // Even deep_copy doesn't copy the underlying WERD unless its combination 00586 // flag is true!. 00587 old_word_res->combination = TRUE; // Kludge to force deep copy 00588 current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res)); 00589 old_word_res->combination = FALSE; // Undo kludge 00590 00591 break_noisiest_blob_word(current_perm); 00592 00593 while (best_score != PERFECT_WERDS && !current_perm.empty()) { 00594 match_current_words(current_perm, row, block); 00595 current_score = fp_eval_word_spacing(current_perm); 00596 dump_words(current_perm, current_score, 2, improved); 00597 if (current_score > best_score) { 00598 best_perm.clear(); 00599 best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy); 00600 best_score = current_score; 00601 improved = TRUE; 00602 } 00603 if (current_score < PERFECT_WERDS) { 00604 break_noisiest_blob_word(current_perm); 00605 } 00606 } 00607 dump_words(best_perm, best_score, 3, improved); 00608 } 00609 00610 00616 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) { 00617 WERD_RES_IT word_it(&words); 00618 WERD_RES_IT worst_word_it; 00619 float worst_noise_score = 9999; 00620 int worst_blob_index = -1; // Noisiest blob of noisiest wd 00621 int blob_index; // of wds noisiest blob 00622 float noise_score; // of wds noisiest blob 00623 WERD_RES *word_res; 00624 C_BLOB_IT blob_it; 00625 C_BLOB_IT rej_cblob_it; 00626 C_BLOB_LIST new_blob_list; 00627 C_BLOB_IT new_blob_it; 00628 C_BLOB_IT new_rej_cblob_it; 00629 WERD *new_word; 00630 inT16 start_of_noise_blob; 00631 inT16 i; 00632 00633 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00634 blob_index = worst_noise_blob(word_it.data(), &noise_score); 00635 if (blob_index > -1 && worst_noise_score > noise_score) { 00636 worst_noise_score = noise_score; 00637 worst_blob_index = blob_index; 00638 worst_word_it = word_it; 00639 } 00640 } 00641 if (worst_blob_index < 0) { 00642 words.clear(); // signal termination 00643 return; 00644 } 00645 00646 /* Now split the worst_word_it */ 00647 00648 word_res = worst_word_it.data(); 00649 00650 /* Move blobs before noise blob to a new bloblist */ 00651 00652 new_blob_it.set_to_list(&new_blob_list); 00653 blob_it.set_to_list(word_res->word->cblob_list()); 00654 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) { 00655 new_blob_it.add_after_then_move(blob_it.extract()); 00656 } 00657 start_of_noise_blob = blob_it.data()->bounding_box().left(); 00658 delete blob_it.extract(); // throw out noise blob 00659 00660 new_word = new WERD(&new_blob_list, word_res->word); 00661 new_word->set_flag(W_EOL, FALSE); 00662 word_res->word->set_flag(W_BOL, FALSE); 00663 word_res->word->set_blanks(1); // After break 00664 00665 new_rej_cblob_it.set_to_list(new_word->rej_cblob_list()); 00666 rej_cblob_it.set_to_list(word_res->word->rej_cblob_list()); 00667 for (; 00668 (!rej_cblob_it.empty() && 00669 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob)); 00670 rej_cblob_it.forward()) { 00671 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract()); 00672 } 00673 00674 WERD_RES* new_word_res = new WERD_RES(new_word); 00675 new_word_res->combination = TRUE; 00676 worst_word_it.add_before_then_move(new_word_res); 00677 00678 word_res->ClearResults(); 00679 } 00680 00681 inT16 Tesseract::worst_noise_blob(WERD_RES *word_res, 00682 float *worst_noise_score) { 00683 float noise_score[512]; 00684 int i; 00685 int min_noise_blob; // 1st contender 00686 int max_noise_blob; // last contender 00687 int non_noise_count; 00688 int worst_noise_blob; // Worst blob 00689 float small_limit = kBlnXHeight * fixsp_small_outlines_size; 00690 float non_noise_limit = kBlnXHeight * 0.8; 00691 00692 if (word_res->rebuild_word == NULL) 00693 return -1; // Can't handle cube words. 00694 00695 // Normalised. 00696 int blob_count = word_res->box_word->length(); 00697 ASSERT_HOST(blob_count <= 512); 00698 if (blob_count < 5) 00699 return -1; // too short to split 00700 00701 /* Get the noise scores for all blobs */ 00702 00703 #ifndef SECURE_NAMES 00704 if (debug_fix_space_level > 5) 00705 tprintf("FP fixspace Noise metrics for \"%s\": ", 00706 word_res->best_choice->unichar_string().string()); 00707 #endif 00708 00709 for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) { 00710 TBLOB* blob = word_res->rebuild_word->blobs[i]; 00711 if (word_res->reject_map[i].accepted()) 00712 noise_score[i] = non_noise_limit; 00713 else 00714 noise_score[i] = blob_noise_score(blob); 00715 00716 if (debug_fix_space_level > 5) 00717 tprintf("%1.1f ", noise_score[i]); 00718 } 00719 if (debug_fix_space_level > 5) 00720 tprintf("\n"); 00721 00722 /* Now find the worst one which is far enough away from the end of the word */ 00723 00724 non_noise_count = 0; 00725 for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) { 00726 if (noise_score[i] >= non_noise_limit) { 00727 non_noise_count++; 00728 } 00729 } 00730 if (non_noise_count < fixsp_non_noise_limit) 00731 return -1; 00732 00733 min_noise_blob = i; 00734 00735 non_noise_count = 0; 00736 for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; 00737 i--) { 00738 if (noise_score[i] >= non_noise_limit) { 00739 non_noise_count++; 00740 } 00741 } 00742 if (non_noise_count < fixsp_non_noise_limit) 00743 return -1; 00744 00745 max_noise_blob = i; 00746 00747 if (min_noise_blob > max_noise_blob) 00748 return -1; 00749 00750 *worst_noise_score = small_limit; 00751 worst_noise_blob = -1; 00752 for (i = min_noise_blob; i <= max_noise_blob; i++) { 00753 if (noise_score[i] < *worst_noise_score) { 00754 worst_noise_blob = i; 00755 *worst_noise_score = noise_score[i]; 00756 } 00757 } 00758 return worst_noise_blob; 00759 } 00760 00761 float Tesseract::blob_noise_score(TBLOB *blob) { 00762 TBOX box; // BB of outline 00763 inT16 outline_count = 0; 00764 inT16 max_dimension; 00765 inT16 largest_outline_dimension = 0; 00766 00767 for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) { 00768 outline_count++; 00769 box = ol->bounding_box(); 00770 if (box.height() > box.width()) { 00771 max_dimension = box.height(); 00772 } else { 00773 max_dimension = box.width(); 00774 } 00775 00776 if (largest_outline_dimension < max_dimension) 00777 largest_outline_dimension = max_dimension; 00778 } 00779 00780 if (outline_count > 5) { 00781 // penalise LOTS of blobs 00782 largest_outline_dimension *= 2; 00783 } 00784 00785 box = blob->bounding_box(); 00786 if (box.bottom() > kBlnBaselineOffset * 4 || 00787 box.top() < kBlnBaselineOffset / 2) { 00788 // Lax blob is if high or low 00789 largest_outline_dimension /= 2; 00790 } 00791 00792 return largest_outline_dimension; 00793 } 00794 } // namespace tesseract 00795 00796 void fixspace_dbg(WERD_RES *word) { 00797 TBOX box = word->word->bounding_box(); 00798 BOOL8 show_map_detail = FALSE; 00799 inT16 i; 00800 00801 box.print(); 00802 tprintf(" \"%s\" ", word->best_choice->unichar_string().string()); 00803 tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", 00804 word->word->cblob_list()->length(), 00805 word->rebuild_word->NumBlobs(), 00806 word->box_word->length()); 00807 word->reject_map.print(debug_fp); 00808 tprintf("\n"); 00809 if (show_map_detail) { 00810 tprintf("\"%s\"\n", word->best_choice->unichar_string().string()); 00811 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { 00812 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); 00813 word->reject_map[i].full_print(debug_fp); 00814 } 00815 } 00816 00817 tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); 00818 tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); 00819 } 00820 00821 00830 namespace tesseract { 00831 inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { 00832 WERD_RES_IT word_it(&word_res_list); 00833 WERD_RES *word; 00834 inT16 score = 0; 00835 inT16 i; 00836 float small_limit = kBlnXHeight * fixsp_small_outlines_size; 00837 00838 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00839 word = word_it.data(); 00840 if (word->rebuild_word == NULL) 00841 continue; // Can't handle cube words. 00842 if (word->done || 00843 word->tess_accepted || 00844 word->best_choice->permuter() == SYSTEM_DAWG_PERM || 00845 word->best_choice->permuter() == FREQ_DAWG_PERM || 00846 word->best_choice->permuter() == USER_DAWG_PERM || 00847 safe_dict_word(word) > 0) { 00848 int num_blobs = word->rebuild_word->NumBlobs(); 00849 UNICHAR_ID space = word->uch_set->unichar_to_id(" "); 00850 for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) { 00851 TBLOB* blob = word->rebuild_word->blobs[i]; 00852 if (word->best_choice->unichar_id(i) == space || 00853 blob_noise_score(blob) < small_limit) { 00854 score -= 1; // penalise possibly erroneous non-space 00855 } else if (word->reject_map[i].accepted()) { 00856 score++; 00857 } 00858 } 00859 } 00860 } 00861 if (score < 0) 00862 score = 0; 00863 return score; 00864 } 00865 00866 } // namespace tesseract