tesseract 3.04.01

ccmain/fixspace.cpp

Go to the documentation of this file.
00001 /******************************************************************
00002  * File:        fixspace.cpp  (Formerly fixspace.c)
00003  * Description: Implements a pass over the page res, exploring the alternative
00004  *              spacing possibilities, trying to use context to improve the
00005  *              word spacing
00006 * Author:               Phil Cheatle
00007 * Created:              Thu Oct 21 11:38:43 BST 1993
00008 *
00009 * (C) Copyright 1993, Hewlett-Packard Ltd.
00010 ** Licensed under the Apache License, Version 2.0 (the "License");
00011 ** you may not use this file except in compliance with the License.
00012 ** You may obtain a copy of the License at
00013 ** http://www.apache.org/licenses/LICENSE-2.0
00014 ** Unless required by applicable law or agreed to in writing, software
00015 ** distributed under the License is distributed on an "AS IS" BASIS,
00016 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017 ** See the License for the specific language governing permissions and
00018 ** limitations under the License.
00019 *
00020 **********************************************************************/
00021 
00022 #include <ctype.h>
00023 #include "reject.h"
00024 #include "statistc.h"
00025 #include "control.h"
00026 #include "fixspace.h"
00027 #include "genblob.h"
00028 #include "tessvars.h"
00029 #include "tessbox.h"
00030 #include "globals.h"
00031 #include "tesseractclass.h"
00032 
00033 #define PERFECT_WERDS   999
00034 #define MAXSPACING      128      /*max expected spacing in pix */
00035 
00036 namespace tesseract {
00037 
00048 void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
00049                                  inT32 word_count,
00050                                  PAGE_RES *page_res) {
00051   BLOCK_RES_IT block_res_it;
00052   ROW_RES_IT row_res_it;
00053   WERD_RES_IT word_res_it_from;
00054   WERD_RES_IT word_res_it_to;
00055   WERD_RES *word_res;
00056   WERD_RES_LIST fuzzy_space_words;
00057   inT16 new_length;
00058   BOOL8 prevent_null_wd_fixsp;   // DON'T process blobless wds
00059   inT32 word_index;              // current word
00060 
00061   block_res_it.set_to_list(&page_res->block_res_list);
00062   word_index = 0;
00063   for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
00064        block_res_it.forward()) {
00065     row_res_it.set_to_list(&block_res_it.data()->row_res_list);
00066     for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
00067          row_res_it.forward()) {
00068       word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
00069       while (!word_res_it_from.at_last()) {
00070         word_res = word_res_it_from.data();
00071         while (!word_res_it_from.at_last() &&
00072                !(word_res->combination ||
00073                  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
00074                  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
00075           fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
00076                          block_res_it.data()->block);
00077           word_res = word_res_it_from.forward();
00078           word_index++;
00079           if (monitor != NULL) {
00080             monitor->ocr_alive = TRUE;
00081             monitor->progress = 90 + 5 * word_index / word_count;
00082             if (monitor->deadline_exceeded() ||
00083                 (monitor->cancel != NULL &&
00084                  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
00085             return;
00086           }
00087         }
00088 
00089         if (!word_res_it_from.at_last()) {
00090           word_res_it_to = word_res_it_from;
00091           prevent_null_wd_fixsp =
00092             word_res->word->cblob_list()->empty();
00093           if (check_debug_pt(word_res, 60))
00094             debug_fix_space_level.set_value(10);
00095           word_res_it_to.forward();
00096           word_index++;
00097           if (monitor != NULL) {
00098             monitor->ocr_alive = TRUE;
00099             monitor->progress = 90 + 5 * word_index / word_count;
00100             if (monitor->deadline_exceeded() ||
00101                 (monitor->cancel != NULL &&
00102                  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
00103             return;
00104           }
00105           while (!word_res_it_to.at_last () &&
00106                  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
00107                   word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
00108             if (check_debug_pt(word_res, 60))
00109               debug_fix_space_level.set_value(10);
00110             if (word_res->word->cblob_list()->empty())
00111               prevent_null_wd_fixsp = TRUE;
00112             word_res = word_res_it_to.forward();
00113           }
00114           if (check_debug_pt(word_res, 60))
00115             debug_fix_space_level.set_value(10);
00116           if (word_res->word->cblob_list()->empty())
00117             prevent_null_wd_fixsp = TRUE;
00118           if (prevent_null_wd_fixsp) {
00119             word_res_it_from = word_res_it_to;
00120           } else {
00121             fuzzy_space_words.assign_to_sublist(&word_res_it_from,
00122                                                 &word_res_it_to);
00123             fix_fuzzy_space_list(fuzzy_space_words,
00124                                  row_res_it.data()->row,
00125                                  block_res_it.data()->block);
00126             new_length = fuzzy_space_words.length();
00127             word_res_it_from.add_list_before(&fuzzy_space_words);
00128             for (;
00129                  !word_res_it_from.at_last() && new_length > 0;
00130                  new_length--) {
00131               word_res_it_from.forward();
00132             }
00133           }
00134           if (test_pt)
00135             debug_fix_space_level.set_value(0);
00136         }
00137         fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
00138                        block_res_it.data()->block);
00139         // Last word in row
00140       }
00141     }
00142   }
00143 }
00144 
00145 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
00146                                      ROW *row,
00147                                      BLOCK* block) {
00148   inT16 best_score;
00149   WERD_RES_LIST current_perm;
00150   inT16 current_score;
00151   BOOL8 improved = FALSE;
00152 
00153   best_score = eval_word_spacing(best_perm);  // default score
00154   dump_words(best_perm, best_score, 1, improved);
00155 
00156   if (best_score != PERFECT_WERDS)
00157     initialise_search(best_perm, current_perm);
00158 
00159   while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
00160     match_current_words(current_perm, row, block);
00161     current_score = eval_word_spacing(current_perm);
00162     dump_words(current_perm, current_score, 2, improved);
00163     if (current_score > best_score) {
00164       best_perm.clear();
00165       best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
00166       best_score = current_score;
00167       improved = TRUE;
00168     }
00169     if (current_score < PERFECT_WERDS)
00170       transform_to_next_perm(current_perm);
00171   }
00172   dump_words(best_perm, best_score, 3, improved);
00173 }
00174 
00175 }  // namespace tesseract
00176 
00177 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
00178   WERD_RES_IT src_it(&src_list);
00179   WERD_RES_IT new_it(&new_list);
00180   WERD_RES *src_wd;
00181   WERD_RES *new_wd;
00182 
00183   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
00184     src_wd = src_it.data();
00185     if (!src_wd->combination) {
00186       new_wd = WERD_RES::deep_copy(src_wd);
00187       new_wd->combination = FALSE;
00188       new_wd->part_of_combo = FALSE;
00189       new_it.add_after_then_move(new_wd);
00190     }
00191   }
00192 }
00193 
00194 
00195 namespace tesseract {
00196 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
00197                                     BLOCK* block) {
00198   WERD_RES_IT word_it(&words);
00199   WERD_RES *word;
00200   // Since we are not using PAGE_RES to iterate over words, we need to update
00201   // prev_word_best_choice_ before calling classify_word_pass2().
00202   prev_word_best_choice_ = NULL;
00203   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00204     word = word_it.data();
00205     if ((!word->part_of_combo) && (word->box_word == NULL)) {
00206       WordData word_data(block, row, word);
00207       SetupWordPassN(2, &word_data);
00208       classify_word_and_language(2, NULL, &word_data);
00209     }
00210     prev_word_best_choice_ = word->best_choice;
00211   }
00212 }
00213 
00214 
00240 inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
00241   WERD_RES_IT word_res_it(&word_res_list);
00242   inT16 total_score = 0;
00243   inT16 word_count = 0;
00244   inT16 done_word_count = 0;
00245   inT16 word_len;
00246   inT16 i;
00247   inT16 offset;
00248   WERD_RES *word;                 // current word
00249   inT16 prev_word_score = 0;
00250   BOOL8 prev_word_done = FALSE;
00251   BOOL8 prev_char_1 = FALSE;      // prev ch a "1/I/l"?
00252   BOOL8 prev_char_digit = FALSE;  // prev ch 2..9 or 0
00253   BOOL8 current_char_1 = FALSE;
00254   BOOL8 current_word_ok_so_far;
00255   STRING punct_chars = "!\"`',.:;";
00256   BOOL8 prev_char_punct = FALSE;
00257   BOOL8 current_char_punct = FALSE;
00258   BOOL8 word_done = FALSE;
00259 
00260   do {
00261     word = word_res_it.data();
00262     word_done = fixspace_thinks_word_done(word);
00263     word_count++;
00264     if (word->tess_failed) {
00265       total_score += prev_word_score;
00266       if (prev_word_done)
00267         done_word_count++;
00268       prev_word_score = 0;
00269       prev_char_1 = FALSE;
00270       prev_char_digit = FALSE;
00271       prev_word_done = FALSE;
00272     } else {
00273       /*
00274         Can we add the prev word score and potentially count this word?
00275         Yes IF it didn't end in a 1 when the first char of this word is a digit
00276           AND it didn't end in a digit when the first char of this word is a 1
00277       */
00278       word_len = word->reject_map.length();
00279       current_word_ok_so_far = FALSE;
00280       if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
00281             (prev_char_digit && (
00282                 (word_done &&
00283                  word->best_choice->unichar_lengths().string()[0] == 1 &&
00284                  word->best_choice->unichar_string()[0] == '1') ||
00285                 (!word_done && STRING(conflict_set_I_l_1).contains(
00286                       word->best_choice->unichar_string()[0])))))) {
00287         total_score += prev_word_score;
00288         if (prev_word_done)
00289           done_word_count++;
00290         current_word_ok_so_far = word_done;
00291       }
00292 
00293       if (current_word_ok_so_far) {
00294         prev_word_done = TRUE;
00295         prev_word_score = word_len;
00296       } else {
00297         prev_word_done = FALSE;
00298         prev_word_score = 0;
00299       }
00300 
00301       /* Add 1 to total score for every joined 1 regardless of context and
00302          rejtn */
00303       for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
00304         current_char_1 = word->best_choice->unichar_string()[i] == '1';
00305         if (prev_char_1 || (current_char_1 && (i > 0)))
00306           total_score++;
00307         prev_char_1 = current_char_1;
00308       }
00309 
00310       /* Add 1 to total score for every joined punctuation regardless of context
00311         and rejtn */
00312       if (tessedit_prefer_joined_punct) {
00313         for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
00314              offset += word->best_choice->unichar_lengths()[i++]) {
00315           current_char_punct =
00316             punct_chars.contains(word->best_choice->unichar_string()[offset]);
00317           if (prev_char_punct || (current_char_punct && i > 0))
00318             total_score++;
00319           prev_char_punct = current_char_punct;
00320         }
00321       }
00322       prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
00323       for (i = 0, offset = 0; i < word_len - 1;
00324            offset += word->best_choice->unichar_lengths()[i++]);
00325       prev_char_1 =
00326           ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
00327            || (!word_done && STRING(conflict_set_I_l_1).contains(
00328                    word->best_choice->unichar_string()[offset])));
00329     }
00330     /* Find next word */
00331     do {
00332       word_res_it.forward();
00333     } while (word_res_it.data()->part_of_combo);
00334   } while (!word_res_it.at_first());
00335   total_score += prev_word_score;
00336   if (prev_word_done)
00337     done_word_count++;
00338   if (done_word_count == word_count)
00339     return PERFECT_WERDS;
00340   else
00341     return total_score;
00342 }
00343 
00344 BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
00345   int i;
00346   int offset;
00347 
00348   for (i = 0, offset = 0; i < char_position;
00349        offset += word->best_choice->unichar_lengths()[i++]);
00350   return (
00351       word->uch_set->get_isdigit(
00352           word->best_choice->unichar_string().string() + offset,
00353           word->best_choice->unichar_lengths()[i]) ||
00354       (word->best_choice->permuter() == NUMBER_PERM &&
00355        STRING(numeric_punctuation).contains(
00356            word->best_choice->unichar_string().string()[offset])));
00357 }
00358 
00359 }  // namespace tesseract
00360 
00361 
00373 void transform_to_next_perm(WERD_RES_LIST &words) {
00374   WERD_RES_IT word_it(&words);
00375   WERD_RES_IT prev_word_it(&words);
00376   WERD_RES *word;
00377   WERD_RES *prev_word;
00378   WERD_RES *combo;
00379   WERD *copy_word;
00380   inT16 prev_right = -MAX_INT16;
00381   TBOX box;
00382   inT16 gap;
00383   inT16 min_gap = MAX_INT16;
00384 
00385   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00386     word = word_it.data();
00387     if (!word->part_of_combo) {
00388       box = word->word->bounding_box();
00389       if (prev_right > -MAX_INT16) {
00390         gap = box.left() - prev_right;
00391         if (gap < min_gap)
00392           min_gap = gap;
00393       }
00394       prev_right = box.right();
00395     }
00396   }
00397   if (min_gap < MAX_INT16) {
00398     prev_right = -MAX_INT16;        // back to start
00399     word_it.set_to_list(&words);
00400     // Note: we can't use cycle_pt due to inserted combos at start of list.
00401     for (; (prev_right == -MAX_INT16) || !word_it.at_first();
00402          word_it.forward()) {
00403       word = word_it.data();
00404       if (!word->part_of_combo) {
00405         box = word->word->bounding_box();
00406         if (prev_right > -MAX_INT16) {
00407           gap = box.left() - prev_right;
00408           if (gap <= min_gap) {
00409             prev_word = prev_word_it.data();
00410             if (prev_word->combination) {
00411               combo = prev_word;
00412             } else {
00413               /* Make a new combination and insert before
00414                * the first word being joined. */
00415               copy_word = new WERD;
00416               *copy_word = *(prev_word->word);
00417               // deep copy
00418               combo = new WERD_RES(copy_word);
00419               combo->combination = TRUE;
00420               combo->x_height = prev_word->x_height;
00421               prev_word->part_of_combo = TRUE;
00422               prev_word_it.add_before_then_move(combo);
00423             }
00424             combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
00425             if (word->combination) {
00426               combo->word->join_on(word->word);
00427               // Move blobs to combo
00428               // old combo no longer needed
00429               delete word_it.extract();
00430             } else {
00431               // Copy current wd to combo
00432               combo->copy_on(word);
00433               word->part_of_combo = TRUE;
00434             }
00435             combo->done = FALSE;
00436             combo->ClearResults();
00437           } else {
00438             prev_word_it = word_it;  // catch up
00439           }
00440         }
00441         prev_right = box.right();
00442       }
00443     }
00444   } else {
00445     words.clear();  // signal termination
00446   }
00447 }
00448 
00449 namespace tesseract {
00450 void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
00451                            inT16 mode, BOOL8 improved) {
00452   WERD_RES_IT word_res_it(&perm);
00453 
00454   if (debug_fix_space_level > 0) {
00455     if (mode == 1) {
00456       stats_.dump_words_str = "";
00457       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
00458            word_res_it.forward()) {
00459         if (!word_res_it.data()->part_of_combo) {
00460           stats_.dump_words_str +=
00461               word_res_it.data()->best_choice->unichar_string();
00462           stats_.dump_words_str += ' ';
00463         }
00464       }
00465     }
00466 
00467     if (debug_fix_space_level > 1) {
00468       switch (mode) {
00469         case 1:
00470           tprintf("EXTRACTED (%d): \"", score);
00471           break;
00472         case 2:
00473           tprintf("TESTED (%d): \"", score);
00474           break;
00475         case 3:
00476           tprintf("RETURNED (%d): \"", score);
00477           break;
00478       }
00479 
00480       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
00481            word_res_it.forward()) {
00482         if (!word_res_it.data()->part_of_combo) {
00483           tprintf("%s/%1d ",
00484                   word_res_it.data()->best_choice->unichar_string().string(),
00485                   (int)word_res_it.data()->best_choice->permuter());
00486         }
00487       }
00488       tprintf("\"\n");
00489     } else if (improved) {
00490       tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
00491       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
00492            word_res_it.forward()) {
00493         if (!word_res_it.data()->part_of_combo) {
00494           tprintf("%s/%1d ",
00495                   word_res_it.data()->best_choice->unichar_string().string(),
00496                   (int)word_res_it.data()->best_choice->permuter());
00497         }
00498       }
00499       tprintf("\"\n");
00500     }
00501   }
00502 }
00503 
00504 BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
00505   if (word->done)
00506     return TRUE;
00507 
00508   /*
00509     Use all the standard pass 2 conditions for mode 5 in set_done() in
00510     reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
00511     CARE WHETHER WE HAVE of/at on/an etc.
00512   */
00513   if (fixsp_done_mode > 0 &&
00514       (word->tess_accepted ||
00515        (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
00516        fixsp_done_mode == 3) &&
00517       (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
00518       ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
00519        (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
00520        (word->best_choice->permuter() == USER_DAWG_PERM) ||
00521        (word->best_choice->permuter() == NUMBER_PERM))) {
00522     return TRUE;
00523   } else {
00524     return FALSE;
00525   }
00526 }
00527 
00528 
00536 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
00537                                BLOCK* block) {
00538   WERD_RES *word_res;
00539   WERD_RES_LIST sub_word_list;
00540   WERD_RES_IT sub_word_list_it(&sub_word_list);
00541   inT16 blob_index;
00542   inT16 new_length;
00543   float junk;
00544 
00545   word_res = word_res_it.data();
00546   if (word_res->word->flag(W_REP_CHAR) ||
00547       word_res->combination ||
00548       word_res->part_of_combo ||
00549       !word_res->word->flag(W_DONT_CHOP))
00550     return;
00551 
00552   blob_index = worst_noise_blob(word_res, &junk);
00553   if (blob_index < 0)
00554     return;
00555 
00556   if (debug_fix_space_level > 1) {
00557     tprintf("FP fixspace working on \"%s\"\n",
00558             word_res->best_choice->unichar_string().string());
00559   }
00560   word_res->word->rej_cblob_list()->sort(c_blob_comparator);
00561   sub_word_list_it.add_after_stay_put(word_res_it.extract());
00562   fix_noisy_space_list(sub_word_list, row, block);
00563   new_length = sub_word_list.length();
00564   word_res_it.add_list_before(&sub_word_list);
00565   for (; !word_res_it.at_last() && new_length > 1; new_length--) {
00566     word_res_it.forward();
00567   }
00568 }
00569 
00570 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
00571                                      BLOCK* block) {
00572   inT16 best_score;
00573   WERD_RES_IT best_perm_it(&best_perm);
00574   WERD_RES_LIST current_perm;
00575   WERD_RES_IT current_perm_it(&current_perm);
00576   WERD_RES *old_word_res;
00577   inT16 current_score;
00578   BOOL8 improved = FALSE;
00579 
00580   best_score = fp_eval_word_spacing(best_perm);  // default score
00581 
00582   dump_words(best_perm, best_score, 1, improved);
00583 
00584   old_word_res = best_perm_it.data();
00585   // Even deep_copy doesn't copy the underlying WERD unless its combination
00586   // flag is true!.
00587   old_word_res->combination = TRUE;   // Kludge to force deep copy
00588   current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
00589   old_word_res->combination = FALSE;  // Undo kludge
00590 
00591   break_noisiest_blob_word(current_perm);
00592 
00593   while (best_score != PERFECT_WERDS && !current_perm.empty()) {
00594     match_current_words(current_perm, row, block);
00595     current_score = fp_eval_word_spacing(current_perm);
00596     dump_words(current_perm, current_score, 2, improved);
00597     if (current_score > best_score) {
00598       best_perm.clear();
00599       best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
00600       best_score = current_score;
00601       improved = TRUE;
00602     }
00603     if (current_score < PERFECT_WERDS) {
00604       break_noisiest_blob_word(current_perm);
00605     }
00606   }
00607   dump_words(best_perm, best_score, 3, improved);
00608 }
00609 
00610 
00616 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
00617   WERD_RES_IT word_it(&words);
00618   WERD_RES_IT worst_word_it;
00619   float worst_noise_score = 9999;
00620   int worst_blob_index = -1;     // Noisiest blob of noisiest wd
00621   int blob_index;                // of wds noisiest blob
00622   float noise_score;             // of wds noisiest blob
00623   WERD_RES *word_res;
00624   C_BLOB_IT blob_it;
00625   C_BLOB_IT rej_cblob_it;
00626   C_BLOB_LIST new_blob_list;
00627   C_BLOB_IT new_blob_it;
00628   C_BLOB_IT new_rej_cblob_it;
00629   WERD *new_word;
00630   inT16 start_of_noise_blob;
00631   inT16 i;
00632 
00633   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00634     blob_index = worst_noise_blob(word_it.data(), &noise_score);
00635     if (blob_index > -1 && worst_noise_score > noise_score) {
00636       worst_noise_score = noise_score;
00637       worst_blob_index = blob_index;
00638       worst_word_it = word_it;
00639     }
00640   }
00641   if (worst_blob_index < 0) {
00642     words.clear();          // signal termination
00643     return;
00644   }
00645 
00646   /* Now split the worst_word_it */
00647 
00648   word_res = worst_word_it.data();
00649 
00650   /* Move blobs before noise blob to a new bloblist */
00651 
00652   new_blob_it.set_to_list(&new_blob_list);
00653   blob_it.set_to_list(word_res->word->cblob_list());
00654   for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
00655     new_blob_it.add_after_then_move(blob_it.extract());
00656   }
00657   start_of_noise_blob = blob_it.data()->bounding_box().left();
00658   delete blob_it.extract();     // throw out noise blob
00659 
00660   new_word = new WERD(&new_blob_list, word_res->word);
00661   new_word->set_flag(W_EOL, FALSE);
00662   word_res->word->set_flag(W_BOL, FALSE);
00663   word_res->word->set_blanks(1);  // After break
00664 
00665   new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
00666   rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
00667   for (;
00668        (!rej_cblob_it.empty() &&
00669         (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
00670        rej_cblob_it.forward()) {
00671     new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
00672   }
00673 
00674   WERD_RES* new_word_res = new WERD_RES(new_word);
00675   new_word_res->combination = TRUE;
00676   worst_word_it.add_before_then_move(new_word_res);
00677 
00678   word_res->ClearResults();
00679 }
00680 
00681 inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
00682                                   float *worst_noise_score) {
00683   float noise_score[512];
00684   int i;
00685   int min_noise_blob;            // 1st contender
00686   int max_noise_blob;            // last contender
00687   int non_noise_count;
00688   int worst_noise_blob;          // Worst blob
00689   float small_limit = kBlnXHeight * fixsp_small_outlines_size;
00690   float non_noise_limit = kBlnXHeight * 0.8;
00691 
00692   if (word_res->rebuild_word == NULL)
00693     return -1;  // Can't handle cube words.
00694 
00695   // Normalised.
00696   int blob_count = word_res->box_word->length();
00697   ASSERT_HOST(blob_count <= 512);
00698   if (blob_count < 5)
00699     return -1;                   // too short to split
00700 
00701   /* Get the noise scores for all blobs */
00702 
00703   #ifndef SECURE_NAMES
00704   if (debug_fix_space_level > 5)
00705     tprintf("FP fixspace Noise metrics for \"%s\": ",
00706             word_res->best_choice->unichar_string().string());
00707   #endif
00708 
00709   for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
00710     TBLOB* blob = word_res->rebuild_word->blobs[i];
00711     if (word_res->reject_map[i].accepted())
00712       noise_score[i] = non_noise_limit;
00713     else
00714       noise_score[i] = blob_noise_score(blob);
00715 
00716     if (debug_fix_space_level > 5)
00717       tprintf("%1.1f ", noise_score[i]);
00718   }
00719   if (debug_fix_space_level > 5)
00720     tprintf("\n");
00721 
00722   /* Now find the worst one which is far enough away from the end of the word */
00723 
00724   non_noise_count = 0;
00725   for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
00726     if (noise_score[i] >= non_noise_limit) {
00727       non_noise_count++;
00728     }
00729   }
00730   if (non_noise_count < fixsp_non_noise_limit)
00731     return -1;
00732 
00733   min_noise_blob = i;
00734 
00735   non_noise_count = 0;
00736   for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
00737        i--) {
00738     if (noise_score[i] >= non_noise_limit) {
00739       non_noise_count++;
00740     }
00741   }
00742   if (non_noise_count < fixsp_non_noise_limit)
00743     return -1;
00744 
00745   max_noise_blob = i;
00746 
00747   if (min_noise_blob > max_noise_blob)
00748     return -1;
00749 
00750   *worst_noise_score = small_limit;
00751   worst_noise_blob = -1;
00752   for (i = min_noise_blob; i <= max_noise_blob; i++) {
00753     if (noise_score[i] < *worst_noise_score) {
00754       worst_noise_blob = i;
00755       *worst_noise_score = noise_score[i];
00756     }
00757   }
00758   return worst_noise_blob;
00759 }
00760 
00761 float Tesseract::blob_noise_score(TBLOB *blob) {
00762   TBOX box;                       // BB of outline
00763   inT16 outline_count = 0;
00764   inT16 max_dimension;
00765   inT16 largest_outline_dimension = 0;
00766 
00767   for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) {
00768     outline_count++;
00769     box = ol->bounding_box();
00770     if (box.height() > box.width()) {
00771       max_dimension = box.height();
00772     } else {
00773       max_dimension = box.width();
00774     }
00775 
00776     if (largest_outline_dimension < max_dimension)
00777       largest_outline_dimension = max_dimension;
00778   }
00779 
00780   if (outline_count > 5) {
00781     // penalise LOTS of blobs
00782     largest_outline_dimension *= 2;
00783   }
00784 
00785   box = blob->bounding_box();
00786   if (box.bottom() > kBlnBaselineOffset * 4 ||
00787       box.top() < kBlnBaselineOffset / 2) {
00788     // Lax blob is if high or low
00789     largest_outline_dimension /= 2;
00790   }
00791 
00792   return largest_outline_dimension;
00793 }
00794 }  // namespace tesseract
00795 
00796 void fixspace_dbg(WERD_RES *word) {
00797   TBOX box = word->word->bounding_box();
00798   BOOL8 show_map_detail = FALSE;
00799   inT16 i;
00800 
00801   box.print();
00802   tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
00803   tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
00804           word->word->cblob_list()->length(),
00805           word->rebuild_word->NumBlobs(),
00806           word->box_word->length());
00807   word->reject_map.print(debug_fp);
00808   tprintf("\n");
00809   if (show_map_detail) {
00810     tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
00811     for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
00812       tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
00813       word->reject_map[i].full_print(debug_fp);
00814     }
00815   }
00816 
00817   tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
00818   tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
00819 }
00820 
00821 
00830 namespace tesseract {
00831 inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
00832   WERD_RES_IT word_it(&word_res_list);
00833   WERD_RES *word;
00834   inT16 score = 0;
00835   inT16 i;
00836   float small_limit = kBlnXHeight * fixsp_small_outlines_size;
00837 
00838   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00839     word = word_it.data();
00840     if (word->rebuild_word == NULL)
00841       continue;  // Can't handle cube words.
00842     if (word->done ||
00843         word->tess_accepted ||
00844         word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
00845         word->best_choice->permuter() == FREQ_DAWG_PERM ||
00846         word->best_choice->permuter() == USER_DAWG_PERM ||
00847         safe_dict_word(word) > 0) {
00848       int num_blobs = word->rebuild_word->NumBlobs();
00849       UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
00850       for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
00851         TBLOB* blob = word->rebuild_word->blobs[i];
00852         if (word->best_choice->unichar_id(i) == space ||
00853             blob_noise_score(blob) < small_limit) {
00854           score -= 1;  // penalise possibly erroneous non-space
00855         } else if (word->reject_map[i].accepted()) {
00856           score++;
00857         }
00858       }
00859     }
00860   }
00861   if (score < 0)
00862     score = 0;
00863   return score;
00864 }
00865 
00866 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines