tesseract 3.04.01

ccmain/docqual.cpp

Go to the documentation of this file.
00001 /******************************************************************
00002  * File:        docqual.cpp  (Formerly docqual.c)
00003  * Description: Document Quality Metrics
00004  * Author:              Phil Cheatle
00005  * Created:             Mon May  9 11:27:28 BST 1994
00006  *
00007  * (C) Copyright 1994, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include          <ctype.h>
00025 #include          "docqual.h"
00026 #include          "reject.h"
00027 #include          "tesscallback.h"
00028 #include          "tessvars.h"
00029 #include          "globals.h"
00030 #include          "tesseractclass.h"
00031 
00032 namespace tesseract{
00033 
00034 // A little class to provide the callbacks as we have no pre-bound args.
00035 struct DocQualCallbacks {
00036   explicit DocQualCallbacks(WERD_RES* word0)
00037     : word(word0), match_count(0), accepted_match_count(0) {}
00038 
00039   void CountMatchingBlobs(int index) {
00040     ++match_count;
00041   }
00042 
00043   void CountAcceptedBlobs(int index) {
00044     if (word->reject_map[index].accepted())
00045       ++accepted_match_count;
00046     ++match_count;
00047   }
00048 
00049   void AcceptIfGoodQuality(int index) {
00050     if (word->reject_map[index].accept_if_good_quality())
00051       word->reject_map[index].setrej_quality_accept();
00052   }
00053 
00054   WERD_RES* word;
00055   inT16 match_count;
00056   inT16 accepted_match_count;
00057 };
00058 
00059 /*************************************************************************
00060  * word_blob_quality()
00061  * How many blobs in the box_word are identical to those of the inword?
00062  * ASSUME blobs in both initial word and box_word are in ascending order of
00063  * left hand blob edge.
00064  *************************************************************************/
00065 inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {
00066   if (word->bln_boxes == NULL ||
00067       word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
00068     return 0;
00069 
00070   DocQualCallbacks cb(word);
00071   word->bln_boxes->ProcessMatchedBlobs(
00072       *word->rebuild_word,
00073       NewPermanentTessCallback(&cb, &DocQualCallbacks::CountMatchingBlobs));
00074   return cb.match_count;
00075 }
00076 
00077 inT16 Tesseract::word_outline_errs(WERD_RES *word) {
00078   inT16 i = 0;
00079   inT16 err_count = 0;
00080 
00081   if (word->rebuild_word != NULL) {
00082     for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
00083       TBLOB* blob = word->rebuild_word->blobs[b];
00084       err_count += count_outline_errs(word->best_choice->unichar_string()[i],
00085                                       blob->NumOutlines());
00086       i++;
00087     }
00088   }
00089   return err_count;
00090 }
00091 
00092 /*************************************************************************
00093  * word_char_quality()
00094  * Combination of blob quality and outline quality - how many good chars are
00095  * there? - I.e chars which pass the blob AND outline tests.
00096  *************************************************************************/
00097 void Tesseract::word_char_quality(WERD_RES *word,
00098                                   ROW *row,
00099                                   inT16 *match_count,
00100                                   inT16 *accepted_match_count) {
00101   if (word->bln_boxes == NULL ||
00102       word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
00103     return;
00104 
00105   DocQualCallbacks cb(word);
00106   word->bln_boxes->ProcessMatchedBlobs(
00107       *word->rebuild_word,
00108       NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs));
00109   *match_count = cb.match_count;
00110   *accepted_match_count = cb.accepted_match_count;
00111 }
00112 
00113 /*************************************************************************
00114  * unrej_good_chs()
00115  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
00116  *************************************************************************/
00117 void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {
00118   if (word->bln_boxes == NULL ||
00119       word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
00120     return;
00121 
00122   DocQualCallbacks cb(word);
00123   word->bln_boxes->ProcessMatchedBlobs(
00124       *word->rebuild_word,
00125       NewPermanentTessCallback(&cb, &DocQualCallbacks::AcceptIfGoodQuality));
00126 }
00127 
00128 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
00129   int expected_outline_count;
00130 
00131   if (STRING (outlines_odd).contains (c))
00132     return 0;                    //Don't use this char
00133   else if (STRING (outlines_2).contains (c))
00134     expected_outline_count = 2;
00135   else
00136     expected_outline_count = 1;
00137   return abs (outline_count - expected_outline_count);
00138 }
00139 
00140 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
00141                                         BOOL8 good_quality_doc) {
00142   if ((tessedit_good_quality_unrej && good_quality_doc))
00143     unrej_good_quality_words(page_res_it);
00144   doc_and_block_rejection(page_res_it, good_quality_doc);
00145   if (unlv_tilde_crunching) {
00146     tilde_crunch(page_res_it);
00147     tilde_delete(page_res_it);
00148   }
00149 }
00150 
00151 
00152 /*************************************************************************
00153  * unrej_good_quality_words()
00154  * Accept potential rejects in words which pass the following checks:
00155  *    - Contains a potential reject
00156  *    - Word looks like a sensible alpha word.
00157  *    - Word segmentation is the same as the original image
00158  *              - All characters have the expected number of outlines
00159  * NOTE - the rejection counts are recalculated after unrejection
00160  *      - CAN'T do it in a single pass without a bit of fiddling
00161  *              - keep it simple but inefficient
00162  *************************************************************************/
00163 void Tesseract::unrej_good_quality_words(  //unreject potential
00164                                          PAGE_RES_IT &page_res_it) {
00165   WERD_RES *word;
00166   ROW_RES *current_row;
00167   BLOCK_RES *current_block;
00168   int i;
00169 
00170   page_res_it.restart_page ();
00171   while (page_res_it.word () != NULL) {
00172     check_debug_pt (page_res_it.word (), 100);
00173     if (bland_unrej) {
00174       word = page_res_it.word ();
00175       for (i = 0; i < word->reject_map.length (); i++) {
00176         if (word->reject_map[i].accept_if_good_quality ())
00177           word->reject_map[i].setrej_quality_accept ();
00178       }
00179       page_res_it.forward ();
00180     }
00181     else if ((page_res_it.row ()->char_count > 0) &&
00182       ((page_res_it.row ()->rej_count /
00183       (float) page_res_it.row ()->char_count) <=
00184     quality_rowrej_pc)) {
00185       word = page_res_it.word ();
00186       if (word->reject_map.quality_recoverable_rejects() &&
00187           (tessedit_unrej_any_wd ||
00188            acceptable_word_string(*word->uch_set,
00189                                   word->best_choice->unichar_string().string(),
00190                                   word->best_choice->unichar_lengths().string())
00191                != AC_UNACCEPTABLE)) {
00192         unrej_good_chs(word, page_res_it.row ()->row);
00193       }
00194       page_res_it.forward ();
00195     }
00196     else {
00197       /* Skip to end of dodgy row */
00198       current_row = page_res_it.row ();
00199       while ((page_res_it.word () != NULL) &&
00200         (page_res_it.row () == current_row))
00201         page_res_it.forward ();
00202     }
00203     check_debug_pt (page_res_it.word (), 110);
00204   }
00205   page_res_it.restart_page ();
00206   page_res_it.page_res->char_count = 0;
00207   page_res_it.page_res->rej_count = 0;
00208   current_block = NULL;
00209   current_row = NULL;
00210   while (page_res_it.word () != NULL) {
00211     if (current_block != page_res_it.block ()) {
00212       current_block = page_res_it.block ();
00213       current_block->char_count = 0;
00214       current_block->rej_count = 0;
00215     }
00216     if (current_row != page_res_it.row ()) {
00217       current_row = page_res_it.row ();
00218       current_row->char_count = 0;
00219       current_row->rej_count = 0;
00220       current_row->whole_word_rej_count = 0;
00221     }
00222     page_res_it.rej_stat_word ();
00223     page_res_it.forward ();
00224   }
00225 }
00226 
00227 
00228 /*************************************************************************
00229  * doc_and_block_rejection()
00230  *
00231  * If the page has too many rejects - reject all of it.
00232  * If any block has too many rejects - reject all words in the block
00233  *************************************************************************/
00234 
00235 void Tesseract::doc_and_block_rejection(  //reject big chunks
00236                                         PAGE_RES_IT &page_res_it,
00237                                         BOOL8 good_quality_doc) {
00238   inT16 block_no = 0;
00239   inT16 row_no = 0;
00240   BLOCK_RES *current_block;
00241   ROW_RES *current_row;
00242 
00243   BOOL8 rej_word;
00244   BOOL8 prev_word_rejected;
00245   inT16 char_quality = 0;
00246   inT16 accepted_char_quality;
00247 
00248   if (page_res_it.page_res->rej_count * 100.0 /
00249       page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
00250     reject_whole_page(page_res_it);
00251     if (tessedit_debug_doc_rejection) {
00252       tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
00253               page_res_it.page_res->char_count,
00254               page_res_it.page_res->rej_count);
00255     }
00256   } else {
00257     if (tessedit_debug_doc_rejection) {
00258       tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
00259               page_res_it.page_res->char_count,
00260               page_res_it.page_res->rej_count);
00261     }
00262 
00263     /* Walk blocks testing for block rejection */
00264 
00265     page_res_it.restart_page();
00266     WERD_RES* word;
00267     while ((word = page_res_it.word()) != NULL) {
00268       current_block = page_res_it.block();
00269       block_no = current_block->block->index();
00270       if (current_block->char_count > 0 &&
00271           (current_block->rej_count * 100.0 / current_block->char_count) >
00272            tessedit_reject_block_percent) {
00273         if (tessedit_debug_block_rejection) {
00274           tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
00275                   block_no, current_block->char_count,
00276                   current_block->rej_count);
00277         }
00278         prev_word_rejected = FALSE;
00279         while ((word = page_res_it.word()) != NULL &&
00280                (page_res_it.block() == current_block)) {
00281           if (tessedit_preserve_blk_rej_perfect_wds) {
00282             rej_word = word->reject_map.reject_count() > 0 ||
00283                 word->reject_map.length () < tessedit_preserve_min_wd_len;
00284             if (rej_word && tessedit_dont_blkrej_good_wds &&
00285                 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
00286                 acceptable_word_string(
00287                     *word->uch_set,
00288                     word->best_choice->unichar_string().string(),
00289                     word->best_choice->unichar_lengths().string()) !=
00290                 AC_UNACCEPTABLE) {
00291               word_char_quality(word, page_res_it.row()->row,
00292                                 &char_quality,
00293                                 &accepted_char_quality);
00294               rej_word = char_quality !=  word->reject_map.length();
00295             }
00296           } else {
00297             rej_word = TRUE;
00298           }
00299           if (rej_word) {
00300             /*
00301               Reject spacing if both current and prev words are rejected.
00302               NOTE - this is NOT restricted to FUZZY spaces. - When tried this
00303               generated more space errors.
00304             */
00305             if (tessedit_use_reject_spaces &&
00306                 prev_word_rejected &&
00307                 page_res_it.prev_row() == page_res_it.row() &&
00308                 word->word->space() == 1)
00309               word->reject_spaces = TRUE;
00310             word->reject_map.rej_word_block_rej();
00311           }
00312           prev_word_rejected = rej_word;
00313           page_res_it.forward();
00314         }
00315       } else {
00316         if (tessedit_debug_block_rejection) {
00317           tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
00318                   block_no, page_res_it.block()->char_count,
00319                   page_res_it.block()->rej_count);
00320         }
00321 
00322         /* Walk rows in block testing for row rejection */
00323         row_no = 0;
00324         while (page_res_it.word() != NULL &&
00325                page_res_it.block() == current_block) {
00326           current_row = page_res_it.row();
00327           row_no++;
00328           /* Reject whole row if:
00329             fraction of chars on row which are rejected exceed a limit AND
00330             fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
00331             limit
00332           */
00333           if (current_row->char_count > 0 &&
00334               (current_row->rej_count * 100.0 / current_row->char_count) >
00335               tessedit_reject_row_percent &&
00336               (current_row->whole_word_rej_count * 100.0 /
00337                   current_row->rej_count) <
00338               tessedit_whole_wd_rej_row_percent) {
00339             if (tessedit_debug_block_rejection) {
00340               tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
00341                       row_no, current_row->char_count,
00342                       current_row->rej_count);
00343             }
00344             prev_word_rejected = FALSE;
00345             while ((word = page_res_it.word()) != NULL &&
00346                    page_res_it.row () == current_row) {
00347               /* Preserve words on good docs unless they are mostly rejected*/
00348               if (!tessedit_row_rej_good_docs && good_quality_doc) {
00349                 rej_word = word->reject_map.reject_count() /
00350                     static_cast<float>(word->reject_map.length()) >
00351                     tessedit_good_doc_still_rowrej_wd;
00352               } else if (tessedit_preserve_row_rej_perfect_wds) {
00353                 /* Preserve perfect words anyway */
00354                 rej_word = word->reject_map.reject_count() > 0 ||
00355                     word->reject_map.length () < tessedit_preserve_min_wd_len;
00356                 if (rej_word && tessedit_dont_rowrej_good_wds &&
00357                     word->reject_map.length() >= tessedit_preserve_min_wd_len &&
00358                     acceptable_word_string(*word->uch_set,
00359                         word->best_choice->unichar_string().string(),
00360                         word->best_choice->unichar_lengths().string()) !=
00361                             AC_UNACCEPTABLE) {
00362                   word_char_quality(word, page_res_it.row()->row,
00363                                     &char_quality,
00364                                     &accepted_char_quality);
00365                   rej_word = char_quality != word->reject_map.length();
00366                 }
00367               } else {
00368                 rej_word = TRUE;
00369               }
00370               if (rej_word) {
00371                 /*
00372                   Reject spacing if both current and prev words are rejected.
00373                   NOTE - this is NOT restricted to FUZZY spaces. - When tried
00374                   this generated more space errors.
00375                 */
00376                 if (tessedit_use_reject_spaces &&
00377                     prev_word_rejected &&
00378                     page_res_it.prev_row() == page_res_it.row() &&
00379                     word->word->space () == 1)
00380                   word->reject_spaces = TRUE;
00381                 word->reject_map.rej_word_row_rej();
00382               }
00383               prev_word_rejected = rej_word;
00384               page_res_it.forward();
00385             }
00386           } else {
00387             if (tessedit_debug_block_rejection) {
00388               tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
00389                       row_no, current_row->char_count, current_row->rej_count);
00390             }
00391             while (page_res_it.word() != NULL &&
00392                    page_res_it.row() == current_row)
00393               page_res_it.forward();
00394           }
00395         }
00396       }
00397     }
00398   }
00399 }
00400 
00401 }  // namespace tesseract
00402 
00403 
00404 /*************************************************************************
00405  * reject_whole_page()
00406  * Don't believe any of it - set the reject map to 00..00 in all words
00407  *
00408  *************************************************************************/
00409 
00410 void reject_whole_page(PAGE_RES_IT &page_res_it) {
00411   page_res_it.restart_page ();
00412   while (page_res_it.word () != NULL) {
00413     page_res_it.word ()->reject_map.rej_word_doc_rej ();
00414     page_res_it.forward ();
00415   }
00416                                  //whole page is rejected
00417   page_res_it.page_res->rejected = TRUE;
00418 }
00419 
00420 namespace tesseract {
00421 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
00422   WERD_RES *word;
00423   GARBAGE_LEVEL garbage_level;
00424   PAGE_RES_IT copy_it;
00425   BOOL8 prev_potential_marked = FALSE;
00426   BOOL8 found_terrible_word = FALSE;
00427   BOOL8 ok_dict_word;
00428 
00429   page_res_it.restart_page();
00430   while (page_res_it.word() != NULL) {
00431     POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
00432     if (pb != NULL && !pb->IsText()) {
00433       page_res_it.forward();
00434       continue;
00435     }
00436     word = page_res_it.word();
00437 
00438     if (crunch_early_convert_bad_unlv_chs)
00439       convert_bad_unlv_chs(word);
00440 
00441     if (crunch_early_merge_tess_fails)
00442       word->merge_tess_fails();
00443 
00444     if (word->reject_map.accept_count () != 0) {
00445       found_terrible_word = FALSE;
00446                                  //Forget earlier potential crunches
00447       prev_potential_marked = FALSE;
00448     }
00449     else {
00450       ok_dict_word = safe_dict_word(word);
00451       garbage_level = garbage_word (word, ok_dict_word);
00452 
00453       if ((garbage_level != G_NEVER_CRUNCH) &&
00454       (terrible_word_crunch (word, garbage_level))) {
00455         if (crunch_debug > 0) {
00456           tprintf ("T CRUNCHING: \"%s\"\n",
00457             word->best_choice->unichar_string().string());
00458         }
00459         word->unlv_crunch_mode = CR_KEEP_SPACE;
00460         if (prev_potential_marked) {
00461           while (copy_it.word () != word) {
00462             if (crunch_debug > 0) {
00463               tprintf ("P1 CRUNCHING: \"%s\"\n",
00464                 copy_it.word()->best_choice->unichar_string().string());
00465             }
00466             copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
00467             copy_it.forward ();
00468           }
00469           prev_potential_marked = FALSE;
00470         }
00471         found_terrible_word = TRUE;
00472       }
00473       else if ((garbage_level != G_NEVER_CRUNCH) &&
00474         (potential_word_crunch (word,
00475       garbage_level, ok_dict_word))) {
00476         if (found_terrible_word) {
00477           if (crunch_debug > 0) {
00478             tprintf ("P2 CRUNCHING: \"%s\"\n",
00479               word->best_choice->unichar_string().string());
00480           }
00481           word->unlv_crunch_mode = CR_KEEP_SPACE;
00482         }
00483         else if (!prev_potential_marked) {
00484           copy_it = page_res_it;
00485           prev_potential_marked = TRUE;
00486           if (crunch_debug > 1) {
00487             tprintf ("P3 CRUNCHING: \"%s\"\n",
00488               word->best_choice->unichar_string().string());
00489           }
00490         }
00491       }
00492       else {
00493         found_terrible_word = FALSE;
00494                                  //Forget earlier potential crunches
00495         prev_potential_marked = FALSE;
00496         if (crunch_debug > 2) {
00497           tprintf ("NO CRUNCH: \"%s\"\n",
00498             word->best_choice->unichar_string().string());
00499         }
00500       }
00501     }
00502     page_res_it.forward ();
00503   }
00504 }
00505 
00506 
00507 BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word,
00508                                       GARBAGE_LEVEL garbage_level) {
00509   float rating_per_ch;
00510   int adjusted_len;
00511   int crunch_mode = 0;
00512 
00513   if ((word->best_choice->unichar_string().length () == 0) ||
00514     (strspn (word->best_choice->unichar_string().string(), " ") ==
00515     word->best_choice->unichar_string().length ()))
00516     crunch_mode = 1;
00517   else {
00518     adjusted_len = word->reject_map.length ();
00519     if (adjusted_len > crunch_rating_max)
00520       adjusted_len = crunch_rating_max;
00521     rating_per_ch = word->best_choice->rating () / adjusted_len;
00522 
00523     if (rating_per_ch > crunch_terrible_rating)
00524       crunch_mode = 2;
00525     else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
00526       crunch_mode = 3;
00527     else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
00528       (garbage_level != G_OK))
00529       crunch_mode = 4;
00530     else if ((rating_per_ch > crunch_poor_garbage_rate) &&
00531       (garbage_level != G_OK))
00532       crunch_mode = 5;
00533   }
00534   if (crunch_mode > 0) {
00535     if (crunch_debug > 2) {
00536       tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
00537         crunch_mode, word->best_choice->unichar_string().string());
00538     }
00539     return TRUE;
00540   }
00541   else
00542     return FALSE;
00543 }
00544 
00545 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word,
00546                                        GARBAGE_LEVEL garbage_level,
00547                                        BOOL8 ok_dict_word) {
00548   float rating_per_ch;
00549   int adjusted_len;
00550   const char *str = word->best_choice->unichar_string().string();
00551   const char *lengths = word->best_choice->unichar_lengths().string();
00552   BOOL8 word_crunchable;
00553   int poor_indicator_count = 0;
00554 
00555   word_crunchable = !crunch_leave_accept_strings ||
00556                     word->reject_map.length() < 3 ||
00557                     (acceptable_word_string(*word->uch_set,
00558                                             str, lengths) == AC_UNACCEPTABLE &&
00559                      !ok_dict_word);
00560 
00561   adjusted_len = word->reject_map.length();
00562   if (adjusted_len > 10)
00563     adjusted_len = 10;
00564   rating_per_ch = word->best_choice->rating() / adjusted_len;
00565 
00566   if (rating_per_ch > crunch_pot_poor_rate) {
00567     if (crunch_debug > 2) {
00568       tprintf("Potential poor rating on \"%s\"\n",
00569               word->best_choice->unichar_string().string());
00570     }
00571     poor_indicator_count++;
00572   }
00573 
00574   if (word_crunchable &&
00575       word->best_choice->certainty() < crunch_pot_poor_cert) {
00576     if (crunch_debug > 2) {
00577       tprintf("Potential poor cert on \"%s\"\n",
00578               word->best_choice->unichar_string().string());
00579     }
00580     poor_indicator_count++;
00581   }
00582 
00583   if (garbage_level != G_OK) {
00584     if (crunch_debug > 2) {
00585       tprintf("Potential garbage on \"%s\"\n",
00586               word->best_choice->unichar_string().string());
00587     }
00588     poor_indicator_count++;
00589   }
00590   return poor_indicator_count >= crunch_pot_indicators;
00591 }
00592 
00593 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
00594   WERD_RES *word;
00595   PAGE_RES_IT copy_it;
00596   BOOL8 deleting_from_bol = FALSE;
00597   BOOL8 marked_delete_point = FALSE;
00598   inT16 debug_delete_mode;
00599   CRUNCH_MODE delete_mode;
00600   inT16 x_debug_delete_mode;
00601   CRUNCH_MODE x_delete_mode;
00602 
00603   page_res_it.restart_page();
00604   while (page_res_it.word() != NULL) {
00605     word = page_res_it.word();
00606 
00607     delete_mode = word_deletable (word, debug_delete_mode);
00608     if (delete_mode != CR_NONE) {
00609       if (word->word->flag (W_BOL) || deleting_from_bol) {
00610         if (crunch_debug > 0) {
00611           tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
00612             debug_delete_mode,
00613             word->best_choice->unichar_string().string());
00614         }
00615         word->unlv_crunch_mode = delete_mode;
00616         deleting_from_bol = TRUE;
00617       } else if (word->word->flag(W_EOL)) {
00618         if (marked_delete_point) {
00619           while (copy_it.word() != word) {
00620             x_delete_mode = word_deletable (copy_it.word (),
00621               x_debug_delete_mode);
00622             if (crunch_debug > 0) {
00623               tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
00624                 x_debug_delete_mode,
00625                 copy_it.word()->best_choice->unichar_string().string());
00626             }
00627             copy_it.word ()->unlv_crunch_mode = x_delete_mode;
00628             copy_it.forward ();
00629           }
00630         }
00631         if (crunch_debug > 0) {
00632           tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
00633             debug_delete_mode,
00634             word->best_choice->unichar_string().string());
00635         }
00636         word->unlv_crunch_mode = delete_mode;
00637         deleting_from_bol = FALSE;
00638         marked_delete_point = FALSE;
00639       }
00640       else {
00641         if (!marked_delete_point) {
00642           copy_it = page_res_it;
00643           marked_delete_point = TRUE;
00644         }
00645       }
00646     }
00647     else {
00648       deleting_from_bol = FALSE;
00649                                  //Forget earlier potential crunches
00650       marked_delete_point = FALSE;
00651     }
00652     /*
00653       The following step has been left till now as the tess fails are used to
00654       determine if the word is deletable.
00655     */
00656     if (!crunch_early_merge_tess_fails)
00657       word->merge_tess_fails();
00658     page_res_it.forward ();
00659   }
00660 }
00661 
00662 
00663 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
00664   int i;
00665   UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
00666   UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
00667   UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
00668   UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
00669   for (i = 0; i < word_res->reject_map.length(); ++i) {
00670     if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
00671       word_res->best_choice->set_unichar_id(unichar_dash, i);
00672       if (word_res->reject_map[i].accepted ())
00673         word_res->reject_map[i].setrej_unlv_rej ();
00674     }
00675     if (word_res->best_choice->unichar_id(i) == unichar_pow) {
00676       word_res->best_choice->set_unichar_id(unichar_space, i);
00677       if (word_res->reject_map[i].accepted ())
00678         word_res->reject_map[i].setrej_unlv_rej ();
00679     }
00680   }
00681 }
00682 
00683 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
00684   enum STATES
00685   {
00686     JUNK,
00687     FIRST_UPPER,
00688     FIRST_LOWER,
00689     FIRST_NUM,
00690     SUBSEQUENT_UPPER,
00691     SUBSEQUENT_LOWER,
00692     SUBSEQUENT_NUM
00693   };
00694   const char *str = word->best_choice->unichar_string().string();
00695   const char *lengths = word->best_choice->unichar_lengths().string();
00696   STATES state = JUNK;
00697   int len = 0;
00698   int isolated_digits = 0;
00699   int isolated_alphas = 0;
00700   int bad_char_count = 0;
00701   int tess_rejs = 0;
00702   int dodgy_chars = 0;
00703   int ok_chars;
00704   UNICHAR_ID last_char = -1;
00705   int alpha_repetition_count = 0;
00706   int longest_alpha_repetition_count = 0;
00707   int longest_lower_run_len = 0;
00708   int lower_string_count = 0;
00709   int longest_upper_run_len = 0;
00710   int upper_string_count = 0;
00711   int total_alpha_count = 0;
00712   int total_digit_count = 0;
00713 
00714   for (; *str != '\0'; str += *(lengths++)) {
00715     len++;
00716     if (word->uch_set->get_isupper (str, *lengths)) {
00717       total_alpha_count++;
00718       switch (state) {
00719         case SUBSEQUENT_UPPER:
00720         case FIRST_UPPER:
00721           state = SUBSEQUENT_UPPER;
00722           upper_string_count++;
00723           if (longest_upper_run_len < upper_string_count)
00724             longest_upper_run_len = upper_string_count;
00725           if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
00726             alpha_repetition_count++;
00727             if (longest_alpha_repetition_count < alpha_repetition_count) {
00728               longest_alpha_repetition_count = alpha_repetition_count;
00729             }
00730           }
00731           else {
00732             last_char = word->uch_set->unichar_to_id(str, *lengths);
00733             alpha_repetition_count = 1;
00734           }
00735           break;
00736         case FIRST_NUM:
00737           isolated_digits++;
00738         default:
00739           state = FIRST_UPPER;
00740           last_char = word->uch_set->unichar_to_id(str, *lengths);
00741           alpha_repetition_count = 1;
00742           upper_string_count = 1;
00743           break;
00744       }
00745     }
00746     else if (word->uch_set->get_islower (str, *lengths)) {
00747       total_alpha_count++;
00748       switch (state) {
00749         case SUBSEQUENT_LOWER:
00750         case FIRST_LOWER:
00751           state = SUBSEQUENT_LOWER;
00752           lower_string_count++;
00753           if (longest_lower_run_len < lower_string_count)
00754             longest_lower_run_len = lower_string_count;
00755           if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
00756             alpha_repetition_count++;
00757             if (longest_alpha_repetition_count < alpha_repetition_count) {
00758               longest_alpha_repetition_count = alpha_repetition_count;
00759             }
00760           }
00761           else {
00762             last_char = word->uch_set->unichar_to_id(str, *lengths);
00763             alpha_repetition_count = 1;
00764           }
00765           break;
00766         case FIRST_NUM:
00767           isolated_digits++;
00768         default:
00769           state = FIRST_LOWER;
00770           last_char = word->uch_set->unichar_to_id(str, *lengths);
00771           alpha_repetition_count = 1;
00772           lower_string_count = 1;
00773           break;
00774       }
00775     }
00776     else if (word->uch_set->get_isdigit (str, *lengths)) {
00777       total_digit_count++;
00778       switch (state) {
00779         case FIRST_NUM:
00780           state = SUBSEQUENT_NUM;
00781         case SUBSEQUENT_NUM:
00782           break;
00783         case FIRST_UPPER:
00784         case FIRST_LOWER:
00785           isolated_alphas++;
00786         default:
00787           state = FIRST_NUM;
00788           break;
00789       }
00790     }
00791     else {
00792       if (*lengths == 1 && *str == ' ')
00793         tess_rejs++;
00794       else
00795         bad_char_count++;
00796       switch (state) {
00797         case FIRST_NUM:
00798           isolated_digits++;
00799           break;
00800         case FIRST_UPPER:
00801         case FIRST_LOWER:
00802           isolated_alphas++;
00803         default:
00804           break;
00805       }
00806       state = JUNK;
00807     }
00808   }
00809 
00810   switch (state) {
00811     case FIRST_NUM:
00812       isolated_digits++;
00813       break;
00814     case FIRST_UPPER:
00815     case FIRST_LOWER:
00816       isolated_alphas++;
00817     default:
00818       break;
00819   }
00820 
00821   if (crunch_include_numerals) {
00822     total_alpha_count += total_digit_count - isolated_digits;
00823   }
00824 
00825   if (crunch_leave_ok_strings && len >= 4 &&
00826       2 * (total_alpha_count - isolated_alphas) > len &&
00827       longest_alpha_repetition_count < crunch_long_repetitions) {
00828     if ((crunch_accept_ok &&
00829          acceptable_word_string(*word->uch_set, str, lengths) !=
00830              AC_UNACCEPTABLE) ||
00831         longest_lower_run_len > crunch_leave_lc_strings ||
00832         longest_upper_run_len > crunch_leave_uc_strings)
00833       return G_NEVER_CRUNCH;
00834   }
00835   if (word->reject_map.length() > 1 &&
00836       strpbrk(str, " ") == NULL &&
00837       (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
00838        word->best_choice->permuter() == FREQ_DAWG_PERM ||
00839        word->best_choice->permuter() == USER_DAWG_PERM ||
00840        word->best_choice->permuter() == NUMBER_PERM ||
00841        acceptable_word_string(*word->uch_set, str, lengths) !=
00842            AC_UNACCEPTABLE || ok_dict_word))
00843     return G_OK;
00844 
00845   ok_chars = len - bad_char_count - isolated_digits -
00846     isolated_alphas - tess_rejs;
00847 
00848   if (crunch_debug > 3) {
00849     tprintf("garbage_word: \"%s\"\n",
00850             word->best_choice->unichar_string().string());
00851     tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
00852             len,
00853             bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
00854   }
00855   if (bad_char_count == 0 &&
00856       tess_rejs == 0 &&
00857       (len > isolated_digits + isolated_alphas || len <= 2))
00858     return G_OK;
00859 
00860   if (tess_rejs > ok_chars ||
00861       (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
00862     return G_TERRIBLE;
00863 
00864   if (len > 4) {
00865     dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
00866         isolated_alphas;
00867     if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
00868       return G_DODGY;
00869     else
00870       return G_OK;
00871   } else {
00872     dodgy_chars = 2 * tess_rejs + bad_char_count;
00873     if ((len == 4 && dodgy_chars > 2) ||
00874         (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
00875       return G_DODGY;
00876     else
00877       return G_OK;
00878   }
00879 }
00880 
00881 
00882 /*************************************************************************
00883  * word_deletable()
00884  *     DELETE WERDS AT ENDS OF ROWS IF
00885  *        Word is crunched &&
00886  *        ( string length = 0                                          OR
00887  *          > 50% of chars are "|" (before merging)                    OR
00888  *          certainty < -10                                            OR
00889  *          rating /char > 60                                          OR
00890  *          TOP of word is more than 0.5 xht BELOW baseline            OR
00891  *          BOTTOM of word is more than 0.5 xht ABOVE xht              OR
00892  *          length of word < 3xht                                      OR
00893  *          height of word < 0.7 xht                                   OR
00894  *          height of word > 3.0 xht                                   OR
00895  *          >75% of the outline BBs have longest dimension < 0.5xht
00896  *************************************************************************/
00897 
00898 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) {
00899   int word_len = word->reject_map.length ();
00900   float rating_per_ch;
00901   TBOX box;                       //BB of word
00902 
00903   if (word->unlv_crunch_mode == CR_NONE) {
00904     delete_mode = 0;
00905     return CR_NONE;
00906   }
00907 
00908   if (word_len == 0) {
00909     delete_mode = 1;
00910     return CR_DELETE;
00911   }
00912 
00913   if (word->rebuild_word != NULL) {
00914     // Cube leaves rebuild_word NULL.
00915     box = word->rebuild_word->bounding_box();
00916     if (box.height () < crunch_del_min_ht * kBlnXHeight) {
00917       delete_mode = 4;
00918       return CR_DELETE;
00919     }
00920 
00921     if (noise_outlines(word->rebuild_word)) {
00922       delete_mode = 5;
00923       return CR_DELETE;
00924     }
00925   }
00926 
00927   if ((failure_count (word) * 1.5) > word_len) {
00928     delete_mode = 2;
00929     return CR_LOOSE_SPACE;
00930   }
00931 
00932   if (word->best_choice->certainty () < crunch_del_cert) {
00933     delete_mode = 7;
00934     return CR_LOOSE_SPACE;
00935   }
00936 
00937   rating_per_ch = word->best_choice->rating () / word_len;
00938 
00939   if (rating_per_ch > crunch_del_rating) {
00940     delete_mode = 8;
00941     return CR_LOOSE_SPACE;
00942   }
00943 
00944   if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
00945     delete_mode = 9;
00946     return CR_LOOSE_SPACE;
00947   }
00948 
00949   if (box.bottom () >
00950   kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
00951     delete_mode = 10;
00952     return CR_LOOSE_SPACE;
00953   }
00954 
00955   if (box.height () > crunch_del_max_ht * kBlnXHeight) {
00956     delete_mode = 11;
00957     return CR_LOOSE_SPACE;
00958   }
00959 
00960   if (box.width () < crunch_del_min_width * kBlnXHeight) {
00961     delete_mode = 3;
00962     return CR_LOOSE_SPACE;
00963   }
00964 
00965   delete_mode = 0;
00966   return CR_NONE;
00967 }
00968 
00969 inT16 Tesseract::failure_count(WERD_RES *word) {
00970   const char *str = word->best_choice->unichar_string().string();
00971   int tess_rejs = 0;
00972 
00973   for (; *str != '\0'; str++) {
00974     if (*str == ' ')
00975       tess_rejs++;
00976   }
00977   return tess_rejs;
00978 }
00979 
00980 
00981 BOOL8 Tesseract::noise_outlines(TWERD *word) {
00982   TBOX box;                       // BB of outline
00983   inT16 outline_count = 0;
00984   inT16 small_outline_count = 0;
00985   inT16 max_dimension;
00986   float small_limit = kBlnXHeight * crunch_small_outlines_size;
00987 
00988   for (int b = 0; b < word->NumBlobs(); ++b) {
00989     TBLOB* blob = word->blobs[b];
00990     for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
00991       outline_count++;
00992       box = ol->bounding_box();
00993       if (box.height() > box.width())
00994         max_dimension = box.height();
00995       else
00996         max_dimension = box.width();
00997       if (max_dimension < small_limit)
00998         small_outline_count++;
00999     }
01000   }
01001   return small_outline_count >= outline_count;
01002 }
01003 
01004 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines