|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: ratngs.cpp (Formerly ratings.c) 00003 * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes. 00004 * Author: Ray Smith 00005 * Created: Thu Apr 23 13:23:29 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 00021 #ifdef HAVE_CONFIG_H 00022 #include "config_auto.h" 00023 #endif 00024 00025 #include "ratngs.h" 00026 00027 #include "blobs.h" 00028 #include "callcpp.h" 00029 #include "genericvector.h" 00030 #include "matrix.h" 00031 #include "normalis.h" // kBlnBaselineOffset. 00032 #include "unicharset.h" 00033 00034 using tesseract::ScriptPos; 00035 00036 ELISTIZE(BLOB_CHOICE); 00037 ELISTIZE(WERD_CHOICE); 00038 00039 const float WERD_CHOICE::kBadRating = 100000.0; 00040 // Min offset in baseline-normalized coords to make a character a subscript. 00041 const int kMinSubscriptOffset = 20; 00042 // Min offset in baseline-normalized coords to make a character a superscript. 00043 const int kMinSuperscriptOffset = 20; 00044 // Max y of bottom of a drop-cap blob. 00045 const int kMaxDropCapBottom = -128; 00046 // Max fraction of x-height to use as denominator in measuring x-height overlap. 00047 const double kMaxOverlapDenominator = 0.125; 00048 // Min fraction of x-height range that should be in agreement for matching 00049 // x-heights. 00050 const double kMinXHeightMatch = 0.5; 00051 // Max tolerance on baseline position as a fraction of x-height for matching 00052 // baselines. 00053 const double kMaxBaselineDrift = 0.0625; 00054 00055 static const char kPermuterTypeNoPerm[] = "None"; 00056 static const char kPermuterTypePuncPerm[] = "Punctuation"; 00057 static const char kPermuterTypeTopPerm[] = "Top Choice"; 00058 static const char kPermuterTypeLowerPerm[] = "Top Lower Case"; 00059 static const char kPermuterTypeUpperPerm[] = "Top Upper Case"; 00060 static const char kPermuterTypeNgramPerm[] = "Ngram"; 00061 static const char kPermuterTypeNumberPerm[] = "Number"; 00062 static const char kPermuterTypeUserPatPerm[] = "User Pattern"; 00063 static const char kPermuterTypeSysDawgPerm[] = "System Dictionary"; 00064 static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary"; 00065 static const char kPermuterTypeUserDawgPerm[] = "User Dictionary"; 00066 static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary"; 00067 static const char kPermuterTypeCompoundPerm[] = "Compound"; 00068 00069 static const char * const kPermuterTypeNames[] = { 00070 kPermuterTypeNoPerm, // 0 00071 kPermuterTypePuncPerm, // 1 00072 kPermuterTypeTopPerm, // 2 00073 kPermuterTypeLowerPerm, // 3 00074 kPermuterTypeUpperPerm, // 4 00075 kPermuterTypeNgramPerm, // 5 00076 kPermuterTypeNumberPerm, // 6 00077 kPermuterTypeUserPatPerm, // 7 00078 kPermuterTypeSysDawgPerm, // 8 00079 kPermuterTypeDocDawgPerm, // 9 00080 kPermuterTypeUserDawgPerm, // 10 00081 kPermuterTypeFreqDawgPerm, // 11 00082 kPermuterTypeCompoundPerm // 12 00083 }; 00084 00090 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id 00091 float src_rating, // rating 00092 float src_cert, // certainty 00093 int src_script_id, // script 00094 float min_xheight, // min xheight allowed 00095 float max_xheight, // max xheight by this char 00096 float yshift, // yshift out of position 00097 BlobChoiceClassifier c) { // adapted match or other 00098 unichar_id_ = src_unichar_id; 00099 rating_ = src_rating; 00100 certainty_ = src_cert; 00101 fontinfo_id_ = -1; 00102 fontinfo_id2_ = -1; 00103 script_id_ = src_script_id; 00104 min_xheight_ = min_xheight; 00105 max_xheight_ = max_xheight; 00106 yshift_ = yshift; 00107 classifier_ = c; 00108 } 00109 00115 BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST_LINK(other) { 00116 unichar_id_ = other.unichar_id(); 00117 rating_ = other.rating(); 00118 certainty_ = other.certainty(); 00119 fontinfo_id_ = other.fontinfo_id(); 00120 fontinfo_id2_ = other.fontinfo_id2(); 00121 script_id_ = other.script_id(); 00122 matrix_cell_ = other.matrix_cell_; 00123 min_xheight_ = other.min_xheight_; 00124 max_xheight_ = other.max_xheight_; 00125 yshift_ = other.yshift(); 00126 classifier_ = other.classifier_; 00127 fonts_ = other.fonts_; 00128 } 00129 00130 // Returns true if *this and other agree on the baseline and x-height 00131 // to within some tolerance based on a given estimate of the x-height. 00132 bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height, 00133 bool debug) const { 00134 double baseline_diff = fabs(yshift() - other.yshift()); 00135 if (baseline_diff > kMaxBaselineDrift * x_height) { 00136 if (debug) { 00137 tprintf("Baseline diff %g for %d v %d\n", 00138 baseline_diff, unichar_id_, other.unichar_id_); 00139 } 00140 return false; 00141 } 00142 double this_range = max_xheight() - min_xheight(); 00143 double other_range = other.max_xheight() - other.min_xheight(); 00144 double denominator = ClipToRange(MIN(this_range, other_range), 00145 1.0, kMaxOverlapDenominator * x_height); 00146 double overlap = MIN(max_xheight(), other.max_xheight()) - 00147 MAX(min_xheight(), other.min_xheight()); 00148 overlap /= denominator; 00149 if (debug) { 00150 tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n", 00151 unichar_id_, other.unichar_id_, baseline_diff, 00152 this_range, other_range, denominator, overlap); 00153 } 00154 00155 return overlap >= kMinXHeightMatch; 00156 } 00157 00158 // Helper to find the BLOB_CHOICE in the bc_list that matches the given 00159 // unichar_id, or NULL if there is no match. 00160 BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id, 00161 BLOB_CHOICE_LIST* bc_list) { 00162 // Find the corresponding best BLOB_CHOICE. 00163 BLOB_CHOICE_IT choice_it(bc_list); 00164 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 00165 choice_it.forward()) { 00166 BLOB_CHOICE* choice = choice_it.data(); 00167 if (choice->unichar_id() == char_id) { 00168 return choice; 00169 } 00170 } 00171 return NULL; 00172 } 00173 00174 const char *WERD_CHOICE::permuter_name(uinT8 permuter) { 00175 return kPermuterTypeNames[permuter]; 00176 } 00177 00178 namespace tesseract { 00179 00180 const char *ScriptPosToString(enum ScriptPos script_pos) { 00181 switch (script_pos) { 00182 case SP_NORMAL: return "NORM"; 00183 case SP_SUBSCRIPT: return "SUB"; 00184 case SP_SUPERSCRIPT: return "SUPER"; 00185 case SP_DROPCAP: return "DROPC"; 00186 } 00187 return "SP_UNKNOWN"; 00188 } 00189 00190 } // namespace tesseract. 00191 00198 WERD_CHOICE::WERD_CHOICE(const char *src_string, 00199 const UNICHARSET &unicharset) 00200 : unicharset_(&unicharset){ 00201 GenericVector<UNICHAR_ID> encoding; 00202 GenericVector<char> lengths; 00203 if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) { 00204 lengths.push_back('\0'); 00205 STRING src_lengths = &lengths[0]; 00206 this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM); 00207 } else { // There must have been an invalid unichar in the string. 00208 this->init(8); 00209 this->make_bad(); 00210 } 00211 } 00212 00223 void WERD_CHOICE::init(const char *src_string, 00224 const char *src_lengths, 00225 float src_rating, 00226 float src_certainty, 00227 uinT8 src_permuter) { 00228 int src_string_len = strlen(src_string); 00229 if (src_string_len == 0) { 00230 this->init(8); 00231 } else { 00232 this->init(src_lengths ? strlen(src_lengths): src_string_len); 00233 length_ = reserved_; 00234 int offset = 0; 00235 for (int i = 0; i < length_; ++i) { 00236 int unichar_length = src_lengths ? src_lengths[i] : 1; 00237 unichar_ids_[i] = 00238 unicharset_->unichar_to_id(src_string+offset, unichar_length); 00239 state_[i] = 1; 00240 certainties_[i] = src_certainty; 00241 offset += unichar_length; 00242 } 00243 } 00244 adjust_factor_ = 1.0f; 00245 rating_ = src_rating; 00246 certainty_ = src_certainty; 00247 permuter_ = src_permuter; 00248 dangerous_ambig_found_ = false; 00249 } 00250 00254 WERD_CHOICE::~WERD_CHOICE() { 00255 delete[] unichar_ids_; 00256 delete[] script_pos_; 00257 delete[] state_; 00258 delete[] certainties_; 00259 } 00260 00261 const char *WERD_CHOICE::permuter_name() const { 00262 return kPermuterTypeNames[permuter_]; 00263 } 00264 00265 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word, 00266 // taken from the appropriate cell in the ratings MATRIX. 00267 // Borrowed pointer, so do not delete. 00268 BLOB_CHOICE_LIST* WERD_CHOICE::blob_choices(int index, MATRIX* ratings) const { 00269 MATRIX_COORD coord = MatrixCoord(index); 00270 BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row); 00271 if (result == NULL) { 00272 result = new BLOB_CHOICE_LIST; 00273 ratings->put(coord.col, coord.row, result); 00274 } 00275 return result; 00276 } 00277 00278 // Returns the MATRIX_COORD corresponding to the location in the ratings 00279 // MATRIX for the given index into the word. 00280 MATRIX_COORD WERD_CHOICE::MatrixCoord(int index) const { 00281 int col = 0; 00282 for (int i = 0; i < index; ++i) 00283 col += state_[i]; 00284 int row = col + state_[index] - 1; 00285 return MATRIX_COORD(col, row); 00286 } 00287 00288 // Sets the entries for the given index from the BLOB_CHOICE, assuming 00289 // unit fragment lengths, but setting the state for this index to blob_count. 00290 void WERD_CHOICE::set_blob_choice(int index, int blob_count, 00291 const BLOB_CHOICE* blob_choice) { 00292 unichar_ids_[index] = blob_choice->unichar_id(); 00293 script_pos_[index] = tesseract::SP_NORMAL; 00294 state_[index] = blob_count; 00295 certainties_[index] = blob_choice->certainty(); 00296 } 00297 00298 00304 bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const { 00305 for (int i = 0; i < length_; ++i) { 00306 if (unichar_ids_[i] == unichar_id) { 00307 return true; 00308 } 00309 } 00310 return false; 00311 } 00312 00320 void WERD_CHOICE::remove_unichar_ids(int start, int num) { 00321 ASSERT_HOST(start >= 0 && start + num <= length_); 00322 // Accumulate the states to account for the merged blobs. 00323 for (int i = 0; i < num; ++i) { 00324 if (start > 0) 00325 state_[start - 1] += state_[start + i]; 00326 else if (start + num < length_) 00327 state_[start + num] += state_[start + i]; 00328 } 00329 for (int i = start; i + num < length_; ++i) { 00330 unichar_ids_[i] = unichar_ids_[i + num]; 00331 script_pos_[i] = script_pos_[i + num]; 00332 state_[i] = state_[i + num]; 00333 certainties_[i] = certainties_[i + num]; 00334 } 00335 length_ -= num; 00336 } 00337 00343 void WERD_CHOICE::reverse_and_mirror_unichar_ids() { 00344 for (int i = 0; i < length_ / 2; ++i) { 00345 UNICHAR_ID tmp_id = unichar_ids_[i]; 00346 unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]); 00347 unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id); 00348 } 00349 if (length_ % 2 != 0) { 00350 unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]); 00351 } 00352 } 00353 00361 void WERD_CHOICE::punct_stripped(int *start, int *end) const { 00362 *start = 0; 00363 *end = length() - 1; 00364 while (*start < length() && 00365 unicharset()->get_ispunctuation(unichar_id(*start))) { 00366 (*start)++; 00367 } 00368 while (*end > -1 && 00369 unicharset()->get_ispunctuation(unichar_id(*end))) { 00370 (*end)--; 00371 } 00372 (*end)++; 00373 } 00374 00375 void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const { 00376 int end = length(); 00377 while (end > 0 && 00378 unicharset_->get_isdigit(unichar_ids_[end - 1]) && 00379 BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) { 00380 end--; 00381 } 00382 int start = 0; 00383 while (start < end && 00384 unicharset_->get_isdigit(unichar_ids_[start]) && 00385 BlobPosition(start) == tesseract::SP_SUPERSCRIPT) { 00386 start++; 00387 } 00388 *pstart = start; 00389 *pend = end; 00390 } 00391 00392 WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const { 00393 ASSERT_HOST(start >= 0 && start <= length_); 00394 ASSERT_HOST(end >= 0 && end <= length_); 00395 if (end < start) { end = start; } 00396 WERD_CHOICE retval(unicharset_, end - start); 00397 for (int i = start; i < end; i++) { 00398 retval.append_unichar_id_space_allocated( 00399 unichar_ids_[i], state_[i], 0.0f, certainties_[i]); 00400 } 00401 return retval; 00402 } 00403 00409 bool WERD_CHOICE::has_rtl_unichar_id() const { 00410 int i; 00411 for (i = 0; i < length_; ++i) { 00412 UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]); 00413 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || 00414 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) { 00415 return true; 00416 } 00417 } 00418 return false; 00419 } 00420 00427 void WERD_CHOICE::string_and_lengths(STRING *word_str, 00428 STRING *word_lengths_str) const { 00429 *word_str = ""; 00430 if (word_lengths_str != NULL) *word_lengths_str = ""; 00431 for (int i = 0; i < length_; ++i) { 00432 const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]); 00433 *word_str += ch; 00434 if (word_lengths_str != NULL) { 00435 *word_lengths_str += strlen(ch); 00436 } 00437 } 00438 } 00439 00446 void WERD_CHOICE::append_unichar_id( 00447 UNICHAR_ID unichar_id, int blob_count, 00448 float rating, float certainty) { 00449 if (length_ == reserved_) { 00450 this->double_the_size(); 00451 } 00452 this->append_unichar_id_space_allocated(unichar_id, blob_count, 00453 rating, certainty); 00454 } 00455 00463 WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) { 00464 ASSERT_HOST(unicharset_ == second.unicharset_); 00465 while (reserved_ < length_ + second.length()) { 00466 this->double_the_size(); 00467 } 00468 const UNICHAR_ID *other_unichar_ids = second.unichar_ids(); 00469 for (int i = 0; i < second.length(); ++i) { 00470 unichar_ids_[length_ + i] = other_unichar_ids[i]; 00471 state_[length_ + i] = second.state_[i]; 00472 certainties_[length_ + i] = second.certainties_[i]; 00473 script_pos_[length_ + i] = second.BlobPosition(i); 00474 } 00475 length_ += second.length(); 00476 if (second.adjust_factor_ > adjust_factor_) 00477 adjust_factor_ = second.adjust_factor_; 00478 rating_ += second.rating(); // add ratings 00479 if (second.certainty() < certainty_) // take min 00480 certainty_ = second.certainty(); 00481 if (second.dangerous_ambig_found_) 00482 dangerous_ambig_found_ = true; 00483 if (permuter_ == NO_PERM) { 00484 permuter_ = second.permuter(); 00485 } else if (second.permuter() != NO_PERM && 00486 second.permuter() != permuter_) { 00487 permuter_ = COMPOUND_PERM; 00488 } 00489 return *this; 00490 } 00491 00492 00499 WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) { 00500 while (reserved_ < source.length()) { 00501 this->double_the_size(); 00502 } 00503 00504 unicharset_ = source.unicharset_; 00505 const UNICHAR_ID *other_unichar_ids = source.unichar_ids(); 00506 for (int i = 0; i < source.length(); ++i) { 00507 unichar_ids_[i] = other_unichar_ids[i]; 00508 state_[i] = source.state_[i]; 00509 certainties_[i] = source.certainties_[i]; 00510 script_pos_[i] = source.BlobPosition(i); 00511 } 00512 length_ = source.length(); 00513 adjust_factor_ = source.adjust_factor_; 00514 rating_ = source.rating(); 00515 certainty_ = source.certainty(); 00516 min_x_height_ = source.min_x_height(); 00517 max_x_height_ = source.max_x_height(); 00518 permuter_ = source.permuter(); 00519 dangerous_ambig_found_ = source.dangerous_ambig_found_; 00520 return *this; 00521 } 00522 00523 // Sets up the script_pos_ member using the blobs_list to get the bln 00524 // bounding boxes, *this to get the unichars, and this->unicharset 00525 // to get the target positions. If small_caps is true, sub/super are not 00526 // considered, but dropcaps are. 00527 // NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.) 00528 void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word) { 00529 // Since WERD_CHOICE isn't supposed to depend on a Tesseract, 00530 // we don't have easy access to the flags Tesseract stores. Therefore, debug 00531 // for this module is hard compiled in. 00532 int debug = 0; 00533 00534 // Initialize to normal. 00535 for (int i = 0; i < length_; ++i) 00536 script_pos_[i] = tesseract::SP_NORMAL; 00537 if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) { 00538 return; 00539 } 00540 00541 int position_counts[4]; 00542 for (int i = 0; i < 4; i++) { 00543 position_counts[i] = 0; 00544 } 00545 00546 int chunk_index = 0; 00547 for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) { 00548 TBLOB* tblob = word->blobs[chunk_index]; 00549 int uni_id = unichar_id(blob_index); 00550 TBOX blob_box = tblob->bounding_box(); 00551 if (state_ != NULL) { 00552 for (int i = 1; i < state_[blob_index]; ++i) { 00553 ++chunk_index; 00554 tblob = word->blobs[chunk_index]; 00555 blob_box += tblob->bounding_box(); 00556 } 00557 } 00558 script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, 00559 uni_id); 00560 if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) { 00561 script_pos_[blob_index] = tesseract::SP_NORMAL; 00562 } 00563 position_counts[script_pos_[blob_index]]++; 00564 } 00565 // If almost everything looks like a superscript or subscript, 00566 // we most likely just got the baseline wrong. 00567 if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ || 00568 position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) { 00569 if (debug >= 2) { 00570 tprintf("Most characters of %s are subscript or superscript.\n" 00571 "That seems wrong, so I'll assume we got the baseline wrong\n", 00572 unichar_string().string()); 00573 } 00574 for (int i = 0; i < length_; i++) { 00575 ScriptPos sp = script_pos_[i]; 00576 if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) { 00577 position_counts[sp]--; 00578 position_counts[tesseract::SP_NORMAL]++; 00579 script_pos_[i] = tesseract::SP_NORMAL; 00580 } 00581 } 00582 } 00583 00584 if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || 00585 debug >= 2) { 00586 tprintf("SetScriptPosition on %s\n", unichar_string().string()); 00587 int chunk_index = 0; 00588 for (int blob_index = 0; blob_index < length_; ++blob_index) { 00589 if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) { 00590 TBLOB* tblob = word->blobs[chunk_index]; 00591 ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), 00592 unichar_id(blob_index)); 00593 } 00594 chunk_index += state_ != NULL ? state_[blob_index] : 1; 00595 } 00596 } 00597 } 00598 // Sets the script_pos_ member from some source positions with a given length. 00599 void WERD_CHOICE::SetScriptPositions(const tesseract::ScriptPos* positions, 00600 int length) { 00601 ASSERT_HOST(length == length_); 00602 if (positions != script_pos_) { 00603 delete [] script_pos_; 00604 script_pos_ = new ScriptPos[length]; 00605 memcpy(script_pos_, positions, sizeof(positions[0]) * length); 00606 } 00607 } 00608 // Sets all the script_pos_ positions to the given position. 00609 void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) { 00610 for (int i = 0; i < length_; ++i) 00611 script_pos_[i] = position; 00612 } 00613 00614 /* static */ 00615 ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug, 00616 const UNICHARSET& unicharset, 00617 const TBOX& blob_box, 00618 UNICHAR_ID unichar_id) { 00619 ScriptPos retval = tesseract::SP_NORMAL; 00620 int top = blob_box.top(); 00621 int bottom = blob_box.bottom(); 00622 int min_bottom, max_bottom, min_top, max_top; 00623 unicharset.get_top_bottom(unichar_id, 00624 &min_bottom, &max_bottom, 00625 &min_top, &max_top); 00626 00627 int sub_thresh_top = min_top - kMinSubscriptOffset; 00628 int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset; 00629 int sup_thresh_bot = max_bottom + kMinSuperscriptOffset; 00630 if (bottom <= kMaxDropCapBottom) { 00631 retval = tesseract::SP_DROPCAP; 00632 } else if (top < sub_thresh_top && bottom < sub_thresh_bot) { 00633 retval = tesseract::SP_SUBSCRIPT; 00634 } else if (bottom > sup_thresh_bot) { 00635 retval = tesseract::SP_SUPERSCRIPT; 00636 } 00637 00638 if (print_debug) { 00639 const char *pos = ScriptPosToString(retval); 00640 tprintf("%s Character %s[bot:%d top: %d] " 00641 "bot_range[%d,%d] top_range[%d, %d] " 00642 "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n", 00643 pos, unicharset.id_to_unichar(unichar_id), 00644 bottom, top, 00645 min_bottom, max_bottom, min_top, max_top, 00646 sub_thresh_bot, sub_thresh_top, 00647 sup_thresh_bot); 00648 } 00649 return retval; 00650 } 00651 00652 // Returns the script-id (eg Han) of the dominant script in the word. 00653 int WERD_CHOICE::GetTopScriptID() const { 00654 int max_script = unicharset_->get_script_table_size(); 00655 int *sid = new int[max_script]; 00656 int x; 00657 for (x = 0; x < max_script; x++) sid[x] = 0; 00658 for (x = 0; x < length_; ++x) { 00659 int script_id = unicharset_->get_script(unichar_id(x)); 00660 sid[script_id]++; 00661 } 00662 if (unicharset_->han_sid() != unicharset_->null_sid()) { 00663 // Add the Hiragana & Katakana counts to Han and zero them out. 00664 if (unicharset_->hiragana_sid() != unicharset_->null_sid()) { 00665 sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()]; 00666 sid[unicharset_->hiragana_sid()] = 0; 00667 } 00668 if (unicharset_->katakana_sid() != unicharset_->null_sid()) { 00669 sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()]; 00670 sid[unicharset_->katakana_sid()] = 0; 00671 } 00672 } 00673 // Note that high script ID overrides lower one on a tie, thus biasing 00674 // towards non-Common script (if sorted that way in unicharset file). 00675 int max_sid = 0; 00676 for (x = 1; x < max_script; x++) 00677 if (sid[x] >= sid[max_sid]) max_sid = x; 00678 if (sid[max_sid] < length_ / 2) 00679 max_sid = unicharset_->null_sid(); 00680 delete[] sid; 00681 return max_sid; 00682 } 00683 00684 // Fixes the state_ for a chop at the given blob_posiiton. 00685 void WERD_CHOICE::UpdateStateForSplit(int blob_position) { 00686 int total_chunks = 0; 00687 for (int i = 0; i < length_; ++i) { 00688 total_chunks += state_[i]; 00689 if (total_chunks > blob_position) { 00690 ++state_[i]; 00691 return; 00692 } 00693 } 00694 } 00695 00696 // Returns the sum of all the state elements, being the total number of blobs. 00697 int WERD_CHOICE::TotalOfStates() const { 00698 int total_chunks = 0; 00699 for (int i = 0; i < length_; ++i) { 00700 total_chunks += state_[i]; 00701 } 00702 return total_chunks; 00703 } 00704 00710 void WERD_CHOICE::print(const char *msg) const { 00711 tprintf("%s : ", msg); 00712 for (int i = 0; i < length_; ++i) { 00713 tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i])); 00714 } 00715 tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", 00716 rating_, certainty_, adjust_factor_, permuter_, 00717 min_x_height_, max_x_height_, dangerous_ambig_found_); 00718 tprintf("pos"); 00719 for (int i = 0; i < length_; ++i) { 00720 tprintf("\t%s", ScriptPosToString(script_pos_[i])); 00721 } 00722 tprintf("\nstr"); 00723 for (int i = 0; i < length_; ++i) { 00724 tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i])); 00725 } 00726 tprintf("\nstate:"); 00727 for (int i = 0; i < length_; ++i) { 00728 tprintf("\t%d ", state_[i]); 00729 } 00730 tprintf("\nC"); 00731 for (int i = 0; i < length_; ++i) { 00732 tprintf("\t%.3f", certainties_[i]); 00733 } 00734 tprintf("\n"); 00735 } 00736 00737 // Prints the segmentation state with an introductory message. 00738 void WERD_CHOICE::print_state(const char *msg) const { 00739 tprintf("%s", msg); 00740 for (int i = 0; i < length_; ++i) 00741 tprintf(" %d", state_[i]); 00742 tprintf("\n"); 00743 } 00744 00745 // Displays the segmentation state of *this (if not the same as the last 00746 // one displayed) and waits for a click in the window. 00747 void WERD_CHOICE::DisplaySegmentation(TWERD* word) { 00748 #ifndef GRAPHICS_DISABLED 00749 // Number of different colors to draw with. 00750 const int kNumColors = 6; 00751 static ScrollView *segm_window = NULL; 00752 // Check the state against the static prev_drawn_state. 00753 static GenericVector<int> prev_drawn_state; 00754 bool already_done = prev_drawn_state.size() == length_; 00755 if (!already_done) prev_drawn_state.init_to_size(length_, 0); 00756 for (int i = 0; i < length_; ++i) { 00757 if (prev_drawn_state[i] != state_[i]) { 00758 already_done = false; 00759 } 00760 prev_drawn_state[i] = state_[i]; 00761 } 00762 if (already_done || word->blobs.empty()) return; 00763 00764 // Create the window if needed. 00765 if (segm_window == NULL) { 00766 segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 00767 2000.0, 256.0, true); 00768 } else { 00769 segm_window->Clear(); 00770 } 00771 00772 TBOX bbox; 00773 int blob_index = 0; 00774 for (int c = 0; c < length_; ++c) { 00775 ScrollView::Color color = 00776 static_cast<ScrollView::Color>(c % kNumColors + 3); 00777 for (int i = 0; i < state_[c]; ++i, ++blob_index) { 00778 TBLOB* blob = word->blobs[blob_index]; 00779 bbox += blob->bounding_box(); 00780 blob->plot(segm_window, color, color); 00781 } 00782 } 00783 segm_window->ZoomToRectangle(bbox.left(), bbox.top(), 00784 bbox.right(), bbox.bottom()); 00785 segm_window->Update(); 00786 window_wait(segm_window); 00787 #endif 00788 } 00789 00790 00791 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, 00792 const WERD_CHOICE &word2) { 00793 const UNICHARSET *uchset = word1.unicharset(); 00794 if (word2.unicharset() != uchset) return false; 00795 int w1start, w1end; 00796 word1.punct_stripped(&w1start, &w1end); 00797 int w2start, w2end; 00798 word2.punct_stripped(&w2start, &w2end); 00799 if (w1end - w1start != w2end - w2start) return false; 00800 for (int i = 0; i < w1end - w1start; i++) { 00801 if (uchset->to_lower(word1.unichar_id(w1start + i)) != 00802 uchset->to_lower(word2.unichar_id(w2start + i))) { 00803 return false; 00804 } 00805 } 00806 return true; 00807 } 00808 00819 void print_ratings_list(const char *msg, 00820 BLOB_CHOICE_LIST *ratings, 00821 const UNICHARSET ¤t_unicharset) { 00822 if (ratings->length() == 0) { 00823 tprintf("%s:<none>\n", msg); 00824 return; 00825 } 00826 if (*msg != '\0') { 00827 tprintf("%s\n", msg); 00828 } 00829 BLOB_CHOICE_IT c_it; 00830 c_it.set_to_list(ratings); 00831 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { 00832 c_it.data()->print(¤t_unicharset); 00833 if (!c_it.at_last()) tprintf("\n"); 00834 } 00835 tprintf("\n"); 00836 fflush(stdout); 00837 }