// Gain applied to the difference between the two adjust factors passed to
// StopperAmbigThreshold().
static const double kStopperAmbiguityThresholdGain = 8.0;
// Constant subtracted from the scaled adjust-factor difference in
// StopperAmbigThreshold(). Its negation is also used elsewhere in this file
// as an upper bound on the permitted certainty delta.
static const double kStopperAmbiguityThresholdOffset = 1.5;

// Computes the certainty-difference threshold for a pair of adjust factors:
//   (f2 - f1) * kStopperAmbiguityThresholdGain -
//       kStopperAmbiguityThresholdOffset
// f1: adjust factor of the reference choice (callers in this file pass
//     best_choice->adjust_factor()).
// f2: adjust factor of the choice being compared against it.
// Returns the threshold; callers compare a difference of certainties against
// it. For f1 == f2 the result is exactly -kStopperAmbiguityThresholdOffset.
static double StopperAmbigThreshold(double f1, double f2) {
  return (f2 - f1) * kStopperAmbiguityThresholdGain -
         kStopperAmbiguityThresholdOffset;
}
76 bool merge_similar_words,
77 BLOCK_LIST *the_block_list,
80 BLOCK_IT block_it(the_block_list);
82 for (block_it.mark_cycle_pt();
83 !block_it.cycled_list(); block_it.forward()) {
84 block_res_it.add_to_end(
new BLOCK_RES(merge_similar_words,
97 ROW_IT row_it (the_block->
row_list ());
98 ROW_RES_IT row_res_it(&row_res_list);
104 font_assigned =
false;
111 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
112 row_res_it.add_to_end(
new ROW_RES(merge_similar_words, row_it.data()));
124 WERD_RES_IT word_res_it(&word_res_list);
130 whole_word_rej_count = 0;
133 bool add_next_word =
false;
137 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
138 auto* word_res =
new WERD_RES(word_it.data());
143 word_res->part_of_combo =
true;
145 }
else if (merge_similar_words) {
146 union_box = word_res->word->bounding_box();
147 add_next_word = !word_res->word->flag(
W_REP_CHAR) &&
149 word_res->odd_size = !add_next_word;
151 WERD* next_word = word_it.data_relative(1);
152 if (merge_similar_words) {
160 int prev_right = union_box.
right();
161 union_box += next_box;
165 add_next_word =
false;
173 if (combo ==
nullptr) {
174 copy_word =
new WERD;
175 *copy_word = *(word_it.data());
179 word_res_it.add_to_end(combo);
181 word_res->part_of_combo =
true;
185 word_res_it.add_to_end(word_res);
195 *word = *(source.
word);
218 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.
best_choices));
219 WERD_CHOICE_IT wc_dest_it(&best_choices);
220 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
222 wc_dest_it.add_after_then_move(
new WERD_CHOICE(*choice));
224 if (!wc_dest_it.empty()) {
225 wc_dest_it.move_to_first();
226 best_choice = wc_dest_it.data();
228 best_choice =
nullptr;
234 raw_choice =
nullptr;
244 CopySimpleFields(source);
283 CopySimpleFields(source);
309 const TBOX* norm_box,
312 bool allow_detailed_fx,
314 auto norm_mode_hint =
319 word->cblob_list()->empty()) ||
320 (pb !=
nullptr && !pb->
IsText())) {
323 SetupFake(unicharset_in);
328 SetupWordScript(unicharset_in);
330 float word_xheight = use_body_size && row !=
nullptr && row->
body_size() > 0.0f
332 chopped_word->BLNormalize(block, row, pix, word->flag(
W_INVERSE),
333 word_xheight, baseline_shift, numeric_mode,
334 norm_mode_hint, norm_box, &denorm);
336 SetupBasicsFromChoppedWord(unicharset_in);
338 int num_blobs = chopped_word->NumBlobs();
350 SetupBlobWidthsAndGaps();
358 SetupWordScript(unicharset_in);
359 chopped_word =
new TWERD;
360 rebuild_word =
new TWERD;
363 int blob_count = word->cblob_list()->
length();
364 if (blob_count > 0) {
365 auto** fake_choices =
new BLOB_CHOICE*[blob_count];
368 C_BLOB_IT b_it(word->cblob_list());
370 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
371 TBOX box = b_it.data()->bounding_box();
372 box_word->InsertBox(box_word->length(), box);
375 FakeClassifyWord(blob_count, fake_choices);
376 delete [] fake_choices;
380 LogNewRawChoice(word);
382 LogNewCookedChoice(1,
false, word);
391 word->set_script_id(script);
398 if (blamer_bundle !=
nullptr) {
399 blamer_bundle->SetupNormTruthWord(denorm);
405 blob_widths.truncate(0);
406 blob_gaps.truncate(0);
407 int num_blobs = chopped_word->NumBlobs();
408 for (
int b = 0; b < num_blobs; ++b) {
409 TBLOB *blob = chopped_word->blobs[b];
411 blob_widths.push_back(box.
width());
412 if (b + 1 < num_blobs) {
414 chopped_word->blobs[b + 1]->bounding_box().left() - box.
right());
425 seam_array.insert(seam, blob_number);
426 if (ratings !=
nullptr) {
428 ratings = ratings->ConsumeAndMakeBigger(blob_number);
430 if (raw_choice !=
nullptr)
431 raw_choice->UpdateStateForSplit(blob_number);
432 WERD_CHOICE_IT wc_it(&best_choices);
433 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
437 SetupBlobWidthsAndGaps();
445 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
446 for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
457 return !best_choices.singleton() || best_choice->dangerous_ambig_found();
463 int ratings_dim = ratings->dimension();
464 if (raw_choice->TotalOfStates() != ratings_dim) {
465 tprintf(
"raw_choice has total of states = %d vs ratings dim of %d\n",
466 raw_choice->TotalOfStates(), ratings_dim);
469 WERD_CHOICE_IT it(&best_choices);
471 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
474 tprintf(
"Cooked #%d has total of states = %d vs ratings dim of %d\n",
486 (word_to_debug !=
nullptr && *word_to_debug !=
'\0' && best_choice !=
nullptr &&
487 best_choice->unichar_string() ==
STRING(word_to_debug))) {
488 if (raw_choice !=
nullptr)
489 raw_choice->print(
"\nBest Raw Choice");
491 WERD_CHOICE_IT it(&best_choices);
493 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
504 tprintf(
"Best choice: accepted=%d, adaptable=%d, done=%d : ",
505 tess_accepted, tess_would_adapt, done);
506 if (best_choice ==
nullptr)
509 best_choice->print(msg);
518 if (best_choice ==
nullptr || best_choices.singleton())
521 if (debug_level >= 2)
522 best_choice->print(
"\nFiltering against best choice");
523 WERD_CHOICE_IT it(&best_choices);
525 for (it.forward(); !it.at_first(); it.forward(), ++index) {
527 float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
534 int i = 0, j = 0, chunk = 0;
539 int choice_chunk = choice->
state(0), best_chunk = best_choice->state(0);
540 while (i < choice->length() && j < best_choice->length()) {
541 if (choice->
unichar_id(i) != best_choice->unichar_id(j) &&
542 choice->
certainty(i) - best_choice->certainty(j) < threshold) {
543 if (debug_level >= 2) {
544 choice->
print(
"WorstCertaintyDiffWorseThan");
546 "i %d j %d Choice->Blob[i].Certainty %.4g" 547 " WorstOtherChoiceCertainty %g Threshold %g\n",
548 i, j, choice->
certainty(i), best_choice->certainty(j), threshold);
549 tprintf(
"Discarding bad choice #%d\n", index);
556 while (choice_chunk < chunk && ++i < choice->length())
557 choice_chunk += choice->
state(i);
559 while (best_chunk < chunk && ++j < best_choice->length())
560 best_chunk += best_choice->state(j);
571 int end_chunk = best_choice->state(0);
572 int end_raw_chunk = raw_choice->state(0);
574 for (
int i = 0; i < best_choice->length(); i++, thresholds++) {
575 float avg_rating = 0.0f;
576 int num_error_chunks = 0;
579 while (chunk < end_chunk) {
580 if (chunk >= end_raw_chunk) {
582 end_raw_chunk += raw_choice->state(raw_blob);
584 if (best_choice->unichar_id(i) !=
585 raw_choice->unichar_id(raw_blob)) {
586 avg_rating += raw_choice->certainty(raw_blob);
592 if (num_error_chunks > 0) {
593 avg_rating /= num_error_chunks;
594 *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
596 *thresholds = max_rating;
599 if (*thresholds > max_rating)
600 *thresholds = max_rating;
601 if (*thresholds < min_rating)
602 *thresholds = min_rating;
609 if (raw_choice ==
nullptr || word_choice->
rating() < raw_choice->rating()) {
626 if (best_choice !=
nullptr) {
632 float max_certainty_delta =
633 StopperAmbigThreshold(best_choice->adjust_factor(),
635 if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
636 max_certainty_delta = -kStopperAmbiguityThresholdOffset;
637 if (word_choice->
certainty() - best_choice->certainty() <
638 max_certainty_delta) {
642 tprintf(
"Discarding choice \"%s\" with an overly low certainty" 643 " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
645 best_choice->certainty(),
646 max_certainty_delta + best_choice->certainty());
655 WERD_CHOICE_IT it(&best_choices);
657 bool inserted =
false;
662 if (choice->
rating() > word_choice->
rating() && !inserted) {
664 it.add_before_stay_put(word_choice);
666 if (num_choices == 0)
667 best_choice = word_choice;
677 tprintf(
"Discarding duplicate choice \"%s\", rating %g vs %g\n",
685 if (num_choices > max_num_choices)
689 }
while (!it.at_first());
691 if (!inserted && num_choices < max_num_choices) {
692 it.add_to_end(word_choice);
694 if (num_choices == 0)
695 best_choice = word_choice;
699 tprintf(
"New %s", best_choice == word_choice ?
"Best" :
"Secondary");
702 word_choice->
print(
" Word Choice");
714 template<
class T>
static void MovePointerData(T**
dest, T**src) {
723 WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
724 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
725 if (!it.at_first()) alternates_str +=
"\", \"";
726 alternates_str += it.data()->unichar_string();
728 tprintf(
"Alternates for \"%s\": {\"%s\"}\n",
729 best_choice->unichar_string().string(), alternates_str.
string());
736 for (
int b = start_blob; b <= last_blob; ++b) {
737 result += blob_widths[b];
739 result += blob_gaps[b];
745 if (blob_index < 0 || blob_index >= blob_gaps.size())
747 return blob_gaps[blob_index];
755 if (index < 0 || index >= best_choice->length())
return nullptr;
756 BLOB_CHOICE_LIST* choices = GetBlobChoices(index);
764 return best_choice->blob_choices(index, ratings);
774 MovePointerData(&box_word, &word->
box_word);
775 seam_array.delete_data_pointers();
782 if (ratings !=
nullptr) ratings->delete_matrix_pointers();
783 MovePointerData(&ratings, &word->
ratings);
785 MovePointerData(&raw_choice, &word->
raw_choice);
786 best_choices.clear();
787 WERD_CHOICE_IT wc_it(&best_choices);
791 assert(blamer_bundle !=
nullptr);
794 CopySimpleFields(*word);
800 best_choice = choice;
805 reject_map.initialise(best_state.length());
806 done = tess_accepted = tess_would_adapt =
true;
807 SetScriptPositions();
815 rebuild_word =
new TWERD;
816 if (seam_array.empty())
818 best_state.truncate(0);
820 for (
int i = 0; i < best_choice->length(); ++i) {
821 int length = best_choice->state(i);
822 best_state.push_back(length);
827 TBLOB* blob = chopped_word->blobs[start];
828 rebuild_word->blobs.push_back(
new TBLOB(*blob));
841 rebuild_word =
new TWERD(*chopped_word);
843 int word_len = box_word->length();
844 best_state.reserve(word_len);
845 correct_text.reserve(word_len);
846 for (
int i = 0; i < word_len; ++i) {
847 best_state.push_back(1);
848 correct_text.push_back(
STRING(
""));
855 rebuild_word->ComputeBoundingBoxes();
857 box_word->ClipToOriginalWord(denorm.block(), word);
863 best_choice->SetScriptPositions(small_caps, chopped_word);
870 raw_choice->SetAllScriptPositions(position);
871 WERD_CHOICE_IT wc_it(&best_choices);
872 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
873 wc_it.data()->SetAllScriptPositions(position);
887 ratings =
new MATRIX(blob_count, 1);
888 for (
int c = 0; c < blob_count; ++c) {
889 auto* choice_list =
new BLOB_CHOICE_LIST;
890 BLOB_CHOICE_IT choice_it(choice_list);
891 choice_it.add_after_then_move(choices[c]);
892 ratings->put(c, c, choice_list);
895 reject_map.initialise(blob_count);
896 best_state.init_to_size(blob_count, 1);
903 int num_blobs = ratings->dimension();
904 auto* word_choice =
new WERD_CHOICE(uch_set, num_blobs);
905 word_choice->set_permuter(permuter);
906 for (
int b = 0; b < num_blobs; ++b) {
908 float rating = INT32_MAX;
909 float certainty = -INT32_MAX;
910 BLOB_CHOICE_LIST* choices = ratings->get(b, b);
911 if (choices !=
nullptr && !choices->empty()) {
912 BLOB_CHOICE_IT bc_it(choices);
915 rating = choice->
rating();
918 word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
921 LogNewRawChoice(word_choice);
923 LogNewCookedChoice(1,
false, word_choice);
928 correct_text.clear();
930 for (
int i = 0; i < best_choice->length(); ++i) {
931 UNICHAR_ID choice_id = best_choice->unichar_id(i);
932 const char* blob_choice = uch_set->id_to_unichar(choice_id);
933 correct_text.push_back(
STRING(blob_choice));
945 ASSERT_HOST(best_choice->length() == 0 || ratings !=
nullptr);
946 bool modified =
false;
947 for (
int i = 0; i + 1 < best_choice->length(); ++i) {
948 UNICHAR_ID new_id = class_cb->
Run(best_choice->unichar_id(i),
949 best_choice->unichar_id(i+1));
950 if (new_id != INVALID_UNICHAR_ID &&
951 (box_cb ==
nullptr || box_cb->
Run(box_word->BlobBox(i),
952 box_word->BlobBox(i + 1)))) {
954 best_choice->set_unichar_id(new_id, i);
956 MergeAdjacentBlobs(i);
957 const MATRIX_COORD& coord = best_choice->MatrixCoord(i);
958 if (!coord.
Valid(*ratings)) {
959 ratings->IncreaseBandSize(coord.
row + 1 - coord.
col);
961 BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i);
966 BLOB_CHOICE_IT bc_it(blob_choices);
967 bc_it.add_before_then_move(blob_choice);
979 if (reject_map.length() == best_choice->length())
980 reject_map.remove_pos(index);
981 best_choice->remove_unichar_id(index + 1);
982 rebuild_word->MergeBlobs(index, index + 2);
983 box_word->MergeBoxes(index, index + 2);
984 if (index + 1 < best_state.length()) {
985 best_state[index] += best_state[index + 1];
986 best_state.remove(index + 1);
996 static int is_simple_quote(
const char* signed_str,
int length) {
998 reinterpret_cast<const unsigned char*
>(signed_str);
1000 return (length == 1 && (*str ==
'\'' || *str ==
'`')) ||
1002 (length == 3 && ((*str == 0xe2 &&
1003 *(str + 1) == 0x80 &&
1004 *(str + 2) == 0x98) ||
1006 *(str + 1) == 0x80 &&
1007 *(str + 2) == 0x99)));
1013 const char *ch = uch_set->id_to_unichar(id1);
1014 const char *next_ch = uch_set->id_to_unichar(id2);
1015 if (is_simple_quote(ch, strlen(ch)) &&
1016 is_simple_quote(next_ch, strlen(next_ch)))
1017 return uch_set->unichar_to_id(
"\"");
1018 return INVALID_UNICHAR_ID;
1023 if (!uch_set->contains_unichar(
"\"") ||
1024 !uch_set->get_enabled(uch_set->unichar_to_id(
"\"")))
1027 ConditionalBlobMerge(
1035 const char *ch = uch_set->id_to_unichar(id1);
1036 const char *next_ch = uch_set->id_to_unichar(id2);
1037 if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
1038 (*ch ==
'-' || *ch ==
'~') && (*next_ch ==
'-' || *next_ch ==
'~'))
1039 return uch_set->unichar_to_id(
"-");
1040 return INVALID_UNICHAR_ID;
1052 if (!uch_set->contains_unichar(
"-") ||
1053 !uch_set->get_enabled(uch_set->unichar_to_id(
"-")))
1056 ConditionalBlobMerge(
1064 if (id1 == id2 && id1 == uch_set->unichar_to_id(
" "))
1067 return INVALID_UNICHAR_ID;
1072 if (ConditionalBlobMerge(
1074 int len = best_choice->length();
1084 for (
int index = start; index < start + count - 1; ++index) {
1085 if (index >= 0 && index < seam_array.size()) {
1086 SEAM* seam = seam_array[index];
1087 if (seam !=
nullptr && seam->
HasAnySplits())
return false;
1099 tess_failed =
false;
1100 tess_accepted =
false;
1101 tess_would_adapt =
false;
1111 fontinfo2 =
nullptr;
1113 fontinfo_id_count = 0;
1114 fontinfo_id2_count = 0;
1117 baseline_shift = 0.0f;
1118 space_certainty = 0.0f;
1119 guessed_x_ht =
true;
1120 guessed_caps_ht =
true;
1121 combination =
false;
1122 part_of_combo =
false;
1123 reject_spaces =
false;
1128 bln_boxes =
nullptr;
1131 chopped_word =
nullptr;
1132 rebuild_word =
nullptr;
1135 best_choice =
nullptr;
1136 raw_choice =
nullptr;
1137 ep_choice =
nullptr;
1138 blamer_bundle =
nullptr;
1146 delete blamer_bundle;
1147 blamer_bundle =
nullptr;
1154 fontinfo2 =
nullptr;
1155 fontinfo_id_count = 0;
1156 fontinfo_id2_count = 0;
1158 bln_boxes =
nullptr;
1160 delete chopped_word;
1161 chopped_word =
nullptr;
1162 delete rebuild_word;
1163 rebuild_word =
nullptr;
1167 correct_text.clear();
1168 seam_array.delete_data_pointers();
1170 blob_widths.clear();
1174 if (blamer_bundle !=
nullptr) blamer_bundle->ClearResults();
1177 best_choice =
nullptr;
1179 raw_choice =
nullptr;
1180 best_choices.clear();
1182 ep_choice =
nullptr;
1185 if (ratings !=
nullptr) {
1186 ratings->delete_matrix_pointers();
1194 if (other.block_res ==
nullptr) {
1196 if (block_res ==
nullptr)
1200 if (block_res ==
nullptr) {
1203 if (block_res == other.block_res) {
1204 if (other.row_res ==
nullptr || row_res ==
nullptr) {
1208 if (row_res == other.row_res) {
1210 ASSERT_HOST(other.word_res !=
nullptr && word_res !=
nullptr);
1211 if (word_res == other.word_res) {
1216 WERD_RES_IT word_res_it(&row_res->word_res_list);
1217 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1218 word_res_it.forward()) {
1219 if (word_res_it.data() == word_res) {
1221 }
else if (word_res_it.data() == other.word_res) {
1225 ASSERT_HOST(
"Error: Incomparable PAGE_RES_ITs" ==
nullptr);
1229 ROW_RES_IT row_res_it(&block_res->row_res_list);
1230 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1231 row_res_it.forward()) {
1232 if (row_res_it.data() == row_res) {
1234 }
else if (row_res_it.data() == other.row_res) {
1238 ASSERT_HOST(
"Error: Incomparable PAGE_RES_ITs" ==
nullptr);
1242 BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1243 for (block_res_it.mark_cycle_pt();
1244 !block_res_it.cycled_list(); block_res_it.forward()) {
1245 if (block_res_it.data() == block_res) {
1247 }
else if (block_res_it.data() == other.block_res) {
1252 ASSERT_HOST(
"Error: Incomparable PAGE_RES_ITs" ==
nullptr);
1263 auto* new_res =
new WERD_RES(new_word);
1264 new_res->CopySimpleFields(clone_res);
1265 new_res->combination =
true;
1267 WERD_RES_IT wr_it(&row()->word_res_list);
1268 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1270 if (word == word_res)
1274 wr_it.add_before_then_move(new_res);
1275 if (wr_it.at_first()) {
1278 ResetWordIterator();
1286 static void ComputeBlobEnds(
const WERD_RES& word,
const TBOX& clip_box,
1287 C_BLOB_LIST* next_word_blobs,
1293 TBOX blob_box = blob_it.data()->bounding_box();
1295 for (
int b = 1; b < length; ++b) {
1296 blob_box += blob_it.data()->bounding_box();
1301 int blob_end = INT32_MAX;
1302 if (!blob_it.at_first() || next_word_blobs !=
nullptr) {
1303 if (blob_it.at_first())
1304 blob_it.set_to_list(next_word_blobs);
1305 blob_end = (blob_box.
right() + blob_it.data()->bounding_box().left()) / 2;
1307 blob_end = ClipToRange<int>(blob_end, clip_box.
left(), clip_box.
right());
1316 int w_index,
TBOX prev_box, WERD_RES_IT w_it) {
1317 constexpr
int kSignificantOverlapFraction = 4;
1319 TBOX current_box = words[w_index]->word->bounding_box();
1321 if (w_index + 1 < words.
size() && words[w_index + 1] !=
nullptr &&
1322 words[w_index + 1]->word !=
nullptr)
1323 next_box = words[w_index + 1]->word->bounding_box();
1324 for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;
1326 if (w_it.data() ==
nullptr || w_it.data()->word ==
nullptr)
continue;
1327 TBOX w_box = w_it.data()->word->bounding_box();
1328 int height_limit = std::min<int>(w_box.
height(), w_box.
width() / 2);
1329 int width_limit = w_box.
width() / kSignificantOverlapFraction;
1330 int min_significant_overlap = std::max(height_limit, width_limit);
1334 if (overlap > min_significant_overlap) {
1335 if (prev_overlap > min_significant_overlap) {
1338 }
else if (next_overlap > min_significant_overlap) {
1342 clipped_box += w_box;
1346 if (clipped_box.
height() <= 0) {
1350 if (clipped_box.
width() <= 0) clipped_box = current_box;
1356 static TBOX MoveAndClipBlob(C_BLOB_IT* src_it, C_BLOB_IT* dest_it,
1357 const TBOX& clip_box) {
1358 C_BLOB* src_blob = src_it->extract();
1362 ClipToRange<int>(box.
left(), clip_box.
left(), clip_box.
right() - 1);
1364 ClipToRange<int>(box.
right(), clip_box.
left() + 1, clip_box.
right());
1366 ClipToRange<int>(box.
top(), clip_box.
bottom() + 1, clip_box.
top());
1368 ClipToRange<int>(box.
bottom(), clip_box.
bottom(), clip_box.
top() - 1);
1369 box =
TBOX(left, bottom, right, top);
1373 dest_it->add_after_then_move(src_blob);
1382 if (words->
empty()) {
1383 DeleteCurrentWord();
1389 (*words)[0]->word->set_flag(
W_BOL,
true);
1391 (*words)[0]->word->set_blanks(input_word->
word->
space());
1401 WERD_IT w_it(row()->row->word_list());
1403 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1404 WERD* word = w_it.data();
1405 if (word == input_word->
word)
1412 WERD_RES_IT wr_it(&row()->word_res_list);
1413 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1415 if (word == input_word)
1426 for (
int w = 0; w < words->
size(); ++w) {
1428 clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
1431 C_BLOB_LIST* next_word_blobs =
1432 w + 1 < words->
size() ? (*words)[w + 1]->word->cblob_list() :
nullptr;
1433 ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1436 C_BLOB_LIST fake_blobs;
1437 C_BLOB_IT fake_b_it(&fake_blobs);
1439 fake_b_it.move_to_first();
1444 for (
int i = 0; i < blob_ends.
size(); ++i, fake_b_it.forward()) {
1445 int end_x = blob_ends[i];
1448 while (!src_b_it.empty() &&
1449 src_b_it.data()->bounding_box().x_middle() < end_x) {
1450 blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1453 while (!rej_b_it.empty() &&
1454 rej_b_it.data()->bounding_box().x_middle() < end_x) {
1455 blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1460 blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1462 box_word->InsertBox(i, blob_box);
1469 w_it.add_before_stay_put(word_w->
word);
1472 (*words)[w] =
nullptr;
1473 wr_it.add_before_stay_put(word_w);
1481 delete w_it.extract();
1482 delete wr_it.extract();
1483 ResetWordIterator();
1491 if (!word_res->combination) {
1495 WERD_IT w_it(row()->row->word_list());
1496 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1497 if (w_it.data() == word_res->word) {
1502 delete w_it.extract();
1506 WERD_RES_IT wr_it(&row()->word_res_list);
1507 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1508 if (wr_it.data() == word_res) {
1514 delete wr_it.extract();
1515 ResetWordIterator();
1521 WERD* real_word = word_res->word;
1524 if (word_res->combination) {
1527 WERD_RES_IT wr_it(&row()->word_res_list);
1528 for (wr_it.mark_cycle_pt();
1529 !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1533 real_word = wr_it.data()->word;
1548 block_res_it.set_to_list(&page_res->block_res_list);
1549 block_res_it.mark_cycle_pt();
1550 prev_block_res =
nullptr;
1551 prev_row_res =
nullptr;
1552 prev_word_res =
nullptr;
1553 block_res =
nullptr;
1556 next_block_res =
nullptr;
1557 next_row_res =
nullptr;
1558 next_word_res =
nullptr;
1559 internal_forward(
true, empty_ok);
1560 return internal_forward(
false, empty_ok);
1571 if (row_res == next_row_res) {
1574 word_res_it.move_to_first();
1575 for (word_res_it.mark_cycle_pt();
1576 !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1577 word_res_it.forward()) {
1578 if (!word_res_it.data()->part_of_combo) {
1579 if (prev_row_res == row_res) prev_word_res = word_res;
1580 word_res = word_res_it.data();
1584 wr_it_of_next_word = word_res_it;
1585 word_res_it.forward();
1588 WERD_RES_IT wr_it(&row_res->word_res_list);
1589 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1590 if (!wr_it.data()->part_of_combo) {
1591 if (prev_row_res == row_res) prev_word_res = word_res;
1592 word_res = wr_it.data();
1613 WERD_RES *PAGE_RES_IT::internal_forward(
bool new_block,
bool empty_ok) {
1614 bool new_row =
false;
1616 prev_block_res = block_res;
1617 prev_row_res = row_res;
1618 prev_word_res = word_res;
1619 block_res = next_block_res;
1620 row_res = next_row_res;
1621 word_res = next_word_res;
1622 wr_it_of_current_word = wr_it_of_next_word;
1623 next_block_res =
nullptr;
1624 next_row_res =
nullptr;
1625 next_word_res =
nullptr;
1627 while (!block_res_it.cycled_list()) {
1630 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1631 row_res_it.mark_cycle_pt();
1632 if (row_res_it.empty() && empty_ok) {
1633 next_block_res = block_res_it.data();
1638 while (!row_res_it.cycled_list()) {
1641 word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1642 word_res_it.mark_cycle_pt();
1645 while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
1646 word_res_it.forward();
1647 if (!word_res_it.cycled_list()) {
1648 next_block_res = block_res_it.data();
1649 next_row_res = row_res_it.data();
1650 next_word_res = word_res_it.data();
1651 wr_it_of_next_word = word_res_it;
1652 word_res_it.forward();
1656 row_res_it.forward();
1660 block_res_it.forward();
1665 if (page_res !=
nullptr && page_res->prev_word_best_choice !=
nullptr) {
1666 *page_res->prev_word_best_choice =
1667 (new_block || prev_word_res ==
nullptr) ?
nullptr : prev_word_res->
best_choice;
1679 if (!row)
return nullptr;
1680 for (restart_page(); this->row() != row; forward()) {
1693 while (block_res == next_block_res &&
1694 (next_row_res !=
nullptr && next_row_res->row !=
nullptr &&
1695 row_res->row->para() == next_row_res->row->para())) {
1696 internal_forward(
false,
true);
1698 return internal_forward(
false,
true);
1708 while (block_res == next_block_res) {
1709 internal_forward(
false,
true);
1711 return internal_forward(
false,
true);
1715 int16_t chars_in_word;
1716 int16_t rejects_in_word = 0;
1718 chars_in_word = word_res->reject_map.length ();
1719 page_res->char_count += chars_in_word;
1720 block_res->char_count += chars_in_word;
1721 row_res->char_count += chars_in_word;
1723 rejects_in_word = word_res->reject_map.reject_count ();
1725 page_res->rej_count += rejects_in_word;
1726 block_res->rej_count += rejects_in_word;
1727 row_res->rej_count += rejects_in_word;
1728 if (chars_in_word == rejects_in_word)
1729 row_res->whole_word_rej_count += rejects_in_word;
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
UNICHAR_ID unichar_id() const
float adjust_factor() const
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
int GetBlobsGap(int blob_index)
void FilterWordChoices(int debug_level)
void operator=(const ELIST_LINK &)
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Special case latin for y. splitting.
void UpdateStateForSplit(int blob_position)
void DebugTopChoice(const char *msg) const
void SetupFake(const UNICHARSET &uch)
void ReplaceBestChoice(WERD_CHOICE *choice)
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
WERD_RES * forward_block()
GenericVector< int > blob_widths
BLOB_CHOICE * GetBlobChoice(int index) const
POLY_BLOCK * poly_block() const
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
GenericVector< STRING > correct_text
TBOX bounding_box() const
void add_str_int(const char *str, int number)
const STRING & unichar_string() const
C_BLOB_LIST * rej_cblob_list()
bool PiecesAllNatural(int start, int count) const
GenericVector< SEAM * > seam_array
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
static BoxWord * CopyFromNormalized(TWERD *tessword)
WERD_CHOICE_LIST best_choices
bool contains(const FCOORD pt) const
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
void FakeWordFromRatings(PermuterType permuter)
TBOX intersection(const TBOX &box) const
TBOX bounding_box() const
void SetupBlobWidthsAndGaps()
tesseract::BoxWord * bln_boxes
tesseract::Tesseract * tesseract
const double kMaxWordGapRatio
WERD_RES & operator=(const WERD_RES &source)
bool script_has_xheight() const
void DebugWordChoices(bool debug, const char *word_to_debug)
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
const UNICHARSET * uch_set
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
const FontInfo * fontinfo2
static int SortByXMiddle(const void *v1, const void *v2)
#define ELISTIZE(CLASSNAME)
bool HasAnySplits() const
void SetupWordScript(const UNICHARSET &unicharset_in)
bool LogNewRawChoice(WERD_CHOICE *word_choice)
CRUNCH_MODE unlv_crunch_mode
int8_t fontinfo_id2_count
WERD_RES * forward_paragraph()
void SetAllScriptPositions(tesseract::ScriptPos position)
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
int cmp(const PAGE_RES_IT &other) const
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX & > *box_cb)
const FontInfo * fontinfo
const char * string() const
DLLSYM void tprintf(const char *format,...)
bool PrepareToInsertSeam(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int insert_index, bool modify)
tesseract::BoxWord * box_word
void copy_on(WERD_RES *word_res)
PDBLK pdblk
Page Description Block.
void CopySimpleFields(const WERD_RES &source)
void ConsumeWordResults(WERD_RES *word)
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
void start_seam_list(TWERD *word, GenericVector< SEAM * > *seam_array)
TBOX bounding_box() const
void BestChoiceToCorrectText()
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
UNICHAR_ID unichar_id(int index) const
WERD_RES * start_page(bool empty_ok)
WERD_CHOICE * best_choice
int GetBlobsWidth(int start_blob, int last_blob)
void CloneChoppedToRebuild()
const double kMaxWordSizeRatio
const double kMaxLineSizeRatio
bool flag(WERD_FLAGS mask) const
void MakeCurrentWordFuzzy()
bool Valid(const MATRIX &m) const
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
void InitForRetryRecognition(const WERD_RES &source)
GenericVector< int > best_state
ROW_LIST * row_list()
get rows
GenericVector< int > blob_gaps
int TotalOfStates() const
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
void MergeAdjacentBlobs(int index)
void PrintBestChoices() const
WERD_CHOICE ** prev_word_best_choice
x-height concept makes sense.
const int kWordrecMaxNumJoinChunks
BLOCK_RES_LIST block_res_list
void set_flag(WERD_FLAGS mask, bool value)
void SetScriptPositions()
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
void InsertSeam(int blob_number, SEAM *seam)
int state(int index) const
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
static C_BLOB * FakeBlob(const TBOX &box)
void set_unichar_id(UNICHAR_ID newunichar_id)
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
BlamerBundle * blamer_bundle
C_BLOB_LIST * cblob_list()