tesseract  4.1.0
pageres.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.cpp (Formerly page_res.c)
3  * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
4  * and an iterator class to iterate over the words.
5  * Main purposes:
6  * Easy way to iterate over the words without a 3-nested loop.
7  * Holds data used during word recognition.
8  * Holds information about alternative spacing paths.
9  * Author: Phil Cheatle
10  *
11  * (C) Copyright 1992, Hewlett-Packard Ltd.
12  ** Licensed under the Apache License, Version 2.0 (the "License");
13  ** you may not use this file except in compliance with the License.
14  ** You may obtain a copy of the License at
15  ** http://www.apache.org/licenses/LICENSE-2.0
16  ** Unless required by applicable law or agreed to in writing, software
17  ** distributed under the License is distributed on an "AS IS" BASIS,
18  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  ** See the License for the specific language governing permissions and
20  ** limitations under the License.
21  *
22  **********************************************************************/
23 
24 #include "pageres.h"
25 #include <cassert> // for assert
26 #include <cstdint> // for INT32_MAX
27 #include <cstring> // for strlen
28 #include "blamer.h" // for BlamerBundle
29 #include "blobs.h" // for TWERD, TBLOB
30 #include "boxword.h" // for BoxWord
31 #include "errcode.h" // for ASSERT_HOST
32 #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
33 #include "ocrrow.h" // for ROW, ROW_IT
34 #include "pdblock.h" // for PDBLK
35 #include "polyblk.h" // for POLY_BLOCK
36 #include "publictypes.h" // for OcrEngineMode, OEM_LSTM_ONLY
37 #include "seam.h" // for SEAM, start_seam_list
38 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
39 #include "tesscallback.h" // for NewPermanentTessCallback, TessResultCallback2
40 #include "tprintf.h" // for tprintf
41 
42 struct Pix;
43 
46 
47 // Gain factor for computing thresholds that determine the ambiguity of a word.
// Multiplies the adjust-factor difference in StopperAmbigThreshold().
48 static const double kStopperAmbiguityThresholdGain = 8.0;
49 // Constant offset for computing thresholds that determine the ambiguity of a
50 // word. Subtracted from the scaled difference in StopperAmbigThreshold().
51 static const double kStopperAmbiguityThresholdOffset = 1.5;
52 // Max number of broken pieces to associate.
// NOTE(review): the constant this comment documents is absent from this
// listing (listing line 53 is missing). kWordrecMaxNumJoinChunks is used in
// SetupForRecognition() but is not defined in the visible text - confirm
// against the upstream file.
54 // Max ratio of word box height to line size to allow it to be processed as
55 // a line with other words.
56 const double kMaxWordSizeRatio = 1.25;
57 // Max ratio of line box height to line size to allow a new word to be added.
58 const double kMaxLineSizeRatio = 1.25;
59 // Max ratio of word gap to line size to allow a new word to be added.
60 const double kMaxWordGapRatio = 2.0;
61 
62 // Computes and returns a threshold of certainty difference used to determine
63 // which words to keep, based on the adjustment factors of the two words.
64 // TODO(rays) This is horrible. Replace with an enhance params training model.
65 static double StopperAmbigThreshold(double f1, double f2) {
66  return (f2 - f1) * kStopperAmbiguityThresholdGain -
67  kStopperAmbiguityThresholdOffset;
68 }
69 
70 /*************************************************************************
71  * PAGE_RES::PAGE_RES
72  *
73  * Constructor for page results
74  *************************************************************************/
// NOTE(review): the first line of this definition (listing line 75) is absent
// from this listing; per the banner above this is presumably the PAGE_RES
// constructor - confirm against the upstream file.
// Visible behaviour: calls Init(), appends one BLOCK_RES per BLOCK of
// the_block_list to block_res_list (forwarding merge_similar_words), and
// stores prev_word_best_choice_ptr in prev_word_best_choice.
76  bool merge_similar_words,
77  BLOCK_LIST *the_block_list,
78  WERD_CHOICE **prev_word_best_choice_ptr) {
79  Init();
80  BLOCK_IT block_it(the_block_list);
81  BLOCK_RES_IT block_res_it(&block_res_list);
// One result object per source block, in list order.
82  for (block_it.mark_cycle_pt();
83  !block_it.cycled_list(); block_it.forward()) {
84  block_res_it.add_to_end(new BLOCK_RES(merge_similar_words,
85  block_it.data()));
86  }
// The pointer itself is stored; nothing is copied from what it points at.
87  prev_word_best_choice = prev_word_best_choice_ptr;
88 }
89 
90 /*************************************************************************
91  * BLOCK_RES::BLOCK_RES
92  *
93  * Constructor for BLOCK results
94  *************************************************************************/
95 
96 BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
97  ROW_IT row_it (the_block->row_list ());
98  ROW_RES_IT row_res_it(&row_res_list);
99 
100  char_count = 0;
101  rej_count = 0;
102  font_class = -1; //not assigned
103  x_height = -1.0;
104  font_assigned = false;
105  bold = false;
106  italic = false;
107  row_count = 0;
108 
109  block = the_block;
110 
111  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
112  row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
113  }
114 }
115 
116 /*************************************************************************
117  * ROW_RES::ROW_RES
118  *
119  * Constructor for ROW results
120  *************************************************************************/
121 
122 ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
123  WERD_IT word_it(the_row->word_list());
124  WERD_RES_IT word_res_it(&word_res_list);
125  WERD_RES *combo = nullptr; // current combination of fuzzies
126  WERD *copy_word;
127 
128  char_count = 0;
129  rej_count = 0;
130  whole_word_rej_count = 0;
131 
132  row = the_row;
133  bool add_next_word = false;
134  TBOX union_box;
135  float line_height = the_row->x_height() + the_row->ascenders() -
136  the_row->descenders();
137  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
138  auto* word_res = new WERD_RES(word_it.data());
139  word_res->x_height = the_row->x_height();
140  if (add_next_word) {
141  ASSERT_HOST(combo != nullptr);
142  // We are adding this word to the combination.
143  word_res->part_of_combo = true;
144  combo->copy_on(word_res);
145  } else if (merge_similar_words) {
146  union_box = word_res->word->bounding_box();
147  add_next_word = !word_res->word->flag(W_REP_CHAR) &&
148  union_box.height() <= line_height * kMaxWordSizeRatio;
149  word_res->odd_size = !add_next_word;
150  }
151  WERD* next_word = word_it.data_relative(1);
152  if (merge_similar_words) {
153  if (add_next_word && !next_word->flag(W_REP_CHAR)) {
154  // Next word will be added on if all of the following are true:
155  // Not a rep char.
156  // Box height small enough.
157  // Union box height small enough.
158  // Horizontal gap small enough.
159  TBOX next_box = next_word->bounding_box();
160  int prev_right = union_box.right();
161  union_box += next_box;
162  if (next_box.height() > line_height * kMaxWordSizeRatio ||
163  union_box.height() > line_height * kMaxLineSizeRatio ||
164  next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
165  add_next_word = false;
166  }
167  }
168  next_word->set_flag(W_FUZZY_NON, add_next_word);
169  } else {
170  add_next_word = next_word->flag(W_FUZZY_NON);
171  }
172  if (add_next_word) {
173  if (combo == nullptr) {
174  copy_word = new WERD;
175  *copy_word = *(word_it.data()); // deep copy
176  combo = new WERD_RES(copy_word);
177  combo->x_height = the_row->x_height();
178  combo->combination = true;
179  word_res_it.add_to_end(combo);
180  }
181  word_res->part_of_combo = true;
182  } else {
183  combo = nullptr;
184  }
185  word_res_it.add_to_end(word_res);
186  }
187 }
188 
189 
// NOTE(review): the signature line of this definition (listing line 190) is
// absent from this listing; the body assigns from `source` and returns *this,
// so it is presumably WERD_RES's copy-assignment operator - confirm upstream.
// NOTE(review): Clear() runs before any field of `source` is read and no
// self-assignment check is visible in this fragment - confirm whether
// assigning an object to itself needs guarding.
191  this->ELIST_LINK::operator=(source);
192  Clear();
// A combination gets its own copy of the WERD; any other word shares the
// source's WERD pointer.
193  if (source.combination) {
194  word = new WERD;
195  *word = *(source.word); // deep copy
196  } else {
197  word = source.word; // pt to same word
198  }
// Each of the following objects is duplicated only when the source has one.
199  if (source.bln_boxes != nullptr)
200  bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
201  if (source.chopped_word != nullptr)
202  chopped_word = new TWERD(*source.chopped_word);
203  if (source.rebuild_word != nullptr)
204  rebuild_word = new TWERD(*source.rebuild_word);
205  // TODO(rays) Do we ever need to copy the seam_array?
206  blob_row = source.blob_row;
207  denorm = source.denorm;
208  if (source.box_word != nullptr)
209  box_word = new tesseract::BoxWord(*source.box_word);
210  best_state = source.best_state;
211  correct_text = source.correct_text;
212  blob_widths = source.blob_widths;
213  blob_gaps = source.blob_gaps;
214  // None of the uses of operator= require the ratings matrix to be copied,
215  // so don't as it would be really slow.
216 
217  // Copy the cooked choices.
// The const_cast is only there to build an iterator over the const source
// list; the loop reads from it and never modifies it.
218  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices));
219  WERD_CHOICE_IT wc_dest_it(&best_choices);
220  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
221  const WERD_CHOICE *choice = wc_it.data();
222  wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
223  }
// best_choice is set to the first element of the copied list, or nullptr
// when the list is empty.
224  if (!wc_dest_it.empty()) {
225  wc_dest_it.move_to_first();
226  best_choice = wc_dest_it.data();
227  } else {
228  best_choice = nullptr;
229  }
230 
231  if (source.raw_choice != nullptr) {
232  raw_choice = new WERD_CHOICE(*source.raw_choice);
233  } else {
234  raw_choice = nullptr;
235  }
236  if (source.ep_choice != nullptr) {
237  ep_choice = new WERD_CHOICE(*source.ep_choice);
238  } else {
239  ep_choice = nullptr;
240  }
241  reject_map = source.reject_map;
242  combination = source.combination;
243  part_of_combo = source.part_of_combo;
244  CopySimpleFields(source);
245  if (source.blamer_bundle != nullptr) {
246  blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
247  }
248  return *this;
249 }
250 
251 // Copies basic fields that don't involve pointers that might be useful
252 // to copy when making one WERD_RES from another.
// NOTE(review): the signature line of this definition (listing line 253) is
// absent from this listing; presumably WERD_RES::CopySimpleFields(source),
// which is called elsewhere in this file - confirm against upstream.
// Every statement is a plain member-wise assignment from `source`; nothing
// is allocated or freed here.
254  tess_failed = source.tess_failed;
255  tess_accepted = source.tess_accepted;
256  tess_would_adapt = source.tess_would_adapt;
257  done = source.done;
258  unlv_crunch_mode = source.unlv_crunch_mode;
259  small_caps = source.small_caps;
260  odd_size = source.odd_size;
261  italic = source.italic;
262  bold = source.bold;
263  fontinfo = source.fontinfo;
264  fontinfo2 = source.fontinfo2;
265  fontinfo_id_count = source.fontinfo_id_count;
266  fontinfo_id2_count = source.fontinfo_id2_count;
267  x_height = source.x_height;
268  caps_height = source.caps_height;
269  baseline_shift = source.baseline_shift;
270  guessed_x_ht = source.guessed_x_ht;
271  guessed_caps_ht = source.guessed_caps_ht;
272  reject_spaces = source.reject_spaces;
273  uch_set = source.uch_set;
274  tesseract = source.tesseract;
275 }
276 
277 // Initializes a blank (default constructed) WERD_RES from one that has
278 // already been recognized.
279 // Use SetupFor*Recognition afterwards to complete the setup and make
280 // it ready for a retry recognition.
// NOTE(review): the signature line of this definition (listing line 281) is
// absent from this listing; presumably WERD_RES::InitForRetryRecognition
// taking `source` - confirm against upstream.
// The WERD pointer is shared with `source`, not copied.
282  word = source.word;
283  CopySimpleFields(source);
// Only the truth part of the source's blamer bundle is carried over, into a
// freshly allocated bundle.
284  if (source.blamer_bundle != nullptr) {
285  blamer_bundle = new BlamerBundle();
286  blamer_bundle->CopyTruth(*source.blamer_bundle);
287  }
288 }
289 
290 // Sets up the members used in recognition: bln_boxes, chopped_word,
291 // seam_array, denorm. Returns false if
292 // the word is empty and sets up fake results. If use_body_size is
293 // true and row->body_size is set, then body_size will be used for
294 // blob normalization instead of xheight + ascrise. This flag is for
295 // those languages that are using CJK pitch model and thus it has to
296 // be true if and only if tesseract->textord_use_cjk_fp_model is
297 // true.
298 // If allow_detailed_fx is true, the feature extractor will receive fine
299 // precision outline information, allowing smoother features and better
300 // features on low resolution images.
301 // The norm_mode_hint sets the default mode for normalization in absence
302 // of any of the above flags.
303 // norm_box is used to override the word bounding box to determine the
304 // normalization scale and offset.
305 // Returns false if the word is empty and sets up fake results.
306 bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in,
307  tesseract::Tesseract* tess, Pix* pix,
308  int norm_mode,
309  const TBOX* norm_box,
310  bool numeric_mode,
311  bool use_body_size,
312  bool allow_detailed_fx,
313  ROW *row, const BLOCK* block) {
314  auto norm_mode_hint =
315  static_cast<tesseract::OcrEngineMode>(norm_mode);
316  tesseract = tess;
317  POLY_BLOCK* pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
318  if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
319  word->cblob_list()->empty()) ||
320  (pb != nullptr && !pb->IsText())) {
321  // Empty words occur when all the blobs have been moved to the rej_blobs
322  // list, which seems to occur frequently in junk.
323  SetupFake(unicharset_in);
324  word->set_flag(W_REP_CHAR, false);
325  return false;
326  }
327  ClearResults();
328  SetupWordScript(unicharset_in);
329  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
330  float word_xheight = use_body_size && row != nullptr && row->body_size() > 0.0f
331  ? row->body_size() : x_height;
332  chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
333  word_xheight, baseline_shift, numeric_mode,
334  norm_mode_hint, norm_box, &denorm);
335  blob_row = row;
336  SetupBasicsFromChoppedWord(unicharset_in);
337  SetupBlamerBundle();
338  int num_blobs = chopped_word->NumBlobs();
339  ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
340  tess_failed = false;
341  return true;
342 }
343 
344 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
345 // accumulators from a made chopped word. We presume the fields are already
346 // empty.
// NOTE(review): the signature line of this definition (listing line 347) is
// absent from this listing; presumably WERD_RES::SetupBasicsFromChoppedWord,
// which SetupForRecognition() calls with a UNICHARSET - confirm upstream.
// Rebuilds bln_boxes and the seam list from chopped_word, recomputes the
// blob widths/gaps, then clears the word choices.
348  bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);
349  start_seam_list(chopped_word, &seam_array);
350  SetupBlobWidthsAndGaps();
351  ClearWordChoices();
352 }
353 
354 // Sets up the members used in recognition for an empty recognition result:
355 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
356 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
357  ClearResults();
358  SetupWordScript(unicharset_in);
359  chopped_word = new TWERD;
360  rebuild_word = new TWERD;
361  bln_boxes = new tesseract::BoxWord;
362  box_word = new tesseract::BoxWord;
363  int blob_count = word->cblob_list()->length();
364  if (blob_count > 0) {
365  auto** fake_choices = new BLOB_CHOICE*[blob_count];
366  // For non-text blocks, just pass any blobs through to the box_word
367  // and call the word failed with a fake classification.
368  C_BLOB_IT b_it(word->cblob_list());
369  int blob_id = 0;
370  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
371  TBOX box = b_it.data()->bounding_box();
372  box_word->InsertBox(box_word->length(), box);
373  fake_choices[blob_id++] = new BLOB_CHOICE;
374  }
375  FakeClassifyWord(blob_count, fake_choices);
376  delete [] fake_choices;
377  } else {
378  auto* word = new WERD_CHOICE(&unicharset_in);
379  word->make_bad();
380  LogNewRawChoice(word);
381  // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
382  LogNewCookedChoice(1, false, word);
383  }
384  tess_failed = true;
385  done = true;
386 }
387 
// NOTE(review): the signature line of this definition (listing line 388) is
// absent from this listing; presumably WERD_RES::SetupWordScript taking a
// UNICHARSET reference named `uch` - confirm against upstream.
// Stores the address of `uch` in uch_set and stamps the word with the
// unicharset's default script id and two script-derived flags.
389  uch_set = &uch;
390  int script = uch.default_sid();
391  word->set_script_id(script);
392  word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
393  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
394 }
395 
396 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
// NOTE(review): the signature line of this definition (listing line 397) is
// absent from this listing; presumably WERD_RES::SetupBlamerBundle(), which
// SetupForRecognition() calls with no arguments - confirm against upstream.
// Does nothing when there is no blamer bundle.
398  if (blamer_bundle != nullptr) {
399  blamer_bundle->SetupNormTruthWord(denorm);
400  }
401 }
402 
403 // Computes the blob_widths and blob_gaps from the chopped_word.
// NOTE(review): the signature line of this definition (listing line 404) is
// absent from this listing; presumably WERD_RES::SetupBlobWidthsAndGaps(),
// which is called elsewhere in this file - confirm against upstream.
// Both vectors are rebuilt from scratch. blob_widths gets one entry per
// blob; blob_gaps gets one entry per adjacent pair, i.e. one fewer.
405  blob_widths.truncate(0);
406  blob_gaps.truncate(0);
407  int num_blobs = chopped_word->NumBlobs();
408  for (int b = 0; b < num_blobs; ++b) {
409  TBLOB *blob = chopped_word->blobs[b];
410  TBOX box = blob->bounding_box();
411  blob_widths.push_back(box.width());
// The gap is the next blob's left edge minus this blob's right edge.
412  if (b + 1 < num_blobs) {
413  blob_gaps.push_back(
414  chopped_word->blobs[b + 1]->bounding_box().left() - box.right());
415  }
416  }
417 }
418 
419 // Updates internal data to account for a new SEAM (chop) at the given
420 // blob_number. Fixes the ratings matrix and states in the choices, as well
421 // as the blob widths and gaps.
422 void WERD_RES::InsertSeam(int blob_number, SEAM* seam) {
423  // Insert the seam into the SEAMS array.
424  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
425  seam_array.insert(seam, blob_number);
426  if (ratings != nullptr) {
427  // Expand the ratings matrix.
428  ratings = ratings->ConsumeAndMakeBigger(blob_number);
429  // Fix all the segmentation states.
430  if (raw_choice != nullptr)
431  raw_choice->UpdateStateForSplit(blob_number);
432  WERD_CHOICE_IT wc_it(&best_choices);
433  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
434  WERD_CHOICE* choice = wc_it.data();
435  choice->UpdateStateForSplit(blob_number);
436  }
437  SetupBlobWidthsAndGaps();
438  }
439 }
440 
441 // Returns true if all the word choices except the first have adjust_factors
442 // worse than the given threshold.
// NOTE(review): the signature line of this definition (listing line 443) is
// absent from this listing; the body reads a value named `threshold` and
// returns bool - confirm the exact signature against upstream.
444  // The choices are not changed by this iteration.
445  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
// forward() is called before the first test, so the first choice is skipped;
// the loop ends when the iterator is back at the first element.
446  for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
447  WERD_CHOICE* choice = wc_it.data();
// An adjust_factor at or below the threshold means "not worse": fail fast.
448  if (choice->adjust_factor() <= threshold)
449  return false;
450  }
451  return true;
452 }
453 
454 // Returns true if the current word is ambiguous (by number of answers or
455 // by dangerous ambigs.)
// NOTE(review): the signature line of this definition (listing line 456) is
// absent from this listing - confirm name and signature against upstream.
// best_choice is dereferenced whenever best_choices is a singleton; no null
// check is visible here, so presumably callers guarantee it is set.
457  return !best_choices.singleton() || best_choice->dangerous_ambig_found();
458 }
459 
460 // Returns true if the ratings matrix size matches the sum of each of the
461 // segmentation states.
// NOTE(review): the signature line of this definition (listing line 462) is
// absent from this listing - confirm name and signature against upstream.
// ratings and raw_choice are dereferenced without a null check in the
// visible code.
463  int ratings_dim = ratings->dimension();
// The raw choice must account for exactly ratings_dim chunks.
464  if (raw_choice->TotalOfStates() != ratings_dim) {
465  tprintf("raw_choice has total of states = %d vs ratings dim of %d\n",
466  raw_choice->TotalOfStates(), ratings_dim);
467  return false;
468  }
// The same check for every cooked choice; `index` is only used to label the
// diagnostic message.
469  WERD_CHOICE_IT it(&best_choices);
470  int index = 0;
471  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
472  WERD_CHOICE* choice = it.data();
473  if (choice->TotalOfStates() != ratings_dim) {
474  tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n",
475  index, choice->TotalOfStates(), ratings_dim);
476  return false;
477  }
478  }
479  return true;
480 }
481 
482 // Prints a list of words found if debug is true or the word result matches
483 // the word_to_debug.
484 void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
485  if (debug ||
486  (word_to_debug != nullptr && *word_to_debug != '\0' && best_choice != nullptr &&
487  best_choice->unichar_string() == STRING(word_to_debug))) {
488  if (raw_choice != nullptr)
489  raw_choice->print("\nBest Raw Choice");
490 
491  WERD_CHOICE_IT it(&best_choices);
492  int index = 0;
493  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
494  WERD_CHOICE* choice = it.data();
495  STRING label;
496  label.add_str_int("\nCooked Choice #", index);
497  choice->print(label.string());
498  }
499  }
500 }
501 
502 // Prints the top choice along with the accepted/done flags.
503 void WERD_RES::DebugTopChoice(const char* msg) const {
504  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
505  tess_accepted, tess_would_adapt, done);
506  if (best_choice == nullptr)
507  tprintf("<Null choice>\n");
508  else
509  best_choice->print(msg);
510 }
511 
512 // Removes from best_choices all choices which are not within a reasonable
513 // range of the best choice.
514 // TODO(rays) incorporate the information used here into the params training
515 // re-ranker, in place of this heuristic that is based on the previous
516 // adjustment factor.
517 void WERD_RES::FilterWordChoices(int debug_level) {
518  if (best_choice == nullptr || best_choices.singleton())
519  return;
520 
521  if (debug_level >= 2)
522  best_choice->print("\nFiltering against best choice");
523  WERD_CHOICE_IT it(&best_choices);
524  int index = 0;
525  for (it.forward(); !it.at_first(); it.forward(), ++index) {
526  WERD_CHOICE* choice = it.data();
527  float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
528  choice->adjust_factor());
529  // i, j index the blob choice in choice, best_choice.
530  // chunk is an index into the chopped_word blobs (AKA chunks).
531  // Since the two words may use different segmentations of the chunks, we
532  // iterate over the chunks to find out whether a comparable blob
533  // classification is much worse than the best result.
534  int i = 0, j = 0, chunk = 0;
535  // Each iteration of the while deals with 1 chunk. On entry choice_chunk
536  // and best_chunk are the indices of the first chunk in the NEXT blob,
537  // i.e. we don't have to increment i, j while chunk < choice_chunk and
538  // best_chunk respectively.
539  int choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
540  while (i < choice->length() && j < best_choice->length()) {
541  if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
542  choice->certainty(i) - best_choice->certainty(j) < threshold) {
543  if (debug_level >= 2) {
544  choice->print("WorstCertaintyDiffWorseThan");
545  tprintf(
546  "i %d j %d Choice->Blob[i].Certainty %.4g"
547  " WorstOtherChoiceCertainty %g Threshold %g\n",
548  i, j, choice->certainty(i), best_choice->certainty(j), threshold);
549  tprintf("Discarding bad choice #%d\n", index);
550  }
551  delete it.extract();
552  break;
553  }
554  ++chunk;
555  // If needed, advance choice_chunk to keep up with chunk.
556  while (choice_chunk < chunk && ++i < choice->length())
557  choice_chunk += choice->state(i);
558  // If needed, advance best_chunk to keep up with chunk.
559  while (best_chunk < chunk && ++j < best_choice->length())
560  best_chunk += best_choice->state(j);
561  }
562  }
563 }
564 
565 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
566  float min_rating,
567  float max_rating,
568  float rating_margin,
569  float* thresholds) {
570  int chunk = 0;
571  int end_chunk = best_choice->state(0);
572  int end_raw_chunk = raw_choice->state(0);
573  int raw_blob = 0;
574  for (int i = 0; i < best_choice->length(); i++, thresholds++) {
575  float avg_rating = 0.0f;
576  int num_error_chunks = 0;
577 
578  // For each chunk in best choice blob i, count non-matching raw results.
579  while (chunk < end_chunk) {
580  if (chunk >= end_raw_chunk) {
581  ++raw_blob;
582  end_raw_chunk += raw_choice->state(raw_blob);
583  }
584  if (best_choice->unichar_id(i) !=
585  raw_choice->unichar_id(raw_blob)) {
586  avg_rating += raw_choice->certainty(raw_blob);
587  ++num_error_chunks;
588  }
589  ++chunk;
590  }
591 
592  if (num_error_chunks > 0) {
593  avg_rating /= num_error_chunks;
594  *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
595  } else {
596  *thresholds = max_rating;
597  }
598 
599  if (*thresholds > max_rating)
600  *thresholds = max_rating;
601  if (*thresholds < min_rating)
602  *thresholds = min_rating;
603  }
604 }
605 
606 // Saves a copy of the word_choice if it has the best unadjusted rating.
607 // Returns true if the word_choice was the new best.
// NOTE(review): the signature line of this definition (listing line 608) is
// absent from this listing; presumably WERD_RES::LogNewRawChoice taking a
// WERD_CHOICE* named word_choice, as called from SetupFake() - confirm.
// A lower rating() wins. The stored raw_choice is a copy, so word_choice
// itself is neither kept nor deleted here.
609  if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) {
610  delete raw_choice;
611  raw_choice = new WERD_CHOICE(*word_choice);
612  raw_choice->set_permuter(TOP_CHOICE_PERM);
613  return true;
614  }
615  return false;
616 }
617 
618 // Consumes word_choice by adding it to best_choices, (taking ownership) if
619 // the certainty for word_choice is some distance of the best choice in
620 // best_choices, or by deleting the word_choice and returning false.
621 // The best_choices list is kept in sorted order by rating. Duplicates are
622 // removed, and the list is kept no longer than max_num_choices in length.
623 // Returns true if the word_choice is still a valid pointer.
// Parameters:
//   max_num_choices  cap on the number of entries kept in best_choices.
//   debug            when true, decisions are reported through tprintf.
//   word_choice      candidate to insert; it is deleted on every path that
//                    returns false.
// Statement order matters throughout: the list is edited while it is walked.
624 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
625  WERD_CHOICE* word_choice) {
626  if (best_choice != nullptr) {
627  // Throw out obviously bad choices to save some work.
628  // TODO(rays) Get rid of this! This piece of code produces different
629  // results according to the order in which words are found, which is an
630  // undesirable behavior. It would be better to keep all the choices and
631  // prune them later when more information is available.
632  float max_certainty_delta =
633  StopperAmbigThreshold(best_choice->adjust_factor(),
634  word_choice->adjust_factor());
// The allowed delta is capped at -kStopperAmbiguityThresholdOffset.
635  if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
636  max_certainty_delta = -kStopperAmbiguityThresholdOffset;
637  if (word_choice->certainty() - best_choice->certainty() <
638  max_certainty_delta) {
639  if (debug) {
640  STRING bad_string;
641  word_choice->string_and_lengths(&bad_string, nullptr);
642  tprintf("Discarding choice \"%s\" with an overly low certainty"
643  " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
644  bad_string.string(), word_choice->certainty(),
645  best_choice->certainty(),
646  max_certainty_delta + best_choice->certainty());
647  }
648  delete word_choice;
649  return false;
650  }
651  }
652 
653  // Insert in the list in order of increasing rating, but knock out worse
654  // string duplicates.
655  WERD_CHOICE_IT it(&best_choices);
// new_str refers to word_choice's own string, so it must not be used after
// word_choice has been deleted.
656  const STRING& new_str = word_choice->unichar_string();
657  bool inserted = false;
// Counts the entries that are being kept, the new one included once it has
// been inserted.
658  int num_choices = 0;
659  if (!it.empty()) {
660  do {
661  WERD_CHOICE* choice = it.data();
662  if (choice->rating() > word_choice->rating() && !inserted) {
663  // Time to insert.
// The iterator stays on `choice`; the new entry goes in front of it.
664  it.add_before_stay_put(word_choice);
665  inserted = true;
666  if (num_choices == 0)
667  best_choice = word_choice; // This is the new best.
668  ++num_choices;
669  }
670  if (choice->unichar_string() == new_str) {
671  if (inserted) {
672  // New is better.
673  delete it.extract();
674  } else {
675  // Old is better.
676  if (debug) {
677  tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
678  new_str.string(), word_choice->rating(), choice->rating());
679  }
680  delete word_choice;
681  return false;
682  }
683  } else {
// A non-duplicate entry is kept only while the list is within the cap.
684  ++num_choices;
685  if (num_choices > max_num_choices)
686  delete it.extract();
687  }
688  it.forward();
689  } while (!it.at_first());
690  }
// Not placed in front of any existing entry: append if there is room left.
691  if (!inserted && num_choices < max_num_choices) {
692  it.add_to_end(word_choice);
693  inserted = true;
694  if (num_choices == 0)
695  best_choice = word_choice; // This is the new best.
696  }
// The debug print must stay ahead of the delete below, because it
// dereferences word_choice.
697  if (debug) {
698  if (inserted)
699  tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
700  else
701  tprintf("Poor");
702  word_choice->print(" Word Choice");
703  }
704  if (!inserted) {
705  delete word_choice;
706  return false;
707  }
708  return true;
709 }
710 
711 
// Simple helper moves the ownership of the pointer data from src to dest,
// first deleting anything in dest, and nulling out src afterwards.
//   dest  slot that receives ownership; whatever it owned before is deleted.
//   src   slot that gives ownership up; it is left holding nullptr.
// Aliasing is handled without freeing the object being moved:
//   - dest == src (same slot): nothing happens, the slot keeps its object.
//   - different slots holding the same object: the object is kept in dest
//     and src is nulled.
template<class T> static void MovePointerData(T** dest, T** src) {
  if (dest == src) {
    return;  // Moving a slot onto itself must not destroy its contents.
  }
  if (*dest != *src) {
    delete *dest;  // Deleting nullptr is a no-op, so an empty dest is fine.
  }
  *dest = *src;
  *src = nullptr;
}
719 
720 // Prints a brief list of all the best choices.
// NOTE(review): the signature line of this definition (listing line 721) is
// absent from this listing - confirm name and signature against upstream.
// best_choice is dereferenced in the final tprintf without a null check.
722  STRING alternates_str;
// The const_cast only serves to build the iterator; the loop reads the list.
723  WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
724  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
// Entries are joined with `", "`; the outer quotes come from the format
// string below.
725  if (!it.at_first()) alternates_str += "\", \"";
726  alternates_str += it.data()->unichar_string();
727  }
728  tprintf("Alternates for \"%s\": {\"%s\"}\n",
729  best_choice->unichar_string().string(), alternates_str.string());
730 }
731 
732 // Returns the sum of the widths of the blob between start_blob and last_blob
733 // inclusive.
734 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) {
735  int result = 0;
736  for (int b = start_blob; b <= last_blob; ++b) {
737  result += blob_widths[b];
738  if (b < last_blob)
739  result += blob_gaps[b];
740  }
741  return result;
742 }
743 // Returns the width of a gap between the specified blob and the next one.
744 int WERD_RES::GetBlobsGap(int blob_index) {
745  if (blob_index < 0 || blob_index >= blob_gaps.size())
746  return 0;
747  return blob_gaps[blob_index];
748 }
749 
750 // Returns the BLOB_CHOICE corresponding to the given index in the
751 // best choice word taken from the appropriate cell in the ratings MATRIX.
752 // Borrowed pointer, so do not delete. May return nullptr if there is no
753 // BLOB_CHOICE matching the unichar_id at the given index.
// NOTE(review): the signature line of this definition (listing line 754) is
// absent from this listing; the body uses an int-like `index` and returns a
// pointer - confirm the exact signature against upstream.
// An out-of-range index yields nullptr. best_choice itself is not checked.
755  if (index < 0 || index >= best_choice->length()) return nullptr;
756  BLOB_CHOICE_LIST* choices = GetBlobChoices(index);
757  return FindMatchingChoice(best_choice->unichar_id(index), choices);
758 }
759 
760 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
761 // best choice word taken from the appropriate cell in the ratings MATRIX.
762 // Borrowed pointer, so do not delete.
763 BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const {
764  return best_choice->blob_choices(index, ratings);
765 }
766 
767 // Moves the results fields from word to this. This takes ownership of all
768 // the data, so src can be destructed.
// NOTE(review): the signature line of this definition (listing line 769) is
// absent from this listing; the body takes results from a WERD_RES pointer
// named `word`, which hides the member of the same name - confirm upstream.
770  denorm = word->denorm;
771  blob_row = word->blob_row;
// MovePointerData deletes the current object, takes the source's pointer and
// nulls it in the source.
772  MovePointerData(&chopped_word, &word->chopped_word);
773  MovePointerData(&rebuild_word, &word->rebuild_word);
774  MovePointerData(&box_word, &word->box_word);
// The old seams are deleted, the source's seam pointers are copied over, and
// the source array is then cleared without deleting them.
775  seam_array.delete_data_pointers();
776  seam_array = word->seam_array;
777  word->seam_array.clear();
778  best_state.move(&word->best_state);
779  correct_text.move(&word->correct_text);
780  blob_widths.move(&word->blob_widths);
781  blob_gaps.move(&word->blob_gaps);
// The old matrix's cell contents are released before the matrix itself is
// replaced.
782  if (ratings != nullptr) ratings->delete_matrix_pointers();
783  MovePointerData(&ratings, &word->ratings);
// best_choice points at an element of word->best_choices, which is spliced
// into this object's list just below.
784  best_choice = word->best_choice;
785  MovePointerData(&raw_choice, &word->raw_choice);
786  best_choices.clear();
787  WERD_CHOICE_IT wc_it(&best_choices);
788  wc_it.add_list_after(&word->best_choices);
789  reject_map = word->reject_map;
// If the source has a blamer bundle, this object must already have one; that
// is enforced only by assert().
790  if (word->blamer_bundle != nullptr) {
791  assert(blamer_bundle != nullptr);
792  blamer_bundle->CopyResults(*(word->blamer_bundle));
793  }
794  CopySimpleFields(*word);
795 }
796 
797 // Replace the best choice and rebuild box word.
798 // choice must be from the current best_choices list.
// NOTE(review): the signature line of this definition (listing line 799) is
// absent from this listing; the body takes a pointer named `choice` -
// confirm the exact signature against upstream.
// The order matters: RebuildBestState() reads best_choice, and SetupBoxWord()
// reads the rebuild_word it produces.
800  best_choice = choice;
801  RebuildBestState();
802  SetupBoxWord();
803  // Make up a fake reject map of the right length to keep the
804  // rejection pass happy.
805  reject_map.initialise(best_state.length());
// The word is marked done, accepted and adaptable unconditionally.
806  done = tess_accepted = tess_would_adapt = true;
807  SetScriptPositions();
808 }
809 
810 // Builds the rebuild_word and sets the best_state from the chopped_word and
811 // the best_choice->state.
// NOTE(review): the signature line of this definition (listing line 812) is
// absent from this listing; presumably WERD_RES::RebuildBestState(), which
// is called with no arguments elsewhere in this file - confirm upstream.
813  ASSERT_HOST(best_choice != nullptr);
814  delete rebuild_word;
815  rebuild_word = new TWERD;
// The seam list is created on demand when it is still empty.
816  if (seam_array.empty())
817  start_seam_list(chopped_word, &seam_array);
818  best_state.truncate(0);
// `start` is the index of the first chopped blob of the current choice blob.
819  int start = 0;
820  for (int i = 0; i < best_choice->length(); ++i) {
821  int length = best_choice->state(i);
822  best_state.push_back(length);
// A multi-piece blob is joined, copied from blobs[start], then broken apart
// again, so chopped_word ends up back in pieces.
823  if (length > 1) {
824  SEAM::JoinPieces(seam_array, chopped_word->blobs, start,
825  start + length - 1);
826  }
827  TBLOB* blob = chopped_word->blobs[start];
828  rebuild_word->blobs.push_back(new TBLOB(*blob));
829  if (length > 1) {
830  SEAM::BreakPieces(seam_array, chopped_word->blobs, start,
831  start + length - 1);
832  }
833  start += length;
834  }
835 }
836 
837 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
838 // Also sets up the output box_word.
// NOTE(review): the signature line of this definition (listing line 839) is
// absent from this listing - confirm name and signature against upstream.
840  delete rebuild_word;
841  rebuild_word = new TWERD(*chopped_word);
842  SetupBoxWord();
843  int word_len = box_word->length();
// One state of length 1 and one empty text is appended per box.
// NOTE(review): neither vector is cleared first in the visible code, so the
// entries are added to whatever they already hold - presumably both are
// empty when this runs; confirm.
844  best_state.reserve(word_len);
845  correct_text.reserve(word_len);
846  for (int i = 0; i < word_len; ++i) {
847  best_state.push_back(1);
848  correct_text.push_back(STRING(""));
849  }
850 }
851 
852 // Sets/replaces the box_word with one made from the rebuild_word.
// NOTE(review): the signature line of this definition (listing line 853) is
// absent from this listing; presumably WERD_RES::SetupBoxWord(), which is
// called with no arguments elsewhere in this file - confirm upstream.
// rebuild_word is dereferenced without a null check.
854  delete box_word;
// The bounding boxes are recomputed before the copy is taken.
855  rebuild_word->ComputeBoundingBoxes();
856  box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word);
857  box_word->ClipToOriginalWord(denorm.block(), word);
858 }
859 
860 // Sets up the script positions in the output best_choice using the best_choice
861 // to get the unichars, and the unicharset to get the target positions.
863  best_choice->SetScriptPositions(small_caps, chopped_word);
864 }
865 // Sets all the blobs in all the words (raw choice and best choices) to be
866 // the given position. (When a sub/superscript is recognized as a separate
867 // word, it falls victim to the rule that a whole word cannot be sub or
868 // superscript, so this function overrides that problem.)
870  raw_choice->SetAllScriptPositions(position);
871  WERD_CHOICE_IT wc_it(&best_choices);
872  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
873  wc_it.data()->SetAllScriptPositions(position);
874 }
875 
876 // Classifies the word with some already-calculated BLOB_CHOICEs.
877 // The choices are an array of blob_count pointers to BLOB_CHOICE,
878 // providing a single classifier result for each blob.
879 // The BLOB_CHOICEs are consumed and the word takes ownership.
880 // The number of blobs in the box_word must match blob_count.
881 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
882  // Setup the WERD_RES.
883  ASSERT_HOST(box_word != nullptr);
884  ASSERT_HOST(blob_count == box_word->length());
885  ClearWordChoices();
886  ClearRatings();
887  ratings = new MATRIX(blob_count, 1);
888  for (int c = 0; c < blob_count; ++c) {
889  auto* choice_list = new BLOB_CHOICE_LIST;
890  BLOB_CHOICE_IT choice_it(choice_list);
891  choice_it.add_after_then_move(choices[c]);
892  ratings->put(c, c, choice_list);
893  }
894  FakeWordFromRatings(TOP_CHOICE_PERM);
895  reject_map.initialise(blob_count);
896  best_state.init_to_size(blob_count, 1);
897  done = true;
898 }
899 
900 // Creates a WERD_CHOICE for the word using the top choices from the leading
901 // diagonal of the ratings matrix.
903  int num_blobs = ratings->dimension();
904  auto* word_choice = new WERD_CHOICE(uch_set, num_blobs);
905  word_choice->set_permuter(permuter);
906  for (int b = 0; b < num_blobs; ++b) {
907  UNICHAR_ID unichar_id = UNICHAR_SPACE;
908  float rating = INT32_MAX;
909  float certainty = -INT32_MAX;
910  BLOB_CHOICE_LIST* choices = ratings->get(b, b);
911  if (choices != nullptr && !choices->empty()) {
912  BLOB_CHOICE_IT bc_it(choices);
913  BLOB_CHOICE* choice = bc_it.data();
914  unichar_id = choice->unichar_id();
915  rating = choice->rating();
916  certainty = choice->certainty();
917  }
918  word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
919  certainty);
920  }
921  LogNewRawChoice(word_choice);
922  // Ownership of word_choice taken by word here.
923  LogNewCookedChoice(1, false, word_choice);
924 }
925 
926 // Copies the best_choice strings to the correct_text for adaption/training.
928  correct_text.clear();
929  ASSERT_HOST(best_choice != nullptr);
930  for (int i = 0; i < best_choice->length(); ++i) {
931  UNICHAR_ID choice_id = best_choice->unichar_id(i);
932  const char* blob_choice = uch_set->id_to_unichar(choice_id);
933  correct_text.push_back(STRING(blob_choice));
934  }
935 }
936 
937 // Merges 2 adjacent blobs in the result if the permanent callback
938 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
939 // callback box_cb is nullptr or returns true, setting the merged blob
940 // result to the class returned from class_cb.
941 // Returns true if anything was merged.
945  ASSERT_HOST(best_choice->length() == 0 || ratings != nullptr);
946  bool modified = false;
947  for (int i = 0; i + 1 < best_choice->length(); ++i) {
948  UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i),
949  best_choice->unichar_id(i+1));
950  if (new_id != INVALID_UNICHAR_ID &&
951  (box_cb == nullptr || box_cb->Run(box_word->BlobBox(i),
952  box_word->BlobBox(i + 1)))) {
953  // Raw choice should not be fixed.
954  best_choice->set_unichar_id(new_id, i);
955  modified = true;
956  MergeAdjacentBlobs(i);
957  const MATRIX_COORD& coord = best_choice->MatrixCoord(i);
958  if (!coord.Valid(*ratings)) {
959  ratings->IncreaseBandSize(coord.row + 1 - coord.col);
960  }
961  BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i);
962  if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
963  // Insert a fake result.
964  auto* blob_choice = new BLOB_CHOICE;
965  blob_choice->set_unichar_id(new_id);
966  BLOB_CHOICE_IT bc_it(blob_choices);
967  bc_it.add_before_then_move(blob_choice);
968  }
969  }
970  }
971  delete class_cb;
972  delete box_cb;
973  return modified;
974 }
975 
976 // Merges 2 adjacent blobs in the result (index and index+1) and corrects
977 // all the data to account for the change.
979  if (reject_map.length() == best_choice->length())
980  reject_map.remove_pos(index);
981  best_choice->remove_unichar_id(index + 1);
982  rebuild_word->MergeBlobs(index, index + 2);
983  box_word->MergeBoxes(index, index + 2);
984  if (index + 1 < best_state.length()) {
985  best_state[index] += best_state[index + 1];
986  best_state.remove(index + 1);
987  }
988 }
989 
990 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
991 // training data.
992 
// Utility function for fix_quotes.
// Tests whether the character at signed_str, whose UTF-8 encoding is length
// bytes long, is a single-quote character. Returns non-zero for a quote and
// zero for anything else.
static int is_simple_quote(const char* signed_str, int length) {
  // Compare as unsigned bytes so that values >= 0x80 match the literals below.
  const auto* bytes = reinterpret_cast<const unsigned char*>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes: apostrophe and backtick.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 byte curved quotes: e2 80 98 and e2 80 99.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return 0;
}
1009 
1010 // Callback helper for fix_quotes returns a double quote if both
1011 // arguments are quote, otherwise INVALID_UNICHAR_ID.
1013  const char *ch = uch_set->id_to_unichar(id1);
1014  const char *next_ch = uch_set->id_to_unichar(id2);
1015  if (is_simple_quote(ch, strlen(ch)) &&
1016  is_simple_quote(next_ch, strlen(next_ch)))
1017  return uch_set->unichar_to_id("\"");
1018  return INVALID_UNICHAR_ID;
1019 }
1020 
1021 // Change pairs of quotes to double quotes.
1023  if (!uch_set->contains_unichar("\"") ||
1024  !uch_set->get_enabled(uch_set->unichar_to_id("\"")))
1025  return; // Don't create it if it is disallowed.
1026 
1027  ConditionalBlobMerge(
1029  nullptr);
1030 }
1031 
1032 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
1033 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
1035  const char *ch = uch_set->id_to_unichar(id1);
1036  const char *next_ch = uch_set->id_to_unichar(id2);
1037  if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
1038  (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
1039  return uch_set->unichar_to_id("-");
1040  return INVALID_UNICHAR_ID;
1041 }
1042 
1043 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
1044 // (assuming both on the same textline, are in order and a chopped em dash.)
1045 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
1046  return box1.right() >= box2.left();
1047 }
1048 
1049 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
1050 // Typically a long dash which has been segmented.
1052  if (!uch_set->contains_unichar("-") ||
1053  !uch_set->get_enabled(uch_set->unichar_to_id("-")))
1054  return; // Don't create it if it is disallowed.
1055 
1056  ConditionalBlobMerge(
1059 }
1060 
1061 // Callback helper for merge_tess_fails returns a space if both
1062 // arguments are space, otherwise INVALID_UNICHAR_ID.
1064  if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
1065  return id1;
1066  else
1067  return INVALID_UNICHAR_ID;
1068 }
1069 
1070 // Change pairs of tess failures to a single one
1072  if (ConditionalBlobMerge(
1073  NewPermanentTessCallback(this, &WERD_RES::BothSpaces), nullptr)) {
1074  int len = best_choice->length();
1075  ASSERT_HOST(reject_map.length() == len);
1076  ASSERT_HOST(box_word->length() == len);
1077  }
1078 }
1079 
1080 // Returns true if the collection of count pieces, starting at start, are all
1081 // natural connected components, ie there are no real chops involved.
1082 bool WERD_RES::PiecesAllNatural(int start, int count) const {
1083  // all seams must have no splits.
1084  for (int index = start; index < start + count - 1; ++index) {
1085  if (index >= 0 && index < seam_array.size()) {
1086  SEAM* seam = seam_array[index];
1087  if (seam != nullptr && seam->HasAnySplits()) return false;
1088  }
1089  }
1090  return true;
1091 }
1092 
1093 
1095  Clear();
1096 }
1097 
1099  tess_failed = false;
1100  tess_accepted = false;
1101  tess_would_adapt = false;
1102  done = false;
1103  unlv_crunch_mode = CR_NONE;
1104  small_caps = false;
1105  odd_size = false;
1106  italic = false;
1107  bold = false;
1108  // The fontinfos and tesseract count as non-pointers as they point to
1109  // data owned elsewhere.
1110  fontinfo = nullptr;
1111  fontinfo2 = nullptr;
1112  tesseract = nullptr;
1113  fontinfo_id_count = 0;
1114  fontinfo_id2_count = 0;
1115  x_height = 0.0;
1116  caps_height = 0.0;
1117  baseline_shift = 0.0f;
1118  space_certainty = 0.0f;
1119  guessed_x_ht = true;
1120  guessed_caps_ht = true;
1121  combination = false;
1122  part_of_combo = false;
1123  reject_spaces = false;
1124 }
1125 
1127  word = nullptr;
1128  bln_boxes = nullptr;
1129  blob_row = nullptr;
1130  uch_set = nullptr;
1131  chopped_word = nullptr;
1132  rebuild_word = nullptr;
1133  box_word = nullptr;
1134  ratings = nullptr;
1135  best_choice = nullptr;
1136  raw_choice = nullptr;
1137  ep_choice = nullptr;
1138  blamer_bundle = nullptr;
1139 }
1140 
1142  if (combination) {
1143  delete word;
1144  }
1145  word = nullptr;
1146  delete blamer_bundle;
1147  blamer_bundle = nullptr;
1148  ClearResults();
1149 }
1150 
1152  done = false;
1153  fontinfo = nullptr;
1154  fontinfo2 = nullptr;
1155  fontinfo_id_count = 0;
1156  fontinfo_id2_count = 0;
1157  delete bln_boxes;
1158  bln_boxes = nullptr;
1159  blob_row = nullptr;
1160  delete chopped_word;
1161  chopped_word = nullptr;
1162  delete rebuild_word;
1163  rebuild_word = nullptr;
1164  delete box_word;
1165  box_word = nullptr;
1166  best_state.clear();
1167  correct_text.clear();
1168  seam_array.delete_data_pointers();
1169  seam_array.clear();
1170  blob_widths.clear();
1171  blob_gaps.clear();
1172  ClearRatings();
1173  ClearWordChoices();
1174  if (blamer_bundle != nullptr) blamer_bundle->ClearResults();
1175 }
1177  best_choice = nullptr;
1178  delete raw_choice;
1179  raw_choice = nullptr;
1180  best_choices.clear();
1181  delete ep_choice;
1182  ep_choice = nullptr;
1183 }
1185  if (ratings != nullptr) {
1186  ratings->delete_matrix_pointers();
1187  delete ratings;
1188  ratings = nullptr;
1189  }
1190 }
1191 
1192 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
1193  ASSERT_HOST(page_res == other.page_res);
1194  if (other.block_res == nullptr) {
1195  // other points to the end of the page.
1196  if (block_res == nullptr)
1197  return 0;
1198  return -1;
1199  }
1200  if (block_res == nullptr) {
1201  return 1; // we point to the end of the page.
1202  }
1203  if (block_res == other.block_res) {
1204  if (other.row_res == nullptr || row_res == nullptr) {
1205  // this should only happen if we hit an image block.
1206  return 0;
1207  }
1208  if (row_res == other.row_res) {
1209  // we point to the same block and row.
1210  ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
1211  if (word_res == other.word_res) {
1212  // we point to the same word!
1213  return 0;
1214  }
1215 
1216  WERD_RES_IT word_res_it(&row_res->word_res_list);
1217  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1218  word_res_it.forward()) {
1219  if (word_res_it.data() == word_res) {
1220  return -1;
1221  } else if (word_res_it.data() == other.word_res) {
1222  return 1;
1223  }
1224  }
1225  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1226  }
1227 
1228  // we both point to the same block, but different rows.
1229  ROW_RES_IT row_res_it(&block_res->row_res_list);
1230  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1231  row_res_it.forward()) {
1232  if (row_res_it.data() == row_res) {
1233  return -1;
1234  } else if (row_res_it.data() == other.row_res) {
1235  return 1;
1236  }
1237  }
1238  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1239  }
1240 
1241  // We point to different blocks.
1242  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1243  for (block_res_it.mark_cycle_pt();
1244  !block_res_it.cycled_list(); block_res_it.forward()) {
1245  if (block_res_it.data() == block_res) {
1246  return -1;
1247  } else if (block_res_it.data() == other.block_res) {
1248  return 1;
1249  }
1250  }
1251  // Shouldn't happen...
1252  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1253  return 0;
1254 }
1255 
1256 // Inserts the new_word as a combination owned by a corresponding WERD_RES
1257 // before the current position. The simple fields of the WERD_RES are copied
1258 // from clone_res and the resulting WERD_RES is returned for further setup
1259 // with best_choice etc.
1261  WERD* new_word) {
1262  // Make a WERD_RES for the new_word.
1263  auto* new_res = new WERD_RES(new_word);
1264  new_res->CopySimpleFields(clone_res);
1265  new_res->combination = true;
1266  // Insert into the appropriate place in the ROW_RES.
1267  WERD_RES_IT wr_it(&row()->word_res_list);
1268  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1269  WERD_RES* word = wr_it.data();
1270  if (word == word_res)
1271  break;
1272  }
1273  ASSERT_HOST(!wr_it.cycled_list());
1274  wr_it.add_before_then_move(new_res);
1275  if (wr_it.at_first()) {
1276  // This is the new first word, so reset the member iterator so it
1277  // detects the cycled_list state correctly.
1278  ResetWordIterator();
1279  }
1280  return new_res;
1281 }
1282 
1283 // Helper computes the boundaries between blobs in the word. The blob bounds
1284 // are likely very poor, if they come from LSTM, where it only outputs the
1285 // character at one pixel within it, so we find the midpoints between them.
1286 static void ComputeBlobEnds(const WERD_RES& word, const TBOX& clip_box,
1287  C_BLOB_LIST* next_word_blobs,
1288  GenericVector<int>* blob_ends) {
1289  C_BLOB_IT blob_it(word.word->cblob_list());
1290  for (int i = 0; i < word.best_state.size(); ++i) {
1291  int length = word.best_state[i];
1292  // Get the bounding box of the fake blobs
1293  TBOX blob_box = blob_it.data()->bounding_box();
1294  blob_it.forward();
1295  for (int b = 1; b < length; ++b) {
1296  blob_box += blob_it.data()->bounding_box();
1297  blob_it.forward();
1298  }
1299  // This blob_box is crap, so for now we are only looking for the
1300  // boundaries between them.
1301  int blob_end = INT32_MAX;
1302  if (!blob_it.at_first() || next_word_blobs != nullptr) {
1303  if (blob_it.at_first())
1304  blob_it.set_to_list(next_word_blobs);
1305  blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
1306  }
1307  blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
1308  blob_ends->push_back(blob_end);
1309  }
1310  blob_ends->back() = clip_box.right();
1311 }
1312 
1313 // Helper computes the bounds of a word by restricting it to existing words
1314 // that significantly overlap.
1315 static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES>& words,
1316  int w_index, TBOX prev_box, WERD_RES_IT w_it) {
1317  constexpr int kSignificantOverlapFraction = 4;
1318  TBOX clipped_box;
1319  TBOX current_box = words[w_index]->word->bounding_box();
1320  TBOX next_box;
1321  if (w_index + 1 < words.size() && words[w_index + 1] != nullptr &&
1322  words[w_index + 1]->word != nullptr)
1323  next_box = words[w_index + 1]->word->bounding_box();
1324  for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;
1325  w_it.forward()) {
1326  if (w_it.data() == nullptr || w_it.data()->word == nullptr) continue;
1327  TBOX w_box = w_it.data()->word->bounding_box();
1328  int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);
1329  int width_limit = w_box.width() / kSignificantOverlapFraction;
1330  int min_significant_overlap = std::max(height_limit, width_limit);
1331  int overlap = w_box.intersection(current_box).width();
1332  int prev_overlap = w_box.intersection(prev_box).width();
1333  int next_overlap = w_box.intersection(next_box).width();
1334  if (overlap > min_significant_overlap) {
1335  if (prev_overlap > min_significant_overlap) {
1336  // We have no choice but to use the LSTM word edge.
1337  clipped_box.set_left(current_box.left());
1338  } else if (next_overlap > min_significant_overlap) {
1339  // We have no choice but to use the LSTM word edge.
1340  clipped_box.set_right(current_box.right());
1341  } else {
1342  clipped_box += w_box;
1343  }
1344  }
1345  }
1346  if (clipped_box.height() <= 0) {
1347  clipped_box.set_top(current_box.top());
1348  clipped_box.set_bottom(current_box.bottom());
1349  }
1350  if (clipped_box.width() <= 0) clipped_box = current_box;
1351  return clipped_box;
1352 }
1353 
1354 // Helper moves the blob from src to dest. If it isn't contained by clip_box,
1355 // the blob is replaced by a fake that is contained.
1356 static TBOX MoveAndClipBlob(C_BLOB_IT* src_it, C_BLOB_IT* dest_it,
1357  const TBOX& clip_box) {
1358  C_BLOB* src_blob = src_it->extract();
1359  TBOX box = src_blob->bounding_box();
1360  if (!clip_box.contains(box)) {
1361  int left =
1362  ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1);
1363  int right =
1364  ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right());
1365  int top =
1366  ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top());
1367  int bottom =
1368  ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1);
1369  box = TBOX(left, bottom, right, top);
1370  delete src_blob;
1371  src_blob = C_BLOB::FakeBlob(box);
1372  }
1373  dest_it->add_after_then_move(src_blob);
1374  return box;
1375 }
1376 
1377 // Replaces the current WERD/WERD_RES with the given words. The given words
1378 // contain fake blobs that indicate the position of the characters. These are
1379 // replaced with real blobs from the current word as much as possible.
1382  if (words->empty()) {
1383  DeleteCurrentWord();
1384  return;
1385  }
1386  WERD_RES* input_word = word();
1387  // Set the BOL/EOL flags on the words from the input word.
1388  if (input_word->word->flag(W_BOL)) {
1389  (*words)[0]->word->set_flag(W_BOL, true);
1390  } else {
1391  (*words)[0]->word->set_blanks(input_word->word->space());
1392  }
1393  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
1394 
1395  // Move the blobs from the input word to the new set of words.
1396  // If the input word_res is a combination, then the replacements will also be
1397  // combinations, and will own their own words. If the input word_res is not a
1398  // combination, then the final replacements will not be either, (although it
1399  // is allowed for the input words to be combinations) and their words
1400  // will get put on the row list. This maintains the ownership rules.
1401  WERD_IT w_it(row()->row->word_list());
1402  if (!input_word->combination) {
1403  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1404  WERD* word = w_it.data();
1405  if (word == input_word->word)
1406  break;
1407  }
1408  // w_it is now set to the input_word's word.
1409  ASSERT_HOST(!w_it.cycled_list());
1410  }
1411  // Insert into the appropriate place in the ROW_RES.
1412  WERD_RES_IT wr_it(&row()->word_res_list);
1413  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1414  WERD_RES* word = wr_it.data();
1415  if (word == input_word)
1416  break;
1417  }
1418  ASSERT_HOST(!wr_it.cycled_list());
1419  // Since we only have an estimate of the bounds between blobs, use the blob
1420  // x-middle as the determiner of where to put the blobs
1421  C_BLOB_IT src_b_it(input_word->word->cblob_list());
1422  src_b_it.sort(&C_BLOB::SortByXMiddle);
1423  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
1424  rej_b_it.sort(&C_BLOB::SortByXMiddle);
1425  TBOX clip_box;
1426  for (int w = 0; w < words->size(); ++w) {
1427  WERD_RES* word_w = (*words)[w];
1428  clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
1429  // Compute blob boundaries.
1430  GenericVector<int> blob_ends;
1431  C_BLOB_LIST* next_word_blobs =
1432  w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
1433  ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1434  // Remove the fake blobs on the current word, but keep safe for back-up if
1435  // no blob can be found.
1436  C_BLOB_LIST fake_blobs;
1437  C_BLOB_IT fake_b_it(&fake_blobs);
1438  fake_b_it.add_list_after(word_w->word->cblob_list());
1439  fake_b_it.move_to_first();
1440  word_w->word->cblob_list()->clear();
1441  C_BLOB_IT dest_it(word_w->word->cblob_list());
1442  // Build the box word as we move the blobs.
1443  auto* box_word = new tesseract::BoxWord;
1444  for (int i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
1445  int end_x = blob_ends[i];
1446  TBOX blob_box;
1447  // Add the blobs up to end_x.
1448  while (!src_b_it.empty() &&
1449  src_b_it.data()->bounding_box().x_middle() < end_x) {
1450  blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1451  src_b_it.forward();
1452  }
1453  while (!rej_b_it.empty() &&
1454  rej_b_it.data()->bounding_box().x_middle() < end_x) {
1455  blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1456  rej_b_it.forward();
1457  }
1458  if (blob_box.null_box()) {
1459  // Use the original box as a back-up.
1460  blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1461  }
1462  box_word->InsertBox(i, blob_box);
1463  }
1464  delete word_w->box_word;
1465  word_w->box_word = box_word;
1466  if (!input_word->combination) {
1467  // Insert word_w->word into the ROW. It doesn't own its word, so the
1468  // ROW needs to own it.
1469  w_it.add_before_stay_put(word_w->word);
1470  word_w->combination = false;
1471  }
1472  (*words)[w] = nullptr; // We are taking ownership.
1473  wr_it.add_before_stay_put(word_w);
1474  }
1475  // We have taken ownership of the words.
1476  words->clear();
1477  // Delete the current word, which has been replaced. We could just call
1478  // DeleteCurrentWord, but that would iterate both lists again, and we know
1479  // we are already in the right place.
1480  if (!input_word->combination)
1481  delete w_it.extract();
1482  delete wr_it.extract();
1483  ResetWordIterator();
1484 }
1485 
1486 // Deletes the current WERD_RES and its underlying WERD.
1488  // Check that this word is as we expect. part_of_combos are NEVER iterated
1489  // by the normal iterator, so we should never be trying to delete them.
1490  ASSERT_HOST(!word_res->part_of_combo);
1491  if (!word_res->combination) {
1492  // Combinations own their own word, so we won't find the word on the
1493  // row's word_list, but it is legitimate to try to delete them.
1494  // Delete word from the ROW when not a combination.
1495  WERD_IT w_it(row()->row->word_list());
1496  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1497  if (w_it.data() == word_res->word) {
1498  break;
1499  }
1500  }
1501  ASSERT_HOST(!w_it.cycled_list());
1502  delete w_it.extract();
1503  }
1504  // Remove the WERD_RES for the new_word.
1505  // Remove the WORD_RES from the ROW_RES.
1506  WERD_RES_IT wr_it(&row()->word_res_list);
1507  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1508  if (wr_it.data() == word_res) {
1509  word_res = nullptr;
1510  break;
1511  }
1512  }
1513  ASSERT_HOST(!wr_it.cycled_list());
1514  delete wr_it.extract();
1515  ResetWordIterator();
1516 }
1517 
1518 // Makes the current word a fuzzy space if not already fuzzy. Updates
1519 // corresponding part of combo if required.
1521  WERD* real_word = word_res->word;
1522  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1523  real_word->set_flag(W_FUZZY_SP, true);
1524  if (word_res->combination) {
1525  // The next word should be the corresponding part of combo, but we have
1526  // already stepped past it, so find it by search.
1527  WERD_RES_IT wr_it(&row()->word_res_list);
1528  for (wr_it.mark_cycle_pt();
1529  !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1530  }
1531  wr_it.forward();
1532  ASSERT_HOST(wr_it.data()->part_of_combo);
1533  real_word = wr_it.data()->word;
1534  ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1535  !real_word->flag(W_FUZZY_NON));
1536  real_word->set_flag(W_FUZZY_SP, true);
1537  }
1538  }
1539 }
1540 
1541 /*************************************************************************
1542  * PAGE_RES_IT::restart_page
1543  *
1544  * Set things up at the start of the page
1545  *************************************************************************/
1546 
1548  block_res_it.set_to_list(&page_res->block_res_list);
1549  block_res_it.mark_cycle_pt();
1550  prev_block_res = nullptr;
1551  prev_row_res = nullptr;
1552  prev_word_res = nullptr;
1553  block_res = nullptr;
1554  row_res = nullptr;
1555  word_res = nullptr;
1556  next_block_res = nullptr;
1557  next_row_res = nullptr;
1558  next_word_res = nullptr;
1559  internal_forward(true, empty_ok);
1560  return internal_forward(false, empty_ok);
1561 }
1562 
1563 // Recovers from operations on the current word, such as in InsertCloneWord
1564 // and DeleteCurrentWord.
1565 // Resets the word_res_it so that it is one past the next_word_res, as
1566 // it should be after internal_forward. If next_row_res != row_res,
1567 // then the next_word_res is in the next row, so there is no need to do
1568 // anything to word_res_it, but it is still a good idea to reset the pointers
1569 // word_res and prev_word_res, which are still in the current row.
1571  if (row_res == next_row_res) {
1572  // Reset the member iterator so it can move forward and detect the
1573  // cycled_list state correctly.
1574  word_res_it.move_to_first();
1575  for (word_res_it.mark_cycle_pt();
1576  !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1577  word_res_it.forward()) {
1578  if (!word_res_it.data()->part_of_combo) {
1579  if (prev_row_res == row_res) prev_word_res = word_res;
1580  word_res = word_res_it.data();
1581  }
1582  }
1583  ASSERT_HOST(!word_res_it.cycled_list());
1584  wr_it_of_next_word = word_res_it;
1585  word_res_it.forward();
1586  } else {
1587  // word_res_it is OK, but reset word_res and prev_word_res if needed.
1588  WERD_RES_IT wr_it(&row_res->word_res_list);
1589  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1590  if (!wr_it.data()->part_of_combo) {
1591  if (prev_row_res == row_res) prev_word_res = word_res;
1592  word_res = wr_it.data();
1593  }
1594  }
1595  }
1596 }
1597 
1598 /*************************************************************************
1599  * PAGE_RES_IT::internal_forward
1600  *
1601  * Find the next word on the page. If empty_ok is true, then non-text blocks
1602  * and text blocks with no text are visited as if they contain a single
1603  * imaginary word in a single imaginary row. (word() and row() both return nullptr
1604  * in such a block and the return value is nullptr.)
1605  * If empty_ok is false, the old behaviour is maintained. Each real word
1606  * is visited and empty and non-text blocks and rows are skipped.
1607  * new_block is used to initialize the iterators for a new block.
1608  * The iterator maintains pointers to block, row and word for the previous,
1609  * current and next words. These are correct, regardless of block/row
1610  * boundaries. nullptr values denote start and end of the page.
1611  *************************************************************************/
1612 
1613 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1614  bool new_row = false;
1615 
1616  prev_block_res = block_res;
1617  prev_row_res = row_res;
1618  prev_word_res = word_res;
1619  block_res = next_block_res;
1620  row_res = next_row_res;
1621  word_res = next_word_res;
1622  wr_it_of_current_word = wr_it_of_next_word;
1623  next_block_res = nullptr;
1624  next_row_res = nullptr;
1625  next_word_res = nullptr;
1626 
1627  while (!block_res_it.cycled_list()) {
1628  if (new_block) {
1629  new_block = false;
1630  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1631  row_res_it.mark_cycle_pt();
1632  if (row_res_it.empty() && empty_ok) {
1633  next_block_res = block_res_it.data();
1634  break;
1635  }
1636  new_row = true;
1637  }
1638  while (!row_res_it.cycled_list()) {
1639  if (new_row) {
1640  new_row = false;
1641  word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1642  word_res_it.mark_cycle_pt();
1643  }
1644  // Skip any part_of_combo words.
1645  while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
1646  word_res_it.forward();
1647  if (!word_res_it.cycled_list()) {
1648  next_block_res = block_res_it.data();
1649  next_row_res = row_res_it.data();
1650  next_word_res = word_res_it.data();
1651  wr_it_of_next_word = word_res_it;
1652  word_res_it.forward();
1653  goto foundword;
1654  }
1655  // end of row reached
1656  row_res_it.forward();
1657  new_row = true;
1658  }
1659  // end of block reached
1660  block_res_it.forward();
1661  new_block = true;
1662  }
1663  foundword:
1664  // Update prev_word_best_choice pointer.
1665  if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
1666  *page_res->prev_word_best_choice =
1667  (new_block || prev_word_res == nullptr) ? nullptr : prev_word_res->best_choice;
1668  }
1669  return word_res;
1670 }
1671 
1672 /*************************************************************************
1673  * PAGE_RES_IT::restart_row()
1674  *
1675  * Move to the beginning (leftmost word) of the current row.
1676  *************************************************************************/
1678  ROW_RES *row = this->row();
1679  if (!row) return nullptr;
1680  for (restart_page(); this->row() != row; forward()) {
1681  // pass
1682  }
1683  return word();
1684 }
1685 
1686 /*************************************************************************
1687  * PAGE_RES_IT::forward_paragraph
1688  *
1689  * Move to the beginning of the next paragraph, allowing empty blocks.
1690  *************************************************************************/
1691 
1693  while (block_res == next_block_res &&
1694  (next_row_res != nullptr && next_row_res->row != nullptr &&
1695  row_res->row->para() == next_row_res->row->para())) {
1696  internal_forward(false, true);
1697  }
1698  return internal_forward(false, true);
1699 }
1700 
1701 /*************************************************************************
1702  * PAGE_RES_IT::forward_block
1703  *
1704  * Move to the beginning of the next block, allowing empty blocks.
1705  *************************************************************************/
1706 
1708  while (block_res == next_block_res) {
1709  internal_forward(false, true);
1710  }
1711  return internal_forward(false, true);
1712 }
1713 
1715  int16_t chars_in_word;
1716  int16_t rejects_in_word = 0;
1717 
1718  chars_in_word = word_res->reject_map.length ();
1719  page_res->char_count += chars_in_word;
1720  block_res->char_count += chars_in_word;
1721  row_res->char_count += chars_in_word;
1722 
1723  rejects_in_word = word_res->reject_map.reject_count ();
1724 
1725  page_res->rej_count += rejects_in_word;
1726  block_res->rej_count += rejects_in_word;
1727  row_res->rej_count += rejects_in_word;
1728  if (chars_in_word == rejects_in_word)
1729  row_res->whole_word_rej_count += rejects_in_word;
1730 }
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1045
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
float baseline_shift
Definition: pageres.h:312
Definition: werd.h:56
float adjust_factor() const
Definition: ratngs.h:306
bool tess_failed
Definition: pageres.h:287
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1012
int default_sid() const
Definition: unicharset.h:894
int16_t top() const
Definition: rect.h:58
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:744
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:517
PermuterType
Definition: ratngs.h:242
void operator=(const ELIST_LINK &)
Definition: elst.h:99
float x_height() const
Definition: ocrrow.h:64
void ClearWordChoices()
Definition: pageres.cpp:1176
bool done
Definition: pageres.h:297
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:780
Special case latin for y. splitting.
Definition: werd.h:36
void set_top(int y)
Definition: rect.h:61
bool guessed_x_ht
Definition: pageres.h:307
Definition: rect.h:34
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:699
void set_right(int x)
Definition: rect.h:82
fuzzy nonspace
Definition: werd.h:40
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:503
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:356
TWERD * rebuild_word
Definition: pageres.h:259
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:799
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1260
Definition: blobs.h:397
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:306
WERD_RES * forward_block()
Definition: pageres.cpp:1707
void ResetWordIterator()
Definition: pageres.cpp:1570
float rating() const
Definition: ratngs.h:80
GenericVector< int > blob_widths
Definition: pageres.h:218
Definition: strngs.h:45
void SetupBoxWord()
Definition: pageres.cpp:853
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:754
POLY_BLOCK * poly_block() const
Definition: pdblock.h:56
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:210
GenericVector< STRING > correct_text
Definition: pageres.h:274
TBOX bounding_box() const
Definition: blobs.cpp:472
void add_str_int(const char *str, int number)
Definition: strngs.cpp:377
const STRING & unichar_string() const
Definition: ratngs.h:541
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:90
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1082
PAGE_RES()
Definition: pageres.h:101
start of line
Definition: werd.h:32
GenericVector< SEAM * > seam_array
Definition: pageres.h:216
ROW_RES()=default
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:188
static BoxWord * CopyFromNormalized(TWERD *tessword)
Definition: boxword.cpp:56
WERD_CHOICE_LIST best_choices
Definition: pageres.h:242
bool contains(const FCOORD pt) const
Definition: rect.h:333
WERD_CHOICE * ep_choice
Definition: pageres.h:285
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1063
Definition: blobs.h:263
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:902
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
bool combination
Definition: pageres.h:333
int8_t bold
Definition: pageres.h:301
TBOX bounding_box() const
Definition: stepblob.cpp:253
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:404
tesseract::BoxWord * bln_boxes
Definition: pageres.h:197
bool tess_accepted
Definition: pageres.h:295
void set_left(int x)
Definition: rect.h:75
tesseract::Tesseract * tesseract
Definition: pageres.h:281
repeated character
Definition: werd.h:38
int length() const
Definition: boxword.h:83
fuzzy space
Definition: werd.h:39
const double kMaxWordGapRatio
Definition: pageres.cpp:60
bool odd_size
Definition: pageres.h:299
bool tess_would_adapt
Definition: pageres.h:296
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:190
bool script_has_xheight() const
Definition: unicharset.h:904
end of line
Definition: werd.h:33
REJMAP reject_map
Definition: pageres.h:286
int16_t height() const
Definition: rect.h:108
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:484
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449
void print() const
Definition: ratngs.h:580
const UNICHARSET * uch_set
Definition: pageres.h:205
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:180
float certainty() const
Definition: ratngs.h:83
const FontInfo * fontinfo2
Definition: pageres.h:304
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:125
#define ELISTIZE(CLASSNAME)
Definition: elst.h:955
bool guessed_caps_ht
Definition: pageres.h:308
bool HasAnySplits() const
Definition: seam.h:61
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:388
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:608
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:309
int8_t fontinfo_id2_count
Definition: pageres.h:306
float rating() const
Definition: ratngs.h:327
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1692
float descenders() const
Definition: ocrrow.h:85
void fix_quotes()
Definition: pageres.cpp:1022
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:869
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:881
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1192
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:443
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX & > *box_cb)
Definition: pageres.cpp:942
bool small_caps
Definition: pageres.h:298
Definition: seam.h:38
const FontInfo * fontinfo
Definition: pageres.h:303
const char * string() const
Definition: strngs.cpp:194
white on black
Definition: werd.h:41
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:36
bool PrepareToInsertSeam(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int insert_index, bool modify)
Definition: seam.cpp:76
bool empty() const
Definition: genericvector.h:89
void fix_hyphens()
Definition: pageres.cpp:1051
BLOCK_RES()=default
tesseract::BoxWord * box_word
Definition: pageres.h:265
int push_back(T object)
int32_t char_count
Definition: pageres.h:78
void copy_on(WERD_RES *word_res)
Definition: pageres.h:661
bool StatesAllValid()
Definition: pageres.cpp:462
void InitPointers()
Definition: pageres.cpp:1126
PAGE_RES * page_res
Definition: pageres.h:678
int16_t width() const
Definition: rect.h:115
ROW * blob_row
Definition: pageres.h:199
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:191
WERD_RES * restart_row()
Definition: pageres.cpp:1677
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:253
Definition: matrix.h:578
int16_t right() const
Definition: rect.h:79
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:769
int16_t bottom() const
Definition: rect.h:65
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
uint8_t space()
Definition: werd.h:99
void start_seam_list(TWERD *word, GenericVector< SEAM * > *seam_array)
Definition: seam.cpp:263
TBOX bounding_box() const
Definition: werd.cpp:148
float certainty() const
Definition: ratngs.h:330
void BestChoiceToCorrectText()
Definition: pageres.cpp:927
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:565
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
void ClearResults()
Definition: pageres.cpp:1151
int8_t fontinfo_id_count
Definition: pageres.h:305
void Clear()
Definition: pageres.cpp:1141
void merge_tess_fails()
Definition: pageres.cpp:1071
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1547
int32_t rej_count
Definition: pageres.h:79
float body_size() const
Definition: ocrrow.h:73
WERD_CHOICE * best_choice
Definition: pageres.h:234
MATRIX * ratings
Definition: pageres.h:230
#define ASSERT_HOST(x)
Definition: errcode.h:88
int GetBlobsWidth(int start_blob, int last_blob)
Definition: pageres.cpp:734
void CloneChoppedToRebuild()
Definition: pageres.cpp:839
int16_t left() const
Definition: rect.h:72
const double kMaxWordSizeRatio
Definition: pageres.cpp:56
const double kMaxLineSizeRatio
Definition: pageres.cpp:58
WERD_LIST * word_list()
Definition: ocrrow.h:55
bool part_of_combo
Definition: pageres.h:334
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
TWERD * chopped_word
Definition: pageres.h:214
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1520
bool Valid(const MATRIX &m) const
Definition: matrix.h:618
float caps_height
Definition: pageres.h:311
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:347
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
int UNICHAR_ID
Definition: unichar.h:34
void Init()
Definition: pageres.h:93
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:281
void InitNonPointers()
Definition: pageres.cpp:1098
void RebuildBestState()
Definition: pageres.cpp:812
bool IsText() const
Definition: polyblk.h:49
GenericVector< int > best_state
Definition: pageres.h:270
Definition: ocrrow.h:36
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:117
GenericVector< int > blob_gaps
Definition: pageres.h:221
int size() const
Definition: genericvector.h:70
void SetupBlamerBundle()
Definition: pageres.cpp:397
WERD_CHOICE * raw_choice
Definition: pageres.h:239
int TotalOfStates() const
Definition: ratngs.cpp:711
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1034
int latin_sid() const
Definition: unicharset.h:886
virtual R Run(A1, A2)=0
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:978
void PrintBestChoices() const
Definition: pageres.cpp:721
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:84
x-height concept makes sense.
Definition: werd.h:35
float x_height
Definition: pageres.h:310
const int kWordrecMaxNumJoinChunks
Definition: pageres.cpp:53
void set_bottom(int y)
Definition: rect.h:68
bool null_box() const
Definition: rect.h:50
BLOCK_RES_LIST block_res_list
Definition: pageres.h:80
WERD * word
Definition: pageres.h:188
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
DENORM denorm
Definition: pageres.h:203
void SetScriptPositions()
Definition: pageres.cpp:862
bool IsAmbiguous()
Definition: pageres.cpp:456
int count(LIST var_list)
Definition: oldlist.cpp:96
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:763
void DeleteCurrentWord()
Definition: pageres.cpp:1487
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:422
int state(int index) const
Definition: ratngs.h:319
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:624
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:241
bool reject_spaces
Definition: pageres.h:335
int8_t italic
Definition: pageres.h:300
Definition: ocrblock.h:29
T & back() const
float ascenders() const
Definition: ocrrow.h:82
void rej_stat_word()
Definition: pageres.cpp:1714
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:145
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1380
void ClearRatings()
Definition: pageres.cpp:1184
BlamerBundle * blamer_bundle
Definition: pageres.h:245
C_BLOB_LIST * cblob_list()
Definition: werd.h:95