|
tesseract 3.04.01
|
Classes | |
| class | TessBaseAPI |
| class | TessResultRenderer |
| class | TessTextRenderer |
| class | TessHOcrRenderer |
| class | TessPDFRenderer |
| class | TessUnlvRenderer |
| class | TessBoxTextRenderer |
| class | TessOsdRenderer |
| class | CubeRecoContext |
| class | CubeClassifier |
| class | CubeTessClassifier |
| struct | DocQualCallbacks |
| class | EquationDetect |
| class | LTRResultIterator |
| class | ChoiceIterator |
| class | MutableIterator |
| class | PageIterator |
| struct | BlobData |
| class | UnicodeSpanSkipper |
| struct | Cluster |
| class | SimpleClusterer |
| struct | GeometricClassifierState |
| struct | Interval |
| class | RowInfo |
| struct | LineHypothesis |
| class | RowScratchRegisters |
| class | ParagraphTheory |
| class | ParagraphModelSmearer |
| class | ResultIterator |
| class | TesseractCubeCombiner |
| struct | TesseractStats |
| struct | WordData |
| class | Tesseract |
| class | ImageThresholder |
| class | BoxWord |
| class | CCStruct |
| class | DetLineFit |
| class | DPPoint |
| struct | ScoredFont |
| struct | FontSpacingInfo |
| struct | FontInfo |
| struct | FontSet |
| class | FontInfoTable |
| class | WordFeature |
| struct | FloatWordFeature |
| class | ImageData |
| class | DocumentData |
| class | DocumentCache |
| struct | ParamsTrainingHypothesis |
| class | ParamsTrainingBundle |
| class | UnicharIdArrayUtils |
| class | AmbigSpec |
| class | UnicharAmbigs |
| class | BitVector |
| class | CCUtilMutex |
| class | CCUtil |
| class | DoublePtr |
| class | GenericHeap |
| class | PointerVector |
| class | TRand |
| class | IndexMap |
| class | IndexMapBiDi |
| struct | KDPair |
| struct | KDPairInc |
| struct | KDPairDec |
| class | KDPtrPair |
| struct | KDPtrPairInc |
| struct | KDPtrPairDec |
| class | KDVector |
| class | ObjectCache |
| struct | ParamsVectors |
| class | ParamUtils |
| class | Param |
| class | IntParam |
| class | BoolParam |
| class | StringParam |
| class | DoubleParam |
| class | TFile |
| class | TessdataManager |
| class | Classify |
| class | ErrorCounter |
| class | IntFeatureDist |
| class | IntFeatureMap |
| class | IntFeatureSpace |
| class | ClassPruner |
| struct | ShapeDist |
| class | MasterTrainer |
| class | SampleIterator |
| class | ShapeClassifier |
| struct | UnicharRating |
| struct | ShapeRating |
| struct | ShapeQueueEntry |
| struct | UnicharAndFonts |
| class | Shape |
| class | ShapeTable |
| class | TessClassifier |
| class | TrainingSample |
| class | TrainingSampleSet |
| class | AltList |
| class | BeamSearch |
| class | Bmp8 |
| class | CachedFile |
| class | CharAltList |
| struct | Bigram |
| struct | CharBigram |
| struct | CharBigramTable |
| class | CharBigrams |
| class | CharSamp |
| class | CharSampEnum |
| class | CharSampSet |
| class | CharSet |
| class | CharClassifier |
| class | CharClassifierFactory |
| class | ConCompPt |
| class | ConComp |
| class | ConvNetCharClassifier |
| class | CubeLineObject |
| class | CubeLineSegmenter |
| class | CubeObject |
| class | CubeSearchObject |
| class | CubeTuningParams |
| class | CubeUtils |
| class | FeatureBase |
| class | FeatureBmp |
| class | FeatureChebyshev |
| class | FeatureHybrid |
| class | HybridNeuralNetCharClassifier |
| class | LangModEdge |
| class | LangModel |
| class | SearchColumn |
| class | SearchNode |
| class | SearchNodeHashTable |
| class | SearchObject |
| class | TessLangModEdge |
| class | TessLangModel |
| class | TuningParams |
| class | WordAltList |
| class | WordListLangModel |
| struct | PairSizeInfo |
| struct | FontPairSizeInfo |
| class | WordSizeModel |
| class | WordUnigrams |
| class | CUtil |
| struct | NodeChild |
| class | Dawg |
| struct | DawgPosition |
| class | DawgPositionVector |
| class | SquishedDawg |
| struct | DawgLoader |
| class | DawgCache |
| struct | DawgArgs |
| class | Dict |
| class | Trie |
| class | InputFileBuffer |
| class | NeuralNet |
| class | Neuron |
| struct | AlignedBlobParams |
| class | AlignedBlob |
| class | BaselineRow |
| class | BaselineBlock |
| class | BaselineDetect |
| class | GridBase |
| class | IntGrid |
| class | BBGrid |
| struct | PtrHash |
| class | GridSearch |
| class | TabEventHandler |
| class | BlobGrid |
| class | CCNonTextDetect |
| class | ColumnFinder |
| class | ColPartition |
| class | ColPartitionGrid |
| class | ColPartitionSet |
| class | PixelHistogram |
| class | ShiroRekhaSplitter |
| class | EquationDetectBase |
| class | ImageFind |
| class | LineFinder |
| class | StrokeWidth |
| class | TabFind |
| class | ColSegment |
| class | TableFinder |
| class | StructuredTable |
| class | TableRecognizer |
| class | TabConstraint |
| class | TabVector |
| class | TextlineProjection |
| class | WordWithBox |
| class | Textord |
| struct | BlockGroup |
| class | WorkingPartSet |
| class | BoxChar |
| struct | BoxCharPtrSort |
| class | File |
| class | InputBuffer |
| class | OutputBuffer |
| class | IcuErrorCode |
| class | LigatureTable |
| class | PangoFontInfo |
| class | FontUtils |
| class | StringRenderer |
| struct | SpacingProperties |
| struct | AssociateStats |
| class | AssociateUtils |
| class | LanguageModel |
| struct | LMConsistencyInfo |
| class | LMPainPoints |
| struct | LanguageModelDawgInfo |
| struct | LanguageModelNgramInfo |
| struct | ViterbiStateEntry |
| struct | LanguageModelState |
| Struct to store information maintained by various language model components. More... | |
| struct | BestChoiceBundle |
| Bundle together all the things pertaining to the best choice/state. More... | |
| class | ParamsModel |
| class | SegSearchPending |
| class | FRAGMENT |
| class | Wordrec |
Typedefs | |
| typedef int(Dict::* | DictFunc )(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const |
| typedef double(Dict::* | ProbabilityInContextFunc )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes) |
| typedef float(Dict::* | ParamsModelClassifyFunc )(const char *lang, void *path) |
| typedef void(Wordrec::* | FillLatticeFunc )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
| typedef TessCallback4< const UNICHARSET &, int, PageIterator *, Pix * > | TruthCallback |
| typedef GenericVectorEqEq < const ParagraphModel * > | SetOfModels |
| typedef void(Tesseract::* | WordRecognizer )(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) |
| typedef GenericVector < ParamsTrainingHypothesis > | ParamsTrainingHypothesisList |
| typedef GenericVector< UNICHAR_ID > | UnicharIdVector |
| typedef GenericVector < AmbigSpec_LIST * > | UnicharAmbigsVector |
| typedef bool(* | FileReader )(const STRING &filename, GenericVector< char > *data) |
| typedef bool(* | FileWriter )(const GenericVector< char > &data, const STRING &filename) |
| typedef KDPairInc< int, int > | IntKDPair |
| typedef GenericHeap < ShapeQueueEntry > | ShapeQueue |
| typedef signed int | char_32 |
| typedef basic_string< char_32 > | string_32 |
| typedef GenericVector< NodeChild > | NodeChildVector |
| typedef GenericVector< int > | SuccessorList |
| typedef GenericVector < SuccessorList * > | SuccessorListsVector |
| typedef GenericVector< Dawg * > | DawgVector |
| typedef GridSearch< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT > | BlobGridSearch |
| typedef GridSearch < ColPartition, ColPartition_CLIST, ColPartition_C_IT > | ColPartitionGridSearch |
| typedef GenericVector < ColPartitionSet * > | PartSetVector |
| typedef TessResultCallback1 < bool, int > | WidthCallback |
| typedef BBGrid< ColSegment, ColSegment_CLIST, ColSegment_C_IT > | ColSegmentGrid |
| typedef GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > | ColSegmentGridSearch |
| typedef BBGrid< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > | WordGrid |
| typedef GridSearch < WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > | WordSearch |
| typedef hash_map< string, string, StringHash > | LigHash |
| typedef GenericHeap < MatrixCoordPair > | PainPointHeap |
| typedef unsigned char | LanguageModelFlagsType |
| Used for expressing various language model flags. | |
Enumerations | |
| enum | LineType { LT_START = 'S', LT_BODY = 'C', LT_UNKNOWN = 'U', LT_MULTIPLE = 'M' } |
| enum | CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, ACTION_2_CMD_EVENT } |
| enum | NormalizationMode { NM_BASELINE = -3, NM_CHAR_ISOTROPIC = -2, NM_CHAR_ANISOTROPIC = -1 } |
| enum | kParamsTrainingFeatureType { PTRAIN_DIGITS_SHORT, PTRAIN_DIGITS_MED, PTRAIN_DIGITS_LONG, PTRAIN_NUM_SHORT, PTRAIN_NUM_MED, PTRAIN_NUM_LONG, PTRAIN_DOC_SHORT, PTRAIN_DOC_MED, PTRAIN_DOC_LONG, PTRAIN_DICT_SHORT, PTRAIN_DICT_MED, PTRAIN_DICT_LONG, PTRAIN_FREQ_SHORT, PTRAIN_FREQ_MED, PTRAIN_FREQ_LONG, PTRAIN_SHAPE_COST_PER_CHAR, PTRAIN_NGRAM_COST_PER_CHAR, PTRAIN_NUM_BAD_PUNC, PTRAIN_NUM_BAD_CASE, PTRAIN_XHEIGHT_CONSISTENCY, PTRAIN_NUM_BAD_CHAR_TYPE, PTRAIN_NUM_BAD_SPACING, PTRAIN_NUM_BAD_FONT, PTRAIN_RATING_PER_CHAR, PTRAIN_NUM_FEATURE_TYPES } |
| enum | Orientation { ORIENTATION_PAGE_UP = 0, ORIENTATION_PAGE_RIGHT = 1, ORIENTATION_PAGE_DOWN = 2, ORIENTATION_PAGE_LEFT = 3 } |
| enum | WritingDirection { WRITING_DIRECTION_LEFT_TO_RIGHT = 0, WRITING_DIRECTION_RIGHT_TO_LEFT = 1, WRITING_DIRECTION_TOP_TO_BOTTOM = 2 } |
| enum | TextlineOrder { TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, TEXTLINE_ORDER_TOP_TO_BOTTOM = 2 } |
| enum | PageSegMode { PSM_OSD_ONLY, PSM_AUTO_OSD, PSM_AUTO_ONLY, PSM_AUTO, PSM_SINGLE_COLUMN, PSM_SINGLE_BLOCK_VERT_TEXT, PSM_SINGLE_BLOCK, PSM_SINGLE_LINE, PSM_SINGLE_WORD, PSM_CIRCLE_WORD, PSM_SINGLE_CHAR, PSM_SPARSE_TEXT, PSM_SPARSE_TEXT_OSD, PSM_RAW_LINE, PSM_COUNT } |
| enum | PageIteratorLevel { RIL_BLOCK, RIL_PARA, RIL_TEXTLINE, RIL_WORD, RIL_SYMBOL } |
| enum | ParagraphJustification { JUSTIFICATION_UNKNOWN, JUSTIFICATION_LEFT, JUSTIFICATION_CENTER, JUSTIFICATION_RIGHT } |
| enum | OcrEngineMode { OEM_TESSERACT_ONLY, OEM_CUBE_ONLY, OEM_TESSERACT_CUBE_COMBINED, OEM_DEFAULT } |
| enum | ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP } |
| enum | AmbigType { NOT_AMBIG, REPLACE_AMBIG, DEFINITE_AMBIG, SIMILAR_AMBIG, CASE_AMBIG, AMBIG_TYPE_COUNT } |
| enum | SetParamConstraint { SET_PARAM_CONSTRAINT_NONE, SET_PARAM_CONSTRAINT_DEBUG_ONLY, SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY, SET_PARAM_CONSTRAINT_NON_INIT_ONLY } |
| enum | TessdataType { TESSDATA_LANG_CONFIG, TESSDATA_UNICHARSET, TESSDATA_AMBIGS, TESSDATA_INTTEMP, TESSDATA_PFFMTABLE, TESSDATA_NORMPROTO, TESSDATA_PUNC_DAWG, TESSDATA_SYSTEM_DAWG, TESSDATA_NUMBER_DAWG, TESSDATA_FREQ_DAWG, TESSDATA_FIXED_LENGTH_DAWGS, TESSDATA_CUBE_UNICHARSET, TESSDATA_CUBE_SYSTEM_DAWG, TESSDATA_SHAPE_TABLE, TESSDATA_BIGRAM_DAWG, TESSDATA_UNAMBIG_DAWG, TESSDATA_PARAMS_MODEL, TESSDATA_NUM_ENTRIES } |
| enum | CharSegmentationType { CST_FRAGMENT, CST_WHOLE, CST_IMPROPER, CST_NGRAM } |
| enum | CountTypes { CT_UNICHAR_TOP_OK, CT_UNICHAR_TOP1_ERR, CT_UNICHAR_TOP2_ERR, CT_UNICHAR_TOPN_ERR, CT_UNICHAR_TOPTOP_ERR, CT_OK_MULTI_UNICHAR, CT_OK_JOINED, CT_OK_BROKEN, CT_REJECT, CT_FONT_ATTR_ERR, CT_OK_MULTI_FONT, CT_NUM_RESULTS, CT_RANK, CT_REJECTED_JUNK, CT_ACCEPTED_JUNK, CT_SIZE } |
| enum | DawgType { DAWG_TYPE_PUNCTUATION, DAWG_TYPE_WORD, DAWG_TYPE_NUMBER, DAWG_TYPE_PATTERN, DAWG_TYPE_COUNT } |
| enum | XHeightConsistencyEnum { XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT } |
| enum | ColumnSpanningType { CST_NOISE, CST_FLOWING, CST_HEADING, CST_PULLOUT, CST_COUNT } |
| enum | NeighbourPartitionType { NPT_HTEXT, NPT_VTEXT, NPT_WEAK_HTEXT, NPT_WEAK_VTEXT, NPT_IMAGE, NPT_COUNT } |
| enum | LeftOrRight { LR_LEFT, LR_RIGHT } |
| enum | PartitionFindResult { PFR_OK, PFR_SKEW, PFR_NOISE } |
| enum | ColSegType { COL_UNKNOWN, COL_TEXT, COL_TABLE, COL_MIXED, COL_COUNT } |
| enum | TabAlignment { TA_LEFT_ALIGNED, TA_LEFT_RAGGED, TA_CENTER_JUSTIFIED, TA_RIGHT_ALIGNED, TA_RIGHT_RAGGED, TA_SEPARATOR, TA_COUNT } |
| enum | LMPainPointsType { LM_PPTYPE_BLAMER, LM_PPTYPE_AMBIG, LM_PPTYPE_PATH, LM_PPTYPE_SHAPE, LM_PPTYPE_NUM } |
Functions | |
| STRING | HOcrEscape (const char *text) |
| double | prec (double x) |
| long | dist2 (int x1, int y1, int x2, int y2) |
| void | GetWordBaseline (int writing_direction, int ppi, int height, int word_x1, int word_y1, int word_x2, int word_y2, int line_x1, int line_y1, int line_x2, int line_y2, double *x0, double *y0, double *length) |
| void | AffineMatrix (int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2, double *a, double *b, double *c, double *d) |
| void | ClipBaseline (int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1, int *line_x2, int *line_y2) |
| bool | IsTextOrEquationType (PolyBlockType type) |
| bool | IsLeftIndented (const EquationDetect::IndentType type) |
| bool | IsRightIndented (const EquationDetect::IndentType type) |
| STRING | RtlEmbed (const STRING &word, bool rtlify) |
| bool | IsLatinLetter (int ch) |
| bool | IsDigitLike (int ch) |
| bool | IsOpeningPunct (int ch) |
| bool | IsTerminalPunct (int ch) |
| const char * | SkipChars (const char *str, const char *toskip) |
| const char * | SkipChars (const char *str, bool(*skip)(int)) |
| const char * | SkipOne (const char *str, const char *toskip) |
| bool | LikelyListNumeral (const STRING &word) |
| bool | LikelyListMark (const STRING &word) |
| bool | AsciiLikelyListItem (const STRING &word) |
| int | UnicodeFor (const UNICHARSET *u, const WERD_CHOICE *werd, int pos) |
| bool | LikelyListMarkUnicode (int ch) |
| bool | UniLikelyListItem (const UNICHARSET *u, const WERD_CHOICE *werd) |
| void | LeftWordAttributes (const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea) |
| void | RightWordAttributes (const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea) |
| int | ClosestCluster (const GenericVector< Cluster > &clusters, int value) |
| void | CalculateTabStops (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, int tolerance, GenericVector< Cluster > *left_tabs, GenericVector< Cluster > *right_tabs) |
| void | MarkRowsWithModel (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold) |
| void | GeometricClassifyThreeTabStopTextBlock (int debug_level, GeometricClassifierState &s, ParagraphTheory *theory) |
| void | GeometricClassify (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory) |
| bool | ValidFirstLine (const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model) |
| bool | ValidBodyLine (const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model) |
| bool | CrownCompatible (const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model) |
| void | DiscardUnusedModels (const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory) |
| void | DowngradeWeakestToCrowns (int debug_level, ParagraphTheory *theory, GenericVector< RowScratchRegisters > *rows) |
| void | RecomputeMarginsAndClearHypotheses (GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile) |
| int | InterwordSpace (const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end) |
| bool | FirstWordWouldHaveFit (const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification) |
| bool | FirstWordWouldHaveFit (const RowScratchRegisters &before, const RowScratchRegisters &after) |
| bool | TextSupportsBreak (const RowScratchRegisters &before, const RowScratchRegisters &after) |
| bool | LikelyParagraphStart (const RowScratchRegisters &before, const RowScratchRegisters &after) |
| bool | LikelyParagraphStart (const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification j) |
| ParagraphModel | InternalParagraphModelByOutline (const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent) |
| ParagraphModel | ParagraphModelByOutline (int debug_level, const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance) |
| bool | RowsFitModel (const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model) |
| void | MarkStrongEvidence (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end) |
| void | ModelStrongEvidence (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory) |
| void | StrongEvidenceClassify (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory) |
| void | SeparateSimpleLeaderLines (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory) |
| void | ConvertHypothesizedModelRunsToParagraphs (int debug_level, const GenericVector< RowScratchRegisters > &rows, GenericVector< PARA * > *row_owners, ParagraphTheory *theory) |
| bool | RowIsStranded (const GenericVector< RowScratchRegisters > &rows, int row) |
| void | LeftoverSegments (const GenericVector< RowScratchRegisters > &rows, GenericVector< Interval > *to_fix, int row_start, int row_end) |
| void | CanonicalizeDetectionResults (GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs) |
| void | DetectParagraphs (int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models) |
| void | InitializeTextAndBoxesPreRecognition (const MutableIterator &it, RowInfo *info) |
| void | InitializeRowInfo (bool after_recognition, const MutableIterator &it, RowInfo *info) |
| void | DetectParagraphs (int debug_level, bool after_text_recognition, const MutableIterator *block_start, GenericVector< ParagraphModel * > *models) |
| bool | StrongModel (const ParagraphModel *model) |
| bool | read_t (PAGE_RES_IT *page_res_it, TBOX *tbox) |
| void | YOutlierPieces (WERD_RES *word, int rebuilt_blob_index, int super_y_bottom, int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers, ScriptPos *trailing_pos, int *num_trailing_outliers) |
| bool | CompareFontInfo (const FontInfo &fi1, const FontInfo &fi2) |
| bool | CompareFontSet (const FontSet &fs1, const FontSet &fs2) |
| void | FontInfoDeleteCallback (FontInfo f) |
| void | FontSetDeleteCallback (FontSet fs) |
| bool | read_info (FILE *f, FontInfo *fi, bool swap) |
| bool | write_info (FILE *f, const FontInfo &fi) |
| bool | read_spacing_info (FILE *f, FontInfo *fi, bool swap) |
| bool | write_spacing_info (FILE *f, const FontInfo &fi) |
| bool | read_set (FILE *f, FontSet *fs, bool swap) |
| bool | write_set (FILE *f, const FontSet &fs) |
| int | OtsuThreshold (Pix *src_pix, int left, int top, int width, int height, int **thresholds, int **hi_values) |
| void | HistogramRect (Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram) |
| int | OtsuStats (const int *histogram, int *H_out, int *omega0_out) |
| int | ParamsTrainingFeatureByName (const char *name) |
| bool | PSM_OSD_ENABLED (int pageseg_mode) |
| bool | PSM_ORIENTATION_ENABLED (int pageseg_mode) |
| bool | PSM_COL_FIND_ENABLED (int pageseg_mode) |
| bool | PSM_SPARSE (int pageseg_mode) |
| bool | PSM_BLOCK_FIND_ENABLED (int pageseg_mode) |
| bool | PSM_LINE_FIND_ENABLED (int pageseg_mode) |
| bool | PSM_WORD_FIND_ENABLED (int pageseg_mode) |
| const char * | ScriptPosToString (enum ScriptPos script_pos) |
| ELISTIZE (AmbigSpec) | |
| ELISTIZEH (AmbigSpec) | |
| bool | LoadDataFromFile (const STRING &filename, GenericVector< char > *data) |
| bool | SaveDataToFile (const GenericVector< char > &data, const STRING &filename) |
| template<typename T > | |
| bool | cmp_eq (T const &t1, T const &t2) |
| template<typename T > | |
| int | sort_cmp (const void *t1, const void *t2) |
| template<typename T > | |
| int | sort_ptr_cmp (const void *t1, const void *t2) |
| void | ExtractFontName (const STRING &filename, STRING *fontname) |
| TrainingSample * | BlobToTrainingSample (const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features) |
| uinT8 | NormalizeDirection (uinT8 dir, const FCOORD &unnormed_pos, const DENORM &denorm, const DENORM *root_denorm) |
| void | ClearFeatureSpaceWindow (NORM_METHOD norm_method, ScrollView *window) |
| void | CallWithUTF8 (TessCallback1< const char * > *cb, const WERD_CHOICE *wc) |
| Pix * | GridReducedPix (const TBOX &box, int gridsize, ICOORD bleft, int *left, int *bottom) |
| Pix * | TraceOutlineOnReducedPix (C_OUTLINE *outline, int gridsize, ICOORD bleft, int *left, int *bottom) |
| Pix * | TraceBlockOnReducedPix (BLOCK *block, int gridsize, ICOORD bleft, int *left, int *bottom) |
| template<class BBC > | |
| int | SortByBoxLeft (const void *void1, const void *void2) |
| template<class BBC > | |
| int | SortRightToLeft (const void *void1, const void *void2) |
| template<class BBC > | |
| int | SortByBoxBottom (const void *void1, const void *void2) |
| template<typename T > | |
| void | DeleteObject (T *object) |
| void | SetBlobStrokeWidth (Pix *pix, BLOBNBOX *blob) |
| void | assign_blobs_to_blocks2 (Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks) |
| void | ParseCommandLineFlags (const char *usage, int *argc, char ***argv, const bool remove_flags) |
| ShapeTable * | LoadShapeTable (const STRING &file_prefix) |
| void | WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table) |
| MasterTrainer * | LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix) |
| Pix * | DegradeImage (Pix *input, int exposure, TRand *randomizer, float *rotation) |
| void | UTF8ToUTF32 (const char *utf8_str, GenericVector< char32 > *str32) |
| void | UTF32ToUTF8 (const GenericVector< char32 > &str32, STRING *utf8_str) |
| bool | is_hyphen_punc (const char32 ch) |
| bool | is_single_quote (const char32 ch) |
| bool | is_double_quote (const char32 ch) |
| STRING | NormalizeUTF8String (const char *str8) |
| void | NormalizeChar32 (char32 ch, GenericVector< char32 > *str) |
| char32 | OCRNormalize (char32 ch) |
| bool | IsOCREquivalent (char32 ch1, char32 ch2) |
| bool | IsValidCodepoint (const char32 ch) |
| bool | IsWhitespace (const char32 ch) |
| bool | IsUTF8Whitespace (const char *text) |
| int | SpanUTF8Whitespace (const char *text) |
| int | SpanUTF8NotWhitespace (const char *text) |
| bool | IsInterchangeValid (const char32 ch) |
| bool | IsInterchangeValid7BitAscii (const char32 ch) |
| char32 | FullwidthToHalfwidth (const char32 ch) |
| Pix * | CairoARGB32ToPixFormat (cairo_surface_t *surface) |
| void | ExtractFontProperties (const string &utf8_text, StringRenderer *render, const string &output_base) |
| bool | MakeIndividualGlyphs (Pix *pix, const vector< BoxChar * > &vbox, const int input_tiff_page) |
| void | SetupBasicProperties (bool report_errors, UNICHARSET *unicharset) |
| void | SetPropertiesForInputFile (const string &script_dir, const string &input_unicharset_file, const string &output_unicharset_file, const string &output_xheights_file) |
| ELISTIZE (ViterbiStateEntry) | |
| ELISTIZEH (ViterbiStateEntry) | |
| template<class BLOB_CHOICE > | |
| int | SortByUnicharID (const void *void1, const void *void2) |
| template<class BLOB_CHOICE > | |
| int | SortByRating (const void *void1, const void *void2) |
convert_prob_to_tess_certainty | |
Normalize a probability in the range [0.0, 1.0] to a tesseract certainty in the range [-20.0, 0.0] | |
char_box_to_tbox | |
Create a TBOX from a character bounding box. If nonzero, the x_offset accounts for any additional padding of the word box that should be taken into account. | |
| TBOX | char_box_to_tbox (Box *char_box, TBOX word_box, int x_offset) |
Variables | |
| const int | kBasicBufSize = 2048 |
| const int | kCharWidth = 2 |
| const float | kMathDigitDensityTh1 = 0.25 |
| const float | kMathDigitDensityTh2 = 0.1 |
| const float | kMathItalicDensityTh = 0.5 |
| const float | kUnclearDensityTh = 0.25 |
| const int | kSeedBlobsCountTh = 10 |
| const int | kLeftIndentAlignmentCountTh = 1 |
| const int | kMaxCharTopRange = 48 |
| const int | kMinCredibleResolution = 70 |
| Minimum believable resolution. | |
| const int | kDefaultResolution = 300 |
| Default resolution used if input is not believable. | |
| const int | kMaxCircleErosions = 8 |
| const ParagraphModel * | kCrownLeft = reinterpret_cast<ParagraphModel *>(0xDEAD111F) |
| const ParagraphModel * | kCrownRight = reinterpret_cast<ParagraphModel *>(0xDEAD888F) |
| const inT16 | kMaxBoxEdgeDiff = 2 |
| const int | kBoxClipTolerance = 2 |
| const int | kNumEndPoints = 3 |
| const int | kMinPointsForErrorCount = 16 |
| const int | kMaxRealDistance = 2.0 |
| const int | kFeaturePadding = 2 |
| const int | kImagePadding = 4 |
| const int | kNumPagesPerMiniBatch = 100 |
| const int | kHistogramSize = 256 |
| const int | kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) |
| CCUtilMutex | tprintfMutex |
| const char * | kUTF8LineSeparator = "\u2028" |
| const char * | kUTF8ParagraphSeparator = "\u2029" |
| const char * | kLRM = "\u200E" |
| const char * | kRLM = "\u200F" |
| const char * | kRLE = "\u202A" |
| const char * | kPDF = "\u202C" |
| const char * | kHyphenLikeUTF8 [] |
| const char * | kApostropheLikeUTF8 [] |
| const char | kUniversalAmbigsFile [] |
| const int | ksizeofUniversalAmbigsFile = sizeof(kUniversalAmbigsFile) |
| const double | kRatingEpsilon = 1.0 / 32 |
| const int | kMaxOffsetDist = 32 |
| const double | kMinPCLengthIncrease = 1.0 / 1024 |
| const int | kMinClusteredShapes = 1 |
| const int | kMaxUnicharsPerCluster = 2000 |
| const float | kFontMergeDistance = 0.025 |
| const float | kInfiniteDist = 999.0f |
| const int | kRandomizingCenter = 128 |
| const int | kTestChar = -1 |
| const int | kSquareLimit = 25 |
| const int | kPrime1 = 17 |
| const int | kPrime2 = 13 |
| const int | kMinOutlierSamples = 5 |
| const int | kStateCnt = 4 |
| const int | kNumLiteralCnt = 5 |
| const int | case_state_table [6][4] |
| const char | kDoNotReverse [] = "RRP_DO_NO_REVERSE" |
| const char | kReverseIfHasRTL [] = "RRP_REVERSE_IF_HAS_RTL" |
| const char | kForceReverse [] = "RRP_FORCE_REVERSE" |
| const char *const | RTLReversePolicyNames [] |
| const double | kAlignedFraction = 0.03125 |
| const double | kRaggedFraction = 2.5 |
| const double | kAlignedGapFraction = 0.75 |
| const double | kRaggedGapFraction = 1.0 |
| const int | kVLineAlignment = 3 |
| const int | kVLineGutter = 1 |
| const int | kVLineSearchSize = 150 |
| const int | kMinRaggedTabs = 5 |
| const int | kMinAlignedTabs = 4 |
| const int | kVLineMinLength = 500 |
| const double | kMinTabGradient = 4.0 |
| const int | kMaxSkewFactor = 15 |
| const char * | kTextordDebugPix = "psdebug_pix" |
| const double | kMaxSmallNeighboursPerPix = 1.0 / 32 |
| const int | kMaxLargeOverlapsWithSmall = 3 |
| const int | kMaxMediumOverlapsWithSmall = 12 |
| const int | kMaxLargeOverlapsWithMedium = 12 |
| const int | kOriginalNoiseMultiple = 8 |
| const int | kNoisePadding = 4 |
| const double | kPhotoOffsetFraction = 0.375 |
| const double | kMinGoodTextPARatio = 1.5 |
| const int | kMinColumnWidth = 100 |
| const int | kMaxIncompatibleColumnCount = 2 |
| const double | kMarginOverlapFraction = 0.25 |
| const double | kHorizontalGapMergeFraction = 0.5 |
| const double | kMinNonNoiseFraction = 0.5 |
| const double | kMinGutterWidthGrid = 0.5 |
| const double | kMaxDistToPartSizeRatio = 1.5 |
| bool | textord_tabfind_show_initial_partitions = false |
| bool | textord_tabfind_show_reject_blobs = false |
| int | textord_tabfind_show_partitions = 0 |
| bool | textord_tabfind_show_columns = false |
| bool | textord_tabfind_show_blocks = false |
| bool | textord_tabfind_find_tables = true |
| const double | kMaxSpacingDrift = 1.0 / 72 |
| const double | kMaxTopSpacingFraction = 0.25 |
| const double | kMaxSameBlockLineSpacing = 3 |
| const double | kMaxSizeRatio = 1.5 |
| const double | kMaxLeaderGapFractionOfMax = 0.25 |
| const double | kMaxLeaderGapFractionOfMin = 0.5 |
| const int | kMinLeaderCount = 5 |
| const int | kMinStrongTextValue = 6 |
| const int | kMinChainTextValue = 3 |
| const int | kHorzStrongTextlineCount = 8 |
| const int | kHorzStrongTextlineHeight = 10 |
| const int | kHorzStrongTextlineAspect = 5 |
| const double | kMaxBaselineError = 0.4375 |
| const double | kMinBaselineCoverage = 0.5 |
| const int | kMaxRMSColorNoise = 128 |
| const int | kMaxColorDistance = 900 |
| const int | kRGBRMSColors = 4 |
| bool | textord_tabfind_show_color_fit = false |
| const int | kMaxPadFactor = 6 |
| const int | kMaxNeighbourDistFactor = 4 |
| const int | kMaxCaptionLines = 7 |
| const double | kMinCaptionGapRatio = 2.0 |
| const double | kMinCaptionGapHeightRatio = 0.5 |
| const double | kBigPartSizeRatio = 1.75 |
| const double | kTinyEnoughTextlineOverlapFraction = 0.25 |
| const double | kMaxPartitionSpacing = 1.75 |
| const int | kSmoothDecisionMargin = 4 |
| const double | kMinRectangularFraction = 0.125 |
| const double | kMaxRectangularFraction = 0.75 |
| const double | kMaxRectangularGradient = 0.1 |
| const int | kMinImageFindSize = 100 |
| const double | kRMSFitScaling = 8.0 |
| const int | kMinColorDifference = 16 |
| const int | kThinLineFraction = 20 |
| Denominator of resolution makes max pixel width to allow thin lines. | |
| const int | kMinLineLengthFraction = 4 |
| Denominator of resolution makes min pixels to demand line lengths to be. | |
| const int | kCrackSpacing = 100 |
| Spacing of cracks across the page to break up tall vertical lines. | |
| const int | kLineFindGridSize = 50 |
| Grid size used by line finder. Not very critical. | |
| const int | kMinThickLineWidth = 12 |
| const int | kMaxLineResidue = 6 |
| const double | kThickLengthMultiple = 0.75 |
| const double | kMaxNonLineDensity = 0.25 |
| const double | kMaxStaveHeight = 1.0 |
| const double | kMinMusicPixelFraction = 0.75 |
| int | textord_tabfind_show_strokewidths = 0 |
| bool | textord_tabfind_only_strokewidths = false |
| const double | kStrokeWidthFractionTolerance = 0.125 |
| const double | kStrokeWidthTolerance = 1.5 |
| const double | kStrokeWidthFractionCJK = 0.25 |
| const double | kStrokeWidthCJK = 2.0 |
| const int | kCJKRadius = 2 |
| const double | kCJKBrokenDistanceFraction = 0.25 |
| const int | kCJKMaxComponents = 8 |
| const double | kCJKAspectRatio = 1.25 |
| const double | kCJKAspectRatioIncrease = 1.0625 |
| const int | kMaxCJKSizeRatio = 5 |
| const double | kBrokenCJKIterationFraction = 0.125 |
| const double | kDiacriticXPadRatio = 7.0 |
| const double | kDiacriticYPadRatio = 1.75 |
| const double | kMinDiacriticSizeRatio = 1.0625 |
| const double | kMaxDiacriticDistanceRatio = 1.25 |
| const double | kMaxDiacriticGapToBaseCharHeight = 1.0 |
| const int | kSearchRadius = 2 |
| const int | kLineTrapLongest = 4 |
| const int | kLineTrapShortest = 2 |
| const int | kMostlyOneDirRatio = 3 |
| const double | kLineResidueAspectRatio = 8.0 |
| const int | kLineResiduePadRatio = 3 |
| const double | kLineResidueSizeRatio = 1.75 |
| const float | kSizeRatioToReject = 2.0 |
| const int | kMaxLargeOverlaps = 3 |
| const double | kNeighbourSearchFactor = 2.5 |
| const double | kNoiseOverlapGrowthFactor = 4.0 |
| const double | kNoiseOverlapAreaFactor = 1.0 / 512 |
| const double | kShapePerimeterRatio = 3.0 |
| const int | kTabRadiusFactor = 5 |
| const int | kMinVerticalSearch = 3 |
| const int | kMaxVerticalSearch = 12 |
| const int | kMaxRaggedSearch = 25 |
| const int | kMinLinesInColumn = 10 |
| const double | kMinFractionalLinesInColumn = 0.125 |
| const double | kMinGutterWidthAbsolute = 0.02 |
| const double | kMaxGutterWidthAbsolute = 2.00 |
| const int | kRaggedGutterMultiple = 5 |
| const double | kLineFragmentAspectRatio = 10.0 |
| const double | kSmoothFactor = 0.25 |
| const double | kCharVerticalOverlapFraction = 0.375 |
| const double | kMaxHorizontalGap = 3.0 |
| const int | kMinEvaluatedTabs = 3 |
| const int | kMaxTextLineBlobRatio = 5 |
| const int | kMinTextLineBlobRatio = 3 |
| const double | kMinImageArea = 0.5 |
| const double | kCosMaxSkewAngle = 0.866025 |
| bool | textord_tabfind_show_initialtabs = false |
| bool | textord_tabfind_show_finaltabs = false |
| const int | kColumnWidthFactor = 20 |
| const int | kMaxVerticalSpacing = 500 |
| const int | kMaxBlobWidth = 500 |
| const double | kSplitPartitionSize = 2.0 |
| const double | kAllowTextHeight = 0.5 |
| const double | kAllowTextWidth = 0.6 |
| const double | kAllowTextArea = 0.8 |
| const double | kAllowBlobHeight = 0.3 |
| const double | kAllowBlobWidth = 0.4 |
| const double | kAllowBlobArea = 0.05 |
| const int | kMinBoxesInTextPartition = 10 |
| const int | kMaxBoxesInDataPartition = 20 |
| const double | kMaxGapInTextPartition = 4.0 |
| const double | kMinMaxGapInTextPartition = 0.5 |
| const double | kMaxBlobOverlapFactor = 4.0 |
| const double | kMaxTableCellXheight = 2.0 |
| const int | kMaxColumnHeaderDistance = 4 |
| const double | kTableColumnThreshold = 3.0 |
| const int | kRulingVerticalMargin = 3 |
| const double | kMinOverlapWithTable = 0.6 |
| const int | kSideSpaceMargin = 10 |
| const double | kSmallTableProjectionThreshold = 0.35 |
| const double | kLargeTableProjectionThreshold = 0.45 |
| const int | kLargeTableRowCount = 6 |
| const int | kMinRowsInTable = 3 |
| const double | kRequiredFullJustifiedSpacing = 4.0 |
| const int | kAdjacentLeaderSearchPadding = 2 |
| const double | kParagraphEndingPreviousLineRatio = 1.3 |
| const double | kMaxParagraphEndingLeftSpaceMultiple = 3.0 |
| const double | kMinParagraphEndingTextToWhitespaceRatio = 3.0 |
| const double | kMaxXProjectionGapFactor = 2.0 |
| const double | kStrokeWidthFractionalTolerance = 0.25 |
| const double | kStrokeWidthConstantTolerance = 2.0 |
| bool | textord_dump_table_images = false |
| bool | textord_show_tables = false |
| bool | textord_tablefind_show_mark = false |
| bool | textord_tablefind_show_stats = false |
| bool | textord_tablefind_recognize_tables = false |
| const double | kHorizontalSpacing = 0.30 |
| const double | kVerticalSpacing = -0.2 |
| const int | kCellSplitRowThreshold = 0 |
| const int | kCellSplitColumnThreshold = 0 |
| const int | kLinedTableMinVerticalLines = 3 |
| const int | kLinedTableMinHorizontalLines = 3 |
| const double | kRequiredColumns = 0.7 |
| const double | kMarginFactor = 1.1 |
| const double | kMaxRowSize = 2.5 |
| const double | kGoodRowNumberOfColumnsSmall [] = { 2, 2, 2, 2, 2, 3, 3 } |
| const int | kGoodRowNumberOfColumnsSmallSize |
| const double | kGoodRowNumberOfColumnsLarge = 0.7 |
| const double | kMinFilledArea = 0.35 |
| const int | kGutterMultiple = 4 |
| const int | kGutterToNeighbourRatio = 3 |
| const int | kSimilarVectorDist = 10 |
| const int | kSimilarRaggedDist = 50 |
| const int | kMaxFillinMultiple = 11 |
| const double | kMinGutterFraction = 0.5 |
| const double | kLineCountReciprocal = 4.0 |
| const double | kMinAlignedGutter = 0.25 |
| const double | kMinRaggedGutter = 1.5 |
| double | textord_tabvector_vertical_gap_fraction = 0.5 |
| double | textord_tabvector_vertical_box_ratio = 0.5 |
| const char * | kAlignmentNames [] |
| const int | kMaxLineLength = 1024 |
| const float | kRotationRange = 0.02f |
| const int | kExposureFactor = 16 |
| const int | kSaltnPepper = 5 |
| const int | kMinRampSize = 1000 |
| const int | kMinLigature = 0xfb00 |
| const int | kMaxLigature = 0xfb17 |
The box file is assumed to contain box definitions, one per line, of the following format for blob-level boxes:
* <UTF8 str> <left> <bottom> <right> <top> <page id> *
and for word/line-level boxes:
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> *
NOTES: The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
<page id> is 0-based, and the page number is used for multipage input (tiff).
In the blob-level form, each line represents a recognizable unit, which may be several UTF-8 bytes, but there is a bounding box around each recognizable unit, and no classifier is needed to train in this mode (bootstrapping.)
In the word/line-level form, the line begins with the literal "WordStr", and the bounding box bounds either a whole line or a whole word. The recognizable units in the word/line are listed after the # at the end of the line and are space delimited, ignoring any original spaces on the line. Eg.
* word -> #w o r d * multi word line -> #m u l t i w o r d l i n e *
The recognizable units must be space-delimited in order to allow multiple unicodes to be used for a single recognizable unit, eg Hindi.
In this mode, the classifier must have been pre-trained with the desired character set, or it will not be able to find the character segmentations.
Make a word from the selected blobs and run Tess on them.
| page_res | recognise blobs |
| selection_box | within this box |
fp_eval_word_spacing() Evaluation function for fixed pitch word lists.
Basically, count the number of "nice" characters - those which are in tess acceptable words or in dict words and are not rejected. Penalise any potential noise chars
build_menu()
Construct the menu tree used by the command window
process_cmd_win_event()
Process a command returned from the command window (Just call the appropriate command handler)
word_blank_and_set_display() Word processor
Blank display of word then redisplay word according to current display mode settings
---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------
---------------------------------------------------------------------------- Include Files and Type Defines ----------------------------------------------------------------------------
| typedef GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> tesseract::BlobGridSearch |
Definition at line 31 of file blobgrid.h.
| typedef signed int tesseract::char_32 |
Definition at line 40 of file string_32.h.
| typedef GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> tesseract::ColPartitionGridSearch |
Definition at line 913 of file colpartition.h.
| typedef BBGrid<ColSegment, ColSegment_CLIST, ColSegment_C_IT> tesseract::ColSegmentGrid |
Definition at line 118 of file tablefind.h.
| typedef GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> tesseract::ColSegmentGridSearch |
Definition at line 121 of file tablefind.h.
| typedef GenericVector<Dawg *> tesseract::DawgVector |
| typedef int(Dict::* tesseract::DictFunc)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const |
| typedef bool(* tesseract::FileReader)(const STRING &filename, GenericVector< char > *data) |
Definition at line 349 of file genericvector.h.
| typedef bool(* tesseract::FileWriter)(const GenericVector< char > &data, const STRING &filename) |
Definition at line 352 of file genericvector.h.
| typedef void(Wordrec::* tesseract::FillLatticeFunc)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
| typedef KDPairInc<int, int> tesseract::IntKDPair |
| typedef unsigned char tesseract::LanguageModelFlagsType |
Used for expressing various language model flags.
Definition at line 37 of file lm_state.h.
| typedef hash_map<string, string, StringHash> tesseract::LigHash |
Definition at line 32 of file ligature_table.h.
Definition at line 34 of file lm_pain_points.h.
| typedef float(Dict::* tesseract::ParamsModelClassifyFunc)(const char *lang, void *path) |
Definition at line 122 of file params_training_featdef.h.
Definition at line 33 of file colpartitionset.h.
| typedef double(Dict::* tesseract::ProbabilityInContextFunc)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes) |
| typedef GenericVectorEqEq<const ParagraphModel *> tesseract::SetOfModels |
Definition at line 94 of file paragraphs_internal.h.
Definition at line 156 of file shapetable.h.
| typedef basic_string<char_32> tesseract::string_32 |
Definition at line 41 of file string_32.h.
| typedef GenericVector<int> tesseract::SuccessorList |
| typedef TessCallback4<const UNICHARSET &, int, PageIterator *, Pix *> tesseract::TruthCallback |
| typedef GenericVector<AmbigSpec_LIST *> tesseract::UnicharAmbigsVector |
| typedef TessResultCallback1<bool, int> tesseract::WidthCallback |
| typedef BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> tesseract::WordGrid |
| typedef void(Tesseract::* tesseract::WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) |
Definition at line 166 of file tesseractclass.h.
| typedef GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> tesseract::WordSearch |
| enum tesseract::AmbigType |
Definition at line 44 of file ambigs.h.
{
NOT_AMBIG, // the ngram pair is not ambiguous
REPLACE_AMBIG, // ocred ngram should always be substituted with correct
DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1)
SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1)
CASE_AMBIG, // this is a case ambiguity (1-1)
AMBIG_TYPE_COUNT // number of enum entries
};
Definition at line 54 of file classify.h.
{
CST_FRAGMENT, // A partial character.
CST_WHOLE, // A correctly segmented character.
CST_IMPROPER, // More than one but less than 2 characters.
CST_NGRAM // Multiple characters.
};
Definition at line 482 of file tessedit.cpp.
Definition at line 30 of file tablefind.h.
{
COL_UNKNOWN,
COL_TEXT,
COL_TABLE,
COL_MIXED,
COL_COUNT
};
Definition at line 47 of file colpartition.h.
{
CST_NOISE, // Strictly between columns.
CST_FLOWING, // Strictly within a single column.
CST_HEADING, // Spans multiple columns.
CST_PULLOUT, // Touches multiple columns, but doesn't span them.
CST_COUNT // Number of entries.
};
Definition at line 69 of file errorcounter.h.
{
CT_UNICHAR_TOP_OK, // Top shape contains correct unichar id.
// The rank of the results in TOP1, TOP2, TOPN is determined by a gap of
// kRatingEpsilon from the first result in each group. The real top choice
// is measured using TOPTOP.
CT_UNICHAR_TOP1_ERR, // Top shape does not contain correct unichar id.
CT_UNICHAR_TOP2_ERR, // Top 2 shapes don't contain correct unichar id.
CT_UNICHAR_TOPN_ERR, // No output shape contains correct unichar id.
CT_UNICHAR_TOPTOP_ERR, // Very top choice not correct.
CT_OK_MULTI_UNICHAR, // Top shape id has correct unichar id, and others.
CT_OK_JOINED, // Top shape id is correct but marked joined.
CT_OK_BROKEN, // Top shape id is correct but marked broken.
CT_REJECT, // Classifier hates this.
CT_FONT_ATTR_ERR, // Top unichar OK, but font attributes incorrect.
CT_OK_MULTI_FONT, // CT_FONT_ATTR_OK but there are multiple font attrs.
CT_NUM_RESULTS, // Number of answers produced.
CT_RANK, // Rank of correct answer.
CT_REJECTED_JUNK, // Junk that was correctly rejected.
CT_ACCEPTED_JUNK, // Junk that was incorrectly classified otherwise.
CT_SIZE // Number of types for array sizing.
};
| enum tesseract::DawgType |
Definition at line 71 of file dawg.h.
{
DAWG_TYPE_PUNCTUATION,
DAWG_TYPE_WORD,
DAWG_TYPE_NUMBER,
DAWG_TYPE_PATTERN,
DAWG_TYPE_COUNT // number of enum entries
};
Definition at line 39 of file params_training_featdef.h.
{
// Digits
PTRAIN_DIGITS_SHORT, // 0
PTRAIN_DIGITS_MED, // 1
PTRAIN_DIGITS_LONG, // 2
// Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
PTRAIN_NUM_SHORT, // 3
PTRAIN_NUM_MED, // 4
PTRAIN_NUM_LONG, // 5
// Document word (DOC_DAWG_PERM)
PTRAIN_DOC_SHORT, // 6
PTRAIN_DOC_MED, // 7
PTRAIN_DOC_LONG, // 8
// Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
PTRAIN_DICT_SHORT, // 9
PTRAIN_DICT_MED, // 10
PTRAIN_DICT_LONG, // 11
// Frequent word (FREQ_DAWG_PERM)
PTRAIN_FREQ_SHORT, // 12
PTRAIN_FREQ_MED, // 13
PTRAIN_FREQ_LONG, // 14
PTRAIN_SHAPE_COST_PER_CHAR, // 15
PTRAIN_NGRAM_COST_PER_CHAR, // 16
PTRAIN_NUM_BAD_PUNC, // 17
PTRAIN_NUM_BAD_CASE, // 18
PTRAIN_XHEIGHT_CONSISTENCY, // 19
PTRAIN_NUM_BAD_CHAR_TYPE, // 20
PTRAIN_NUM_BAD_SPACING, // 21
PTRAIN_NUM_BAD_FONT, // 22
PTRAIN_RATING_PER_CHAR, // 23
PTRAIN_NUM_FEATURE_TYPES
};
| enum tesseract::LineType |
Definition at line 54 of file paragraphs_internal.h.
{
LT_START = 'S', // First line of a paragraph.
LT_BODY = 'C', // Continuation line of a paragraph.
LT_UNKNOWN = 'U', // No clues.
LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.
};
Definition at line 37 of file lm_pain_points.h.
Definition at line 1550 of file colpartitiongrid.cpp.
{
NPT_HTEXT, // Definite horizontal text.
NPT_VTEXT, // Definite vertical text.
NPT_WEAK_HTEXT, // Weakly horizontal text. Counts as HTEXT for HTEXT, but
// image for image and VTEXT.
NPT_WEAK_VTEXT, // Weakly vertical text. Counts as VTEXT for VTEXT, but
// image for image and HTEXT.
NPT_IMAGE, // Definite non-text.
NPT_COUNT // Number of array elements.
};
Definition at line 44 of file normalis.h.
{
NM_BASELINE = -3, // The original BL normalization mode.
NM_CHAR_ISOTROPIC = -2, // Character normalization but isotropic.
NM_CHAR_ANISOTROPIC = -1 // The original CN normalization mode.
};
When Tesseract/Cube is initialized we can choose to instantiate/load/run only the Tesseract part, only the Cube part or both along with the combiner. The preference of which engine to use is stored in tessedit_ocr_engine_mode.
ATTENTION: When modifying this enum, please make sure to make the appropriate changes to all the enums mirroring it (e.g. OCREngine in cityblock/workflow/detection/detection_storage.proto). Such enums will mention the connection to OcrEngineMode in the comments.
Definition at line 256 of file publictypes.h.
{
OEM_TESSERACT_ONLY, // Run Tesseract only - fastest
OEM_CUBE_ONLY, // Run Cube only - better accuracy, but slower
OEM_TESSERACT_CUBE_COMBINED, // Run both and combine results - best accuracy
OEM_DEFAULT // Specify this mode when calling init_*(),
// to indicate that any of the above modes
// should be automatically inferred from the
// variables in the language-specific config,
// command-line configs, or if not specified
// in any of the above should be set to the
// default OEM_TESSERACT_ONLY.
};
+------------------+  Orientation Example:
| 1 Aaaa Aaaa Aaaa |  ====================
| Aaa aa aaa aa    |  To left is a diagram of some (1) English and
| aaaaaa A aa aaa. |  (2) Chinese text and a (3) photo credit.
|                2 |
|   #######  c c C |  Upright Latin characters are represented as A and a.
|   #######  c c c |  '<' represents a latin character rotated
| < #######  c c c |      anti-clockwise 90 degrees.
| < #######  c   c |
| < #######  .   c |  Upright Chinese characters are represented C and c.
| 3 #######      c |
+------------------+  NOTA BENE: enum values here should match goodoc.proto
If you orient your head so that "up" aligns with Orientation, then the characters will appear "right side up" and readable.
In the example above, both the English and Chinese paragraphs are oriented so their "up" is the top of the page (page up). The photo credit is read with one's head turned leftward ("up" is to page left).
The values of this enum match the convention of Tesseract's osdetect.h
Definition at line 108 of file publictypes.h.
{
ORIENTATION_PAGE_UP = 0,
ORIENTATION_PAGE_RIGHT = 1,
ORIENTATION_PAGE_DOWN = 2,
ORIENTATION_PAGE_LEFT = 3,
};
enum of the elements of the page hierarchy, used in ResultIterator to provide functions that operate on each level without having to have 5x as many functions.
Definition at line 207 of file publictypes.h.
{
RIL_BLOCK, // Block of text/image/separator line.
RIL_PARA, // Paragraph within a block.
RIL_TEXTLINE, // Line within a paragraph.
RIL_WORD, // Word within a textline.
RIL_SYMBOL // Symbol/character within a word.
};
Possible modes for page layout analysis. These *must* be kept in order of decreasing amount of layout analysis to be done, except for OSD_ONLY, so that the inequality test macros below work.
Definition at line 151 of file publictypes.h.
JUSTIFICATION_UNKNOWN The alignment is not clearly one of the other options. This could happen for example if there are only one or two lines of text or the text looks like source code or poetry.
NOTA BENE: Fully justified paragraphs (text aligned to both left and right margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text is written with a left-to-right script and with JUSTIFICATION_RIGHT if their text is written in a right-to-left script.
Interpretation for text read in vertical lines: "Left" is wherever the starting reading position is.
JUSTIFICATION_LEFT Each line, except possibly the first, is flush to the same left tab stop.
JUSTIFICATION_CENTER The text lines of the paragraph are centered about a line going down through the middle of the text lines.
JUSTIFICATION_RIGHT Each line, except possibly the first, is flush to the same right tab stop.
Definition at line 239 of file publictypes.h.
Definition at line 46 of file strokewidth.h.
| enum tesseract::ScriptPos |
| TA_LEFT_ALIGNED | |
| TA_LEFT_RAGGED | |
| TA_CENTER_JUSTIFIED | |
| TA_RIGHT_ALIGNED | |
| TA_RIGHT_RAGGED | |
| TA_SEPARATOR | |
| TA_COUNT |
Definition at line 43 of file tabvector.h.
Definition at line 53 of file tessdatamanager.h.
{
TESSDATA_LANG_CONFIG, // 0
TESSDATA_UNICHARSET, // 1
TESSDATA_AMBIGS, // 2
TESSDATA_INTTEMP, // 3
TESSDATA_PFFMTABLE, // 4
TESSDATA_NORMPROTO, // 5
TESSDATA_PUNC_DAWG, // 6
TESSDATA_SYSTEM_DAWG, // 7
TESSDATA_NUMBER_DAWG, // 8
TESSDATA_FREQ_DAWG, // 9
TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
TESSDATA_CUBE_UNICHARSET, // 11
TESSDATA_CUBE_SYSTEM_DAWG, // 12
TESSDATA_SHAPE_TABLE, // 13
TESSDATA_BIGRAM_DAWG, // 14
TESSDATA_UNAMBIG_DAWG, // 15
TESSDATA_PARAMS_MODEL, // 16
TESSDATA_NUM_ENTRIES
};
The text lines are read in the given sequence.
In English, the order is top-to-bottom. In Chinese, vertical text lines are read right-to-left. Mongolian is written in vertical columns top to bottom like Chinese, but the lines are ordered left-to-right.
Note that only some combinations make sense. For example, WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM
Definition at line 140 of file publictypes.h.
{
TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
};
The grapheme clusters within a line of text are laid out logically in this direction, judged when looking at the text line rotated so that its Orientation is "page up".
For English text, the writing direction is left-to-right. For the Chinese text in the above example, the writing direction is top-to-bottom.
| WRITING_DIRECTION_LEFT_TO_RIGHT | |
| WRITING_DIRECTION_RIGHT_TO_LEFT | |
| WRITING_DIRECTION_TOP_TO_BOTTOM |
Definition at line 123 of file publictypes.h.
| void tesseract::AffineMatrix | ( | int | writing_direction, |
| int | line_x1, | ||
| int | line_y1, | ||
| int | line_x2, | ||
| int | line_y2, | ||
| double * | a, | ||
| double * | b, | ||
| double * | c, | ||
| double * | d | ||
| ) |
Definition at line 246 of file pdfrenderer.cpp.
{
  // Angle of the line from (line_x1, line_y1) to (line_x2, line_y2); note the
  // y difference is taken as (y1 - y2) while the x difference is (x2 - x1).
  const double delta_y = static_cast<double>(line_y1 - line_y2);
  const double delta_x = static_cast<double>(line_x2 - line_x1);
  const double angle = atan2(delta_y, delta_x);
  const double cos_angle = cos(angle);
  const double sin_angle = sin(angle);

  // Rotation coefficients for that angle, written to the four outputs.
  *a = cos_angle;
  *b = sin_angle;
  *c = -sin_angle;
  *d = cos_angle;

  if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
    // Right-to-left text negates the (a, b) pair.
    *a = -*a;
    *b = -*b;
  }
  // TODO(jbreiden) Consider using the vertical PDF writing mode for
  // WRITING_DIRECTION_TOP_TO_BOTTOM. It, like every other direction, gets no
  // extra adjustment here.
}
| bool tesseract::AsciiLikelyListItem | ( | const STRING & | word | ) |
Definition at line 270 of file paragraphs.cpp.
{
| void tesseract::assign_blobs_to_blocks2 | ( | Pix * | pix, |
| BLOCK_LIST * | blocks, | ||
| TO_BLOCK_LIST * | port_blocks | ||
| ) |
Definition at line 157 of file tordmain.cpp.
{
  // For every BLOCK in `blocks`, builds a TO_BLOCK, moves the block's blobs
  // into it as BLOBNBOXes (setting the stroke width of each from `pix`) and
  // appends the TO_BLOCK to `port_blocks`.
  BLOCK_IT src_block_it = blocks;
  TO_BLOCK_IT dest_block_it = port_blocks;  // destination iterator
  C_BLOB_IT src_blob_it;
  BLOBNBOX_IT dest_blob_it;

  for (src_block_it.mark_cycle_pt(); !src_block_it.cycled_list();
       src_block_it.forward()) {
    BLOCK *src_block = src_block_it.data();
    TO_BLOCK *dest_block = new TO_BLOCK(src_block);

    // Convert the good outlines to block->blob_list
    dest_blob_it.set_to_list(&dest_block->blobs);
    src_blob_it.set_to_list(src_block->blob_list());
    for (src_blob_it.mark_cycle_pt(); !src_blob_it.cycled_list();
         src_blob_it.forward()) {
      // extract() takes the blob out of the source list before it is wrapped.
      C_BLOB *taken = src_blob_it.extract();
      BLOBNBOX *wrapped = new BLOBNBOX(taken);  // Convert blob to BLOBNBOX.
      SetBlobStrokeWidth(pix, wrapped);
      dest_blob_it.add_after_then_move(wrapped);
    }

    // Put the rejected outlines in block->noise_blobs, which allows them to
    // be reconsidered and sorted back into rows and recover outlines
    // mistakenly rejected.
    dest_blob_it.set_to_list(&dest_block->noise_blobs);
    src_blob_it.set_to_list(src_block->reject_blobs());
    for (src_blob_it.mark_cycle_pt(); !src_blob_it.cycled_list();
         src_blob_it.forward()) {
      C_BLOB *taken = src_blob_it.extract();
      BLOBNBOX *wrapped = new BLOBNBOX(taken);  // Convert blob to BLOBNBOX.
      SetBlobStrokeWidth(pix, wrapped);
      dest_blob_it.add_after_then_move(wrapped);
    }

    dest_block_it.add_after_then_move(dest_block);
  }
}
| TrainingSample * tesseract::BlobToTrainingSample | ( | const TBLOB & | blob, |
| bool | nonlinear_norm, | ||
| INT_FX_RESULT_STRUCT * | fx_info, | ||
| GenericVector< INT_FEATURE_STRUCT > * | bl_features | ||
| ) |
Definition at line 81 of file intfx.cpp.
{
  // Features are extracted into the caller's bl_features and into a local
  // cn_features vector; fx_info is filled in by the same call.
  GenericVector<INT_FEATURE_STRUCT> cn_features;
  Classify::ExtractFeatures(blob, nonlinear_norm, bl_features,
                            &cn_features, fx_info, NULL);
  // TODO(rays) Use blob->PreciseBoundingBox() instead.
  TBOX box = blob.bounding_box();

  // Without any CN features there is nothing to build a sample from.
  const int num_features = fx_info->NumCN;
  if (num_features <= 0)
    return NULL;

  TrainingSample* sample = TrainingSample::CopyFromFeatures(
      *fx_info, box, &cn_features[0], num_features);
  if (sample == NULL)
    return NULL;

  // Set the bounding box (in original image coordinates) in the sample:
  // map the top-left and bottom-right corners back through the blob's denorm.
  TPOINT norm_topleft, norm_botright;
  norm_topleft.x = box.left();
  norm_topleft.y = box.top();
  norm_botright.x = box.right();
  norm_botright.y = box.bottom();
  TPOINT image_topleft, image_botright;
  blob.denorm().DenormTransform(NULL, norm_topleft, &image_topleft);
  blob.denorm().DenormTransform(NULL, norm_botright, &image_botright);
  sample->set_bounding_box(TBOX(image_topleft.x, image_botright.y,
                                image_botright.x, image_topleft.y));
  return sample;
}
| Pix* tesseract::CairoARGB32ToPixFormat | ( | cairo_surface_t * | surface | ) |
Definition at line 79 of file stringrenderer.cpp.
{
  // Converts a cairo image surface into a newly created 32 bpp Pix.
  // Returns NULL (after printing a diagnostic) when the surface format is
  // not the one this routine handles.
  // NOTE(review): the guard below was missing from the listing (the braces
  // were unbalanced and the diagnostic was unconditional); the expected
  // format is taken from the function's name - confirm against the source.
  if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
    printf("Unexpected surface format %d\n",
           cairo_image_surface_get_format(surface));
    return NULL;
  }
  const int width = cairo_image_surface_get_width(surface);
  const int height = cairo_image_surface_get_height(surface);
  Pix* pix = pixCreate(width, height, 32);
  int byte_stride = cairo_image_surface_get_stride(surface);

  for (int i = 0; i < height; ++i) {
    // Each source row is copied to the matching Pix row shifted by one byte.
    // The final row copies one byte fewer - presumably so the shifted copy
    // does not run past the end of the Pix data; TODO confirm.
    memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
           cairo_image_surface_get_data(surface) + i * byte_stride,
           byte_stride - ((i == height - 1) ? 1 : 0));
  }
  return pix;
}
| void tesseract::CalculateTabStops | ( | GenericVector< RowScratchRegisters > * | rows, |
| int | row_start, | ||
| int | row_end, | ||
| int | tolerance, | ||
| GenericVector< Cluster > * | left_tabs, | ||
| GenericVector< Cluster > * | right_tabs | ||
| ) |
Definition at line 694 of file paragraphs.cpp.
{
// Clusters the left and right indents (lindent_ / rindent_) of
// rows[row_start, row_end) using the given tolerance and writes the
// resulting clusters to *left_tabs and *right_tabs.
// Returns without touching the outputs if AcceptableRowArgs rejects the
// arguments.
if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
return;
// First pass: toss all left and right indents into clusterers.
SimpleClusterer initial_lefts(tolerance);
SimpleClusterer initial_rights(tolerance);
GenericVector<Cluster> initial_left_tabs;
GenericVector<Cluster> initial_right_tabs;
for (int i = row_start; i < row_end; i++) {
initial_lefts.Add((*rows)[i].lindent_);
initial_rights.Add((*rows)[i].rindent_);
}
initial_lefts.GetClusters(&initial_left_tabs);
initial_rights.GetClusters(&initial_right_tabs);
// Second pass: cluster only lines that are not "stray"
// An example of a stray line is a page number -- a line whose start
// and end tab-stops are far outside the typical start and end tab-stops
// for the block.
// Put another way, we only cluster data from lines whose start or end
// tab stop is frequent.
SimpleClusterer lefts(tolerance);
SimpleClusterer rights(tolerance);
// Outlier elimination. We might want to switch this to test outlier-ness
// based on how strange a position an outlier is in instead of or in addition
// to how rare it is. These outliers get re-added if we end up having too
// few tab stops, to work with, however.
// The count threshold scales with the number of rows: 0 for fewer than 8
// rows, 1 for 8..19 rows, 2 for 20 or more.
int infrequent_enough_to_ignore = 0;
if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
for (int i = row_start; i < row_end; i++) {
int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
// A row is kept when EITHER of its closest first-pass clusters has a count
// above the threshold; both of its indents are then added.
if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
lefts.Add((*rows)[i].lindent_);
rights.Add((*rows)[i].rindent_);
}
}
lefts.GetClusters(left_tabs);
rights.GetClusters(right_tabs);
if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
(right_tabs->size() == 1 && left_tabs->size() >= 4)) {
// One side is really ragged, and the other only has one tab stop,
// so those "insignificant outliers" are probably important, actually.
// This often happens on a page of an index. Add back in the ones
// we omitted in the first pass.
for (int i = row_start; i < row_end; i++) {
int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
// Exact negation of the keep-condition used in the second pass above.
if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
lefts.Add((*rows)[i].lindent_);
rights.Add((*rows)[i].rindent_);
}
}
}
// Fetch the clusters again so the outputs reflect any rows re-added above.
lefts.GetClusters(left_tabs);
rights.GetClusters(right_tabs);
// If one side is almost a two-indent aligned side, and the other clearly
// isn't, try to prune out the least frequent tab stop from that side.
if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
int to_prune = -1;
// Scan from the last cluster to the first for the smallest count; the
// strict '<' means ties keep the higher index.
for (int i = left_tabs->size() - 1; i >= 0; i--) {
if (to_prune < 0 ||
(*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
to_prune = i;
}
}
// Only prune if that cluster is rare enough by the same threshold.
if (to_prune >= 0 &&
(*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
left_tabs->remove(to_prune);
}
}
// Mirror image of the pruning above, applied to the right tab stops.
if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
int to_prune = -1;
for (int i = right_tabs->size() - 1; i >= 0; i--) {
if (to_prune < 0 ||
(*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
to_prune = i;
}
}
if (to_prune >= 0 &&
(*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
right_tabs->remove(to_prune);
}
}
}
// Given a paragraph model mark rows[row_start, row_end) as said model
// start or body lines.
| void tesseract::CallWithUTF8 | ( | TessCallback1< const char * > * | cb, |
| const WERD_CHOICE * | wc | ||
| ) |
| void tesseract::CanonicalizeDetectionResults | ( | GenericVector< PARA * > * | row_owners, |
| PARA_LIST * | paragraphs | ||
| ) |
Definition at line 2235 of file paragraphs.cpp.
{
// NOTE(review): this listing appears truncated. `rows`, `out`,
// `formerly_null` and the loop over `i` are used below, but their
// declarations and the loop header are not shown, so the braces here do not
// balance. Consult paragraphs.cpp for the full body.
if (rows[i] == NULL) {
// Row without an owner: the first NULL row of a run gets a fresh PARA;
// following NULL rows reuse that PARA and are not emitted again.
if (i == 0 || rows[i - 1] != formerly_null) {
rows[i] = formerly_null = new PARA();
} else {
rows[i] = formerly_null;
continue;
}
} else if (i > 0 && rows[i - 1] == rows[i]) {
// Same owner as the previous row: already emitted.
continue;
}
// First row of a new owner: append the owner to the output.
out.add_after_then_move(rows[i]);
}
}
// Main entry point for Paragraph Detection Algorithm.
//
Definition at line 44 of file cube_control.cpp.
| void tesseract::ClearFeatureSpaceWindow | ( | NORM_METHOD | norm_method, |
| ScrollView * | window | ||
| ) |
Clears the given window and draws the featurespace guides for the appropriate normalization method.
Definition at line 1104 of file intproto.cpp.
{
  // Start from an empty window and draw everything in grey.
  window->Clear();
  window->Pen(ScrollView::GREY);

  // Draw the feature space limit rectangle.
  window->Rectangle(0, 0, INT_MAX_X, INT_MAX_Y);

  if (norm_method != baseline) {
    // Any non-baseline method: one rectangle centred on
    // (INT_XCENTER, INT_YCENTER) extending one radius in each direction.
    window->Rectangle(INT_XCENTER - INT_XRADIUS, INT_YCENTER - INT_YRADIUS,
                      INT_XCENTER + INT_XRADIUS, INT_YCENTER + INT_YRADIUS);
    return;
  }

  // Baseline method: four horizontal guides spanning x = 0 .. INT_MAX_X,
  // drawn in the order descender, baseline, x-height, cap-height.
  window->SetCursor(0, INT_DESCENDER);
  window->DrawTo(INT_MAX_X, INT_DESCENDER);

  window->SetCursor(0, INT_BASELINE);
  window->DrawTo(INT_MAX_X, INT_BASELINE);

  window->SetCursor(0, INT_XHEIGHT);
  window->DrawTo(INT_MAX_X, INT_XHEIGHT);

  window->SetCursor(0, INT_CAPHEIGHT);
  window->DrawTo(INT_MAX_X, INT_CAPHEIGHT);
}
| void tesseract::ClipBaseline | ( | int | ppi, |
| int | x1, | ||
| int | y1, | ||
| int | x2, | ||
| int | y2, | ||
| int * | line_x1, | ||
| int * | line_y1, | ||
| int * | line_x2, | ||
| int * | line_y2 | ||
| ) |
Definition at line 275 of file pdfrenderer.cpp.
{
  // Copies the baseline endpoints to the outputs and, when the line is long
  // but almost flat, replaces both y outputs with the midpoint (y1 + y2) / 2
  // so the line becomes exactly horizontal.
  *line_x1 = x1;
  *line_y1 = y1;
  *line_x2 = x2;
  *line_y2 = y2;
  // The thresholds are "2.0" in units of pixels * 72 / ppi. The previous
  // code evaluated abs(..) * 72 / ppi in integer arithmetic, truncating the
  // quotient before the comparison (e.g. a run of 2.9 units became 2 and
  // failed "2.0 < run") and dividing by zero for ppi == 0. Scaling the
  // threshold by ppi instead keeps the comparison exact and division-free.
  double rise = abs(y2 - y1) * 72;
  double run = abs(x2 - x1) * 72;
  if (rise < 2.0 * ppi && 2.0 * ppi < run)
    *line_y1 = *line_y2 = (y1 + y2) / 2;
}
| int tesseract::ClosestCluster | ( | const GenericVector< Cluster > & | clusters, |
| int | value | ||
| ) |
Definition at line 668 of file paragraphs.cpp.
{
clusters->clear();
| bool tesseract::cmp_eq | ( | T const & | t1, |
| T const & | t2 | ||
| ) |
Definition at line 382 of file genericvector.h.
{
  // Equality predicate that defers entirely to T's operator==.
  const bool same = (t1 == t2);
  return same;
}
| bool tesseract::CompareFontInfo | ( | const FontInfo & | fi1, |
| const FontInfo & | fi2 | ||
| ) |
Definition at line 120 of file fontinfo.cpp.
{
  // Two fonts with the same name are required to have the same properties,
  // so the properties need not be tested: the name alone is enough to look
  // a font up in the table and retrieve its properties.
  const int name_order = strcmp(fi1.name, fi2.name);
  return name_order == 0;
}
| bool tesseract::CompareFontSet | ( | const FontSet & | fs1, |
| const FontSet & | fs2 | ||
| ) |
Definition at line 128 of file fontinfo.cpp.
{
  // Sets of different size can never be equal.
  if (fs1.size != fs2.size)
    return false;

  // Same size: advance while the configs agree position by position; the
  // sets are equal exactly when the scan reaches the end.
  int idx = 0;
  while (idx < fs1.size && !(fs1.configs[idx] != fs2.configs[idx]))
    ++idx;
  return idx >= fs1.size;
}
| void tesseract::ConvertHypothesizedModelRunsToParagraphs | ( | int | debug_level, |
| const GenericVector< RowScratchRegisters > & | rows, | ||
| GenericVector< PARA * > * | row_owners, | ||
| ParagraphTheory * | theory | ||
| ) |
Definition at line 2044 of file paragraphs.cpp.
{
  // Walks the rows from the last to the first, turning each run of
  // "start line followed by body lines" under a single hypothesized model
  // into a new PARA, and records that PARA in (*row_owners) for every row
  // of the run. Rows for which no such run is found are left without an
  // owner.
  int end = rows.size();
  int start;
  for (; end > 0; end = start) {
    start = end - 1;
    const ParagraphModel *model = NULL;
    // TODO(eger): Be smarter about dealing with multiple hypotheses.
    bool single_line_paragraph = false;
    SetOfModels models;
    rows[start].NonNullHypotheses(&models);
    if (models.size() > 0) {
      // Only the first non-null hypothesis is considered.
      model = models[0];
      if (rows[start].GetLineType(model) != LT_BODY)
        single_line_paragraph = true;
    }
    if (model && !single_line_paragraph) {
      // walk back looking for more body lines and then a start line.
      while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
        // do nothing
      }
      // The run is only valid if it is headed by a start line of the model.
      if (start < 0 || rows[start].GetLineType(model) != LT_START) {
        model = NULL;
      }
    }
    if (model == NULL) {
      continue;
    }
    // rows[start, end) should be a paragraph.
    PARA *p = new PARA();
    if (model == kCrownLeft || model == kCrownRight) {
      p->is_very_first_or_continuation = true;
      // Crown paragraph.
      //   If we can find an existing ParagraphModel that fits, use it,
      //   else create a new one.
      for (int row = end; row < rows.size(); row++) {
        if ((*row_owners)[row] &&
            (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
             (start == 0 ||
              ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
          model = (*row_owners)[row]->model;
          break;
        }
      }
      if (model == kCrownLeft) {
        // No subsequent model fits, so cons one up.
        model = theory->AddModel(ParagraphModel(
            JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
            0, 0, Epsilon(rows[start].ri_->average_interword_space)));
      } else if (model == kCrownRight) {
        // No subsequent model fits, so cons one up.
        // Mirrors the left case: margin plus indent on the right side. The
        // previous code added rmargin_ to itself, ignoring rindent_.
        model = theory->AddModel(ParagraphModel(
            JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rindent_,
            0, 0, Epsilon(rows[start].ri_->average_interword_space)));
      }
    }
    // Commit the decision: the first row becomes the model's start line and
    // the remaining rows of the run its body lines.
    rows[start].SetUnknown();
    rows[start].AddStartLine(model);
    for (int i = start + 1; i < end; i++) {
      rows[i].SetUnknown();
      rows[i].AddBodyLine(model);
    }
    p->model = model;
    p->has_drop_cap = rows[start].ri_->has_drop_cap;
    // The list-item hint is read from the word on the justified side.
    p->is_list_item =
        model->justification() == JUSTIFICATION_RIGHT
            ? rows[start].ri_->rword_indicates_list_item
            : rows[start].ri_->lword_indicates_list_item;
    for (int row = start; row < end; row++) {
      if ((*row_owners)[row] != NULL) {
        // An existing owner is overwritten below, hence the warning.
        tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
                "more than once!\n");
      }
      (*row_owners)[row] = p;
    }
  }
}
struct Interval {
Interval() : begin(0), end(0) {}
| bool tesseract::CrownCompatible | ( | const GenericVector< RowScratchRegisters > * | rows, |
| int | a, | ||
| int | b, | ||
| const ParagraphModel * | model | ||
| ) |
Definition at line 1291 of file paragraphs.cpp.
{
return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
row_b.rindent_ + row_b.rmargin_,
Epsilon(row_a.ri_->average_interword_space));
}
return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
row_b.lindent_ + row_b.lmargin_,
Epsilon(row_a.ri_->average_interword_space));
}
// =============== Implementation of ParagraphModelSmearer ====================
| struct Pix * tesseract::DegradeImage | ( | Pix * | input, |
| int | exposure, | ||
| TRand * | randomizer, | ||
| float * | rotation | ||
| ) | [read] |
Definition at line 66 of file degradeimage.cpp.
{
// Produces a degraded 8-bit copy of input.
// The Pix passed in as input is destroyed here (pixDestroy(&input) below);
// the returned Pix is a different object.
//   exposure:   selects the erosion steps and the grey-level offset applied.
//   randomizer: if non-NULL, supplies the random rotation (when *rotation is
//               zero) and the per-pixel salt and pepper noise.
//   rotation:   if non-NULL the image is rotated; a non-zero *rotation is
//               used as the angle, and the angle actually used is written
//               back to *rotation.
Pix* pix = pixConvertTo8(input, false);
pixDestroy(&input);
input = pix;
// width/height are read here and reused by the pixel loop at the end.
int width = pixGetWidth(input);
int height = pixGetHeight(input);
if (exposure >= 2) {
// An erosion simulates the spreading darkening of a dark copy.
// This is backwards to binary morphology,
// see http://www.leptonica.com/grayscale-morphology.html
pix = input;
input = pixErodeGray(pix, 3, 3);
pixDestroy(&pix);
}
// A convolution is essential to any mode as no scanner produces an
// image as sharp as the electronic image.
pix = pixBlockconv(input, 1, 1);
pixDestroy(&input);
// A small random rotation helps to make the edges jaggy in a realistic way.
if (rotation != NULL) {
float radians_clockwise = 0.0f;
if (*rotation) {
radians_clockwise = *rotation;
} else if (randomizer != NULL) {
radians_clockwise = randomizer->SignedRand(kRotationRange);
}
input = pixRotate(pix, radians_clockwise,
L_ROTATE_AREA_MAP, L_BRING_IN_WHITE,
0, 0);
// Rotate the boxes to match.
*rotation = radians_clockwise;
pixDestroy(&pix);
} else {
input = pix;
}
if (exposure >= 3 || exposure == 1) {
// Erosion after the convolution is not as heavy as before, so it is
// good for level 1 and in addition as a level 3.
// This is backwards to binary morphology,
// see http://www.leptonica.com/grayscale-morphology.html
pix = input;
input = pixErodeGray(pix, 3, 3);
pixDestroy(&pix);
}
// The convolution really needed to be 2x2 to be realistic enough, but
// we only have 3x3, so we have to bias the image darker or lose thin
// strokes.
int erosion_offset = 0;
// For light and 0 exposure, there is no dilation, so compensate for the
// convolution with a big darkening bias which is undone for lighter
// exposures.
if (exposure <= 0)
erosion_offset = -3 * kExposureFactor;
// Add in a general offset of the greyscales for the exposure level so
// a threshold of 128 gives a reasonable binary result.
erosion_offset -= exposure * kExposureFactor;
// Add a gradual fade over the page and a small amount of salt and pepper
// noise to simulate noise in the sensor/paper fibres and varying
// illumination.
l_uint32* data = pixGetData(input);
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
int pixel = GET_DATA_BYTE(data, x);
if (randomizer != NULL)
pixel += randomizer->IntRand() % (kSaltnPepper*2 + 1) - kSaltnPepper;
// The ramp is skipped when height + width does not exceed kMinRampSize.
if (height + width > kMinRampSize)
pixel -= (2*x + y) * 32 / (height + width);
pixel += erosion_offset;
// Clamp to the valid byte range before storing.
if (pixel < 0)
pixel = 0;
if (pixel > 255)
pixel = 255;
SET_DATA_BYTE(data, x, pixel);
}
// Advance to the next raster line (wpl 32-bit words per line).
data += input->wpl;
}
return input;
}
| void tesseract::DeleteObject | ( | T * | object | ) |
Definition at line 166 of file tablefind.cpp.
| void tesseract::DetectParagraphs | ( | int | debug_level, |
| GenericVector< RowInfo > * | row_infos, | ||
| GenericVector< PARA * > * | row_owners, | ||
| PARA_LIST * | paragraphs, | ||
| GenericVector< ParagraphModel * > * | models | ||
| ) |
Definition at line 2267 of file paragraphs.cpp.
{
// Core paragraph detection over a vector of RowInfo.
//   row_infos:  one entry per text row; used to initialise scratch registers.
//   row_owners: resized to row_infos->size() and filled with the PARA that
//               owns each row.
//   paragraphs: passed to CanonicalizeDetectionResults together with
//               row_owners at the end.
//   models:     handed to the ParagraphTheory that collects models.
// The work is done in passes; DebugDump is called after each one.
GenericVector<RowScratchRegisters> rows;
ParagraphTheory theory(models);
// Initialize row_owners to be a bunch of NULL pointers.
row_owners->init_to_size(row_infos->size(), NULL);
// Set up row scratch registers for the main algorithm.
rows.init_to_size(row_infos->size(), RowScratchRegisters());
for (int i = 0; i < row_infos->size(); i++) {
rows[i].Init((*row_infos)[i]);
}
// Pass 1:
// Detect sequences of lines that all contain leader dots (.....)
// These are likely Tables of Contents. If there are three text lines in
// a row with leader dots, it's pretty safe to say the middle one should
// be a paragraph of its own.
SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
GenericVector<Interval> leftovers;
LeftoverSegments(rows, &leftovers, 0, rows.size());
for (int i = 0; i < leftovers.size(); i++) {
// Pass 2a:
// Find any strongly evidenced start-of-paragraph lines. If they're
// followed by two lines that look like body lines, make a paragraph
// model for that and see if that model applies throughout the text
// (that is, "smear" it).
StrongEvidenceClassify(debug_level, &rows,
leftovers[i].begin, leftovers[i].end, &theory);
// Pass 2b:
// If we had any luck in pass 2a, we got part of the page and didn't
// know how to classify a few runs of rows. Take the segments that
// didn't find a model and reprocess them individually.
GenericVector<Interval> leftovers2;
LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
// Pass 2a counts as useful unless the only leftover is the single
// interval [0, rows.size()).
bool pass2a_was_useful = leftovers2.size() > 1 ||
(leftovers2.size() == 1 &&
(leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
if (pass2a_was_useful) {
for (int j = 0; j < leftovers2.size(); j++) {
StrongEvidenceClassify(debug_level, &rows,
leftovers2[j].begin, leftovers2[j].end,
&theory);
}
}
}
DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
// Pass 3:
// These are the dregs for which we didn't have enough strong textual
// and geometric clues to form matching models for. Let's see if
// the geometric clues are simple enough that we could just use those.
LeftoverSegments(rows, &leftovers, 0, rows.size());
for (int i = 0; i < leftovers.size(); i++) {
GeometricClassify(debug_level, &rows,
leftovers[i].begin, leftovers[i].end, &theory);
}
// Undo any flush models for which there's little evidence.
DowngradeWeakestToCrowns(debug_level, &theory, &rows);
DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
// Pass 4:
// Take everything that's still not marked up well and clear all markings.
LeftoverSegments(rows, &leftovers, 0, rows.size());
for (int i = 0; i < leftovers.size(); i++) {
for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
rows[j].SetUnknown();
}
}
DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
// Convert all of the unique hypothesis runs to PARAs.
ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
&theory);
DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
// Finally, clean up any dangling NULL row paragraph parents.
CanonicalizeDetectionResults(row_owners, paragraphs);
}
// ============ Code interfacing with the rest of Tesseract ==================
| void tesseract::DetectParagraphs | ( | int | debug_level, |
| bool | after_text_recognition, | ||
| const MutableIterator * | block_start, | ||
| GenericVector< ParagraphModel * > * | models | ||
| ) |
Definition at line 2512 of file paragraphs.cpp.
{
  // Runs paragraph detection over the text lines of the block that
  // block_start points into, then stores the owning PARA on every row.
  //   after_text_recognition: forwarded to InitializeRowInfo for each row.
  //   models:                 forwarded to the row-based DetectParagraphs.
  // Blocks whose poly_block exists and is not text skip detection; their rows
  // get NULL owners, canonicalized by CanonicalizeDetectionResults.
  if (block_start->Empty(RIL_TEXTLINE)) {
    return;
  }
  // Clear out any preconceived notions.
  BLOCK *block = block_start->PageResIt()->block()->block;
  block->para_list()->clear();
  bool is_image_block = block->poly_block() && !block->poly_block()->IsText();
  // Convert the Tesseract structures to RowInfos
  // for the paragraph detection algorithm.
  MutableIterator row(*block_start);
  if (row.Empty(RIL_TEXTLINE))
    return;  // end of input already.
  GenericVector<RowInfo> row_infos;
  do {
    if (!row.PageResIt()->row())
      continue;  // empty row.
    row.PageResIt()->row()->row->set_para(NULL);
    row_infos.push_back(RowInfo());
    RowInfo &ri = row_infos.back();
    InitializeRowInfo(after_text_recognition, row, &ri);
  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
           row.Next(RIL_TEXTLINE));
  // If we're called before text recognition, we might not have
  // tight block bounding boxes, so trim by the minimum on each side.
  if (row_infos.size() > 0) {
    int min_lmargin = row_infos[0].pix_ldistance;
    int min_rmargin = row_infos[0].pix_rdistance;
    for (int i = 1; i < row_infos.size(); i++) {
      if (row_infos[i].pix_ldistance < min_lmargin)
        min_lmargin = row_infos[i].pix_ldistance;
      if (row_infos[i].pix_rdistance < min_rmargin)
        min_rmargin = row_infos[i].pix_rdistance;
    }
    if (min_lmargin > 0 || min_rmargin > 0) {
      for (int i = 0; i < row_infos.size(); i++) {
        row_infos[i].pix_ldistance -= min_lmargin;
        row_infos[i].pix_rdistance -= min_rmargin;
      }
    }
  }
  // Run the paragraph detection algorithm.
  GenericVector<PARA *> row_owners;
  if (!is_image_block) {
    DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
                     models);
  } else {
    row_owners.init_to_size(row_infos.size(), NULL);
    CanonicalizeDetectionResults(&row_owners, block->para_list());
  }
  // Now stitch in the row_owners into the rows.
  // Rows without a ROW_RES were skipped when row_infos was built, so they
  // are skipped again here to keep the indices aligned.
  row = *block_start;
  for (int i = 0; i < row_owners.size(); i++) {
    while (!row.PageResIt()->row())
      row.Next(RIL_TEXTLINE);
    row.PageResIt()->row()->row->set_para(row_owners[i]);
    row.Next(RIL_TEXTLINE);
  }
}
} // namespace
| void tesseract::DiscardUnusedModels | ( | const GenericVector< RowScratchRegisters > & | rows, |
| ParagraphTheory * | theory | ||
| ) |
Definition at line 1458 of file paragraphs.cpp.
{
rows[i].StrongHypotheses(&used_models);
}
theory->DiscardUnusedModels(used_models);
}
// DowngradeWeakestToCrowns:
// Forget any flush-{left, right} models unless we see two or more
| long tesseract::dist2 | ( | int | x1, |
| int | y1, | ||
| int | x2, | ||
| int | y2 | ||
| ) |
Definition at line 192 of file pdfrenderer.cpp.
{
  // Squared Euclidean distance between (x1, y1) and (x2, y2).
  // The coordinate differences are widened to long before being multiplied,
  // so the products are not evaluated in int arithmetic, where they could
  // overflow before being converted to the long return type.
  const long dx = static_cast<long>(x2) - x1;
  const long dy = static_cast<long>(y2) - y1;
  return dx * dx + dy * dy;
}
| void tesseract::DowngradeWeakestToCrowns | ( | int | debug_level, |
| ParagraphTheory * | theory, | ||
| GenericVector< RowScratchRegisters > * | rows | ||
| ) |
Definition at line 1491 of file paragraphs.cpp.
{
// Search back for a body line of a unique type.
const ParagraphModel *model = NULL;
while (end > 0 &&
(model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {
end--;
}
if (end == 0) break;
start = end - 1;
while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
start--; // walk back to the first line that is not the same body type.
}
if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
StrongModel(model) &&
NearlyEqual(model->first_indent(), model->body_indent(),
model->tolerance())) {
start--;
}
start++;
// Now rows[start, end) is a sequence of unique body hypotheses of model.
if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
continue;
if (!StrongModel(model)) {
while (start > 0 &&
CrownCompatible(rows, start - 1, start, model))
start--;
}
if (start == 0 ||
(!StrongModel(model)) ||
(StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
// crownify rows[start, end)
const ParagraphModel *crown_model = model;
if (StrongModel(model)) {
if (model->justification() == JUSTIFICATION_LEFT)
crown_model = kCrownLeft;
else
crown_model = kCrownRight;
}
(*rows)[start].SetUnknown();
(*rows)[start].AddStartLine(crown_model);
for (int row = start + 1; row < end; row++) {
(*rows)[row].SetUnknown();
(*rows)[row].AddBodyLine(crown_model);
}
}
}
DiscardUnusedModels(*rows, theory);
}
// Clear all hypotheses about lines [start, end) and reset margins.
| tesseract::ELISTIZE | ( | AmbigSpec | ) |
| tesseract::ELISTIZE | ( | ViterbiStateEntry | ) |
| tesseract::ELISTIZEH | ( | AmbigSpec | ) |
| tesseract::ELISTIZEH | ( | ViterbiStateEntry | ) |
---------------------------------------------------------------------------- Public Code ----------------------------------------------------------------------------
Definition at line 46 of file blobclass.cpp.
{
// Sets *fontname from classify_font_name; if that is still the
// kUnknownFontName placeholder, tries to derive the name from filename.
// NOTE(review): the signature is not shown here; filename and fontname are
// presumably the function's parameters - confirm against blobclass.cpp.
*fontname = classify_font_name;
if (*fontname == kUnknownFontName) {
// filename is expected to be of the form [lang].[fontname].exp[num]
// The [lang], [fontname] and [num] fields should not have '.' characters.
// The first dot is searched for after the last '/', if there is one; the
// last dot is searched for over the whole filename.
const char *basename = strrchr(filename.string(), '/');
const char *firstdot = strchr(basename ? basename : filename.string(), '.');
const char *lastdot = strrchr(filename.string(), '.');
// Two distinct dots are needed to delimit a font name field.
if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
++firstdot;
*fontname = firstdot;
// Keep only the characters between the first and the last dot.
fontname->truncate_at(lastdot - firstdot);
}
}
}
| void tesseract::ExtractFontProperties | ( | const string & | utf8_text, |
| StringRenderer * | render, | ||
| const string & | output_base | ||
| ) |
Definition at line 212 of file text2image.cpp.
{
// Renders utf8_text with render and collects per-character spacing data
// from consecutive pairs of rendered boxes, then writes the result to
// output_base + ".fontinfo".
// Output layout, as built below: a first line holding the number of
// characters, then one line per character with
// "<char> <x_gap_before> <x_gap_after> <kern count>" followed by
// " <other char> <gap>" for each recorded kerned pair.
map<string, SpacingProperties> spacing_map;
map<string, SpacingProperties>::iterator spacing_map_it0;
map<string, SpacingProperties>::iterator spacing_map_it1;
int x_bearing, x_advance;
int len = utf8_text.length();
int offset = 0;
const char* text = utf8_text.c_str();
while (offset < len) {
// RenderToImage's return value is used to advance through the text.
offset += render->RenderToImage(text + offset, strlen(text + offset), NULL);
const vector<BoxChar*> &boxes = render->GetBoxes();
// If the page break split a bigram, correct the offset so we try the bigram
// on the next iteration.
if (boxes.size() > 2 && !IsWhitespaceBox(boxes[boxes.size() - 1]) &&
IsWhitespaceBox(boxes[boxes.size() - 2])) {
if (boxes.size() > 3) {
tprintf("WARNING: Adjusting to bad page break after '%s%s'\n",
boxes[boxes.size() - 4]->ch().c_str(),
boxes[boxes.size() - 3]->ch().c_str());
}
offset -= boxes[boxes.size() - 1]->ch().size();
}
// Walk the boxes two at a time, skipping whitespace boxes before each pair.
for (int b = 0; b < boxes.size(); b += 2) {
while (b < boxes.size() && IsWhitespaceBox(boxes[b])) ++b;
if (b + 1 >= boxes.size()) break;
const string &ch0 = boxes[b]->ch();
// We encountered a ligature. This happens in at least two scenarios:
// One is when the rendered bigram forms a grapheme cluster (eg. the
// second character in the bigram is a combining vowel), in which case we
// correctly output only one bounding box.
// A second far less frequent case is when some fonts like 'DejaVu
// Sans Ultra-Light' force Pango to render a ligatured character even if
// the input consists of the separated characters. NOTE(ranjith): As per
// behdad@ this is not currently controllable at the level of the Pango
// API.
// Safeguard against these cases here by just skipping the bigram.
if (IsWhitespaceBox(boxes[b+1])) {
continue;
}
// Horizontal gap between the right edge of box b and the left edge of
// box b+1.
int xgap = (boxes[b+1]->box()->x -
(boxes[b]->box()->x + boxes[b]->box()->w));
spacing_map_it0 = spacing_map.find(ch0);
// NOTE(review): ok_count is only incremented for characters that are
// newly added to spacing_map in this iteration, so the kerned-gap check
// below only fires when both characters are seen for the first time
// here - confirm this is intended.
int ok_count = 0;
if (spacing_map_it0 == spacing_map.end() &&
render->font().GetSpacingProperties(ch0, &x_bearing, &x_advance)) {
spacing_map[ch0] = SpacingProperties(
x_bearing, x_advance - x_bearing - boxes[b]->box()->w);
spacing_map_it0 = spacing_map.find(ch0);
++ok_count;
}
const string &ch1 = boxes[b+1]->ch();
tlog(3, "%s%s\n", ch0.c_str(), ch1.c_str());
spacing_map_it1 = spacing_map.find(ch1);
if (spacing_map_it1 == spacing_map.end() &&
render->font().GetSpacingProperties(ch1, &x_bearing, &x_advance)) {
spacing_map[ch1] = SpacingProperties(
x_bearing, x_advance - x_bearing - boxes[b+1]->box()->w);
spacing_map_it1 = spacing_map.find(ch1);
++ok_count;
}
// Record the pair as kerned when the measured gap differs from the sum
// of the two characters' individual gaps.
if (ok_count == 2 && xgap != (spacing_map_it0->second.x_gap_after +
spacing_map_it1->second.x_gap_before)) {
spacing_map_it0->second.kerned_x_gaps[ch1] = xgap;
}
}
render->ClearBoxes();
}
// Serialize the collected spacing data.
string output_string;
const int kBufSize = 1024;
char buf[kBufSize];
snprintf(buf, kBufSize, "%d\n", static_cast<int>(spacing_map.size()));
output_string.append(buf);
map<string, SpacingProperties>::const_iterator spacing_map_it;
for (spacing_map_it = spacing_map.begin();
spacing_map_it != spacing_map.end(); ++spacing_map_it) {
snprintf(buf, kBufSize,
"%s %d %d %d", spacing_map_it->first.c_str(),
spacing_map_it->second.x_gap_before,
spacing_map_it->second.x_gap_after,
static_cast<int>(spacing_map_it->second.kerned_x_gaps.size()));
output_string.append(buf);
map<string, int>::const_iterator kern_it;
for (kern_it = spacing_map_it->second.kerned_x_gaps.begin();
kern_it != spacing_map_it->second.kerned_x_gaps.end(); ++kern_it) {
snprintf(buf, kBufSize,
" %s %d", kern_it->first.c_str(), kern_it->second);
output_string.append(buf);
}
output_string.append("\n");
}
File::WriteStringToFileOrDie(output_string, output_base + ".fontinfo");
}
| bool tesseract::FirstWordWouldHaveFit | ( | const RowScratchRegisters & | before, |
| const RowScratchRegisters & | after, | ||
| tesseract::ParagraphJustification | justification | ||
| ) |
Definition at line 1624 of file paragraphs.cpp.
{
tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
}
int available_space;
if (justification == JUSTIFICATION_CENTER) {
available_space = before.lindent_ + before.rindent_;
} else {
available_space = before.OffsideIndent(justification);
}
available_space -= before.ri_->average_interword_space;
if (before.ri_->ltr)
return after.ri_->lword_box.width() < available_space;
return after.ri_->rword_box.width() < available_space;
}
// Return whether the first word on the after line can fit in the space at
// the end of the before line (not knowing which way the text goes) in a left
| bool tesseract::FirstWordWouldHaveFit | ( | const RowScratchRegisters & | before, |
| const RowScratchRegisters & | after | ||
| ) |
Definition at line 1649 of file paragraphs.cpp.
{
| void tesseract::FontInfoDeleteCallback | ( | FontInfo | f | ) |
Definition at line 139 of file fontinfo.cpp.
{
  // Frees the heap data held by a FontInfo: the name array and, if present,
  // the spacing vector together with the objects it points to.
  delete[] f.name;
  if (f.spacing_vec == NULL)
    return;
  f.spacing_vec->delete_data_pointers();
  delete f.spacing_vec;
}
| void tesseract::FontSetDeleteCallback | ( | FontSet | fs | ) |
Definition at line 146 of file fontinfo.cpp.
{
// Frees the configs array held by the FontSet.
delete[] fs.configs;
}
Definition at line 240 of file normstrngs.cpp.
{
if (ch != 0x3000) return ch;
}
// Special case for fullwidth left and right "white parentheses".
if (ch == 0xFF5F) return 0x2985;
if (ch == 0xFF60) return 0x2986;
// Construct a full-to-half width transliterator.
IcuErrorCode error_code;
icu::UnicodeString uch_str(static_cast<UChar32>(ch));
const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
"Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
error_code.assertSuccess();
error_code.reset();
fulltohalf->transliterate(uch_str);
delete fulltohalf;
ASSERT_HOST(uch_str.length() != 0);
return uch_str[0];
}
| void tesseract::GeometricClassify | ( | int | debug_level, |
| GenericVector< RowScratchRegisters > * | rows, | ||
| int | row_start, | ||
| int | row_end, | ||
| ParagraphTheory * | theory | ||
| ) |
Definition at line 1080 of file paragraphs.cpp.
{
// Tries to classify rows[row_start, row_end) into paragraphs from the tab
// stops on each side (s.left_tabs / s.right_tabs) alone. On success a model
// is added to theory and the rows are marked via MarkRowsWithModel; the
// function returns early, marking nothing, when the tab stops are too many,
// too few, or ambiguous.
if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
return;
if (debug_level > 1) {
tprintf("###############################################\n");
tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n",
row_start, row_end);
tprintf("###############################################\n");
}
RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
GeometricClassifierState s(debug_level, rows, row_start, row_end);
if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
s.Fail(2, "Too much variety for simple outline classification.");
return;
}
if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
s.Fail(1, "Not enough variety for simple outline classification.");
return;
}
// Exactly three tab stops in total is handled by a dedicated routine.
if (s.left_tabs.size() + s.right_tabs.size() == 3) {
GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
return;
}
// At this point, we know that one side has at least two tab stops, and the
// other side has one or two tab stops.
// Left to determine:
// (1) Which is the body indent and which is the first line indent?
// (2) Is the text fully justified?
// If one side happens to have three or more tab stops, assume that side
// is opposite of the aligned side.
if (s.right_tabs.size() > 2) {
s.AssumeLeftJustification();
} else if (s.left_tabs.size() > 2) {
s.AssumeRightJustification();
} else if (s.ltr) { // guess based on script direction
s.AssumeLeftJustification();
} else {
s.AssumeRightJustification();
}
if (s.AlignTabs().size() == 2) {
// For each tab stop on the aligned side, how many of them appear
// to be paragraph start lines? [first lines]
int firsts[2] = {0, 0};
// Count the first line as a likely paragraph start line.
firsts[s.AlignsideTabIndex(s.row_start)]++;
// For each line, if the first word would have fit on the previous
// line count it as a likely paragraph start line.
bool jam_packed = true;
for (int i = s.row_start + 1; i < s.row_end; i++) {
if (s.FirstWordWouldHaveFit(i - 1, i)) {
firsts[s.AlignsideTabIndex(i)]++;
jam_packed = false;
}
}
// Make an extra accounting for the last line of the paragraph just
// in case it's the only short line in the block. That is, take its
// first word as typical and see if this looks like the *last* line
// of a paragraph. If so, mark the *other* indent as probably a first.
if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
}
// Share of lines at each aligned-side tab stop that look like first lines,
// as an integer percentage.
int percent0firsts, percent1firsts;
percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
// TODO(eger): Tune these constants if necessary.
if ((percent0firsts < 20 && 30 < percent1firsts) ||
percent0firsts + 30 < percent1firsts) {
s.first_indent = s.AlignTabs()[1].center;
s.body_indent = s.AlignTabs()[0].center;
} else if ((percent1firsts < 20 && 30 < percent0firsts) ||
percent1firsts + 30 < percent0firsts) {
s.first_indent = s.AlignTabs()[0].center;
s.body_indent = s.AlignTabs()[1].center;
} else {
// Ambiguous! Probably lineated (poetry)
if (debug_level > 1) {
tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
s.AlignTabs()[0].center, percent0firsts);
tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
s.AlignTabs()[1].center, percent1firsts);
s.PrintRows();
}
return;
}
} else {
// There's only one tab stop for the "aligned to" side.
s.first_indent = s.body_indent = s.AlignTabs()[0].center;
}
// At this point, we have our model.
const ParagraphModel *model = theory->AddModel(s.Model());
// Now all we have to do is figure out if the text is fully justified or not.
// eop_threshold: default to fully justified unless we see evidence below.
// See description on MarkRowsWithModel()
s.eop_threshold =
(s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
// If the text is not fully justified, re-set the eop_threshold to 0.
if (s.AlignTabs().size() == 2) {
// Paragraphs with a paragraph-start indent.
for (int i = s.row_start; i < s.row_end - 1; i++) {
if (ValidFirstLine(s.rows, i + 1, model) &&
!NearlyEqual(s.OffsideTabs()[0].center,
(*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
// We found a non-end-of-paragraph short line: not fully justified.
s.eop_threshold = 0;
break;
}
}
} else {
// Paragraphs with no paragraph-start indent.
for (int i = s.row_start; i < s.row_end - 1; i++) {
if (!s.FirstWordWouldHaveFit(i, i + 1) &&
!NearlyEqual(s.OffsideTabs()[0].center,
(*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
// We found a non-end-of-paragraph short line: not fully justified.
s.eop_threshold = 0;
break;
}
}
}
MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
}
// =============== Implementation of ParagraphTheory =====================
| void tesseract::GeometricClassifyThreeTabStopTextBlock | ( | int | debug_level, |
| GeometricClassifierState & | s, | ||
| ParagraphTheory * | theory | ||
| ) |
Definition at line 988 of file paragraphs.cpp.
{
// Handles a block with three tab stops in total (GeometricClassify calls
// this when left_tabs.size() + right_tabs.size() == 3).
// Justification is guessed from the script direction s.ltr. The rows are
// then either marked with a newly added model or, in case B2, as a crown
// paragraph.
// NOTE(review): the case labels A1/A2/B1/B2 below are not defined in this
// body - see the documentation in paragraphs.cpp.
int num_rows = s.row_end - s.row_start;
int num_full_rows = 0;
// 1 if the final row of the range is a full row, otherwise 0.
int last_row_full = 0;
for (int i = s.row_start; i < s.row_end; i++) {
if (s.IsFullRow(i)) {
num_full_rows++;
if (i == s.row_end - 1) last_row_full++;
}
}
// Give up unless at least 70% of the rows are full.
if (num_full_rows < 0.7 * num_rows) {
s.Fail(1, "Not enough full lines to know which lines start paras.");
return;
}
// eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
s.eop_threshold = 0;
if (s.ltr) {
s.AssumeLeftJustification();
} else {
s.AssumeRightJustification();
}
if (debug_level > 0) {
tprintf("# Not enough variety for clear outline classification. "
"Guessing these are %s aligned based on script.\n",
s.ltr ? "left" : "right");
s.PrintRows();
}
if (s.AlignTabs().size() == 2) { // case A1 or A2
s.first_indent = s.AlignTabs()[1].center;
s.body_indent = s.AlignTabs()[0].center;
} else { // case B1 or B2
// True when every row other than the last one is full.
if (num_rows - 1 == num_full_rows - last_row_full) {
// case B2
const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
(*s.rows)[s.row_start].AddStartLine(model);
for (int i = s.row_start + 1; i < s.row_end; i++) {
(*s.rows)[i].AddBodyLine(model);
}
return;
} else {
// case B1
s.first_indent = s.body_indent = s.AlignTabs()[0].center;
s.eop_threshold = (s.OffsideTabs()[0].center +
s.OffsideTabs()[1].center) / 2;
}
}
const ParagraphModel *model = theory->AddModel(s.Model());
MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
s.ltr, s.eop_threshold);
return;
}
// This function is called if strong textual clues were not available, but
// the caller hopes that the paragraph breaks will be super obvious just
| void tesseract::GetWordBaseline | ( | int | writing_direction, |
| int | ppi, | ||
| int | height, | ||
| int | word_x1, | ||
| int | word_y1, | ||
| int | word_x2, | ||
| int | word_y2, | ||
| int | line_x1, | ||
| int | line_y1, | ||
| int | line_x2, | ||
| int | line_y2, | ||
| double * | x0, | ||
| double * | y0, | ||
| double * | length | ||
| ) |
Definition at line 204 of file pdfrenderer.cpp.
{
  // Projects the start point of a word onto its text line and reports that
  // point, together with the word length, scaled by 72 / ppi.
  //   *x0, *y0: projected start point; y is measured down from height.
  //   *length:  distance between the two word points, scaled by 72 / ppi.
  // For right-to-left text the two word points are swapped first, so the
  // projected point is the word's right-hand one.
  if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
    Swap(&word_x1, &word_x2);
    Swap(&word_y1, &word_y2);
  }
  // Default covers a degenerate (zero length) line: use its first point.
  double proj_x = line_x1;
  double proj_y = line_y1;
  const double line_len2 = dist2(line_x1, line_y1, line_x2, line_y2);
  if (line_len2 != 0) {
    const int line_dx = line_x2 - line_x1;
    const int line_dy = line_y2 - line_y1;
    // Parameter of the orthogonal projection, measured from the line's
    // second point.
    const double t = ((word_x1 - line_x2) * line_dx +
                      (word_y1 - line_y2) * line_dy) / line_len2;
    proj_x = line_x2 + t * line_dx;
    proj_y = line_y2 + t * line_dy;
  }
  const double word_pixels = sqrt(static_cast<double>(
      dist2(word_x1, word_y1, word_x2, word_y2)));
  *x0 = proj_x * 72 / ppi;
  *y0 = height - (proj_y * 72.0 / ppi);
  *length = word_pixels * 72.0 / ppi;
}
| Pix* tesseract::GridReducedPix | ( | const TBOX & | box, |
| int | gridsize, | ||
| ICOORD | bleft, | ||
| int * | left, | ||
| int * | bottom | ||
| ) |
Definition at line 212 of file bbgrid.cpp.
{
  // Creates a 1 bit deep Pix that covers box in grid coordinates, padded by
  // one grid cell on every side. The grid coordinates of the Pix's left and
  // bottom edges are returned through *left and *bottom.
  const int origin_x = bleft.x();
  const int origin_y = bleft.y();
  const int cell_left = (box.left() - origin_x) / gridsize - 1;
  const int cell_bottom = (box.bottom() - origin_y) / gridsize - 1;
  const int cell_right = (box.right() - origin_x) / gridsize + 1;
  const int cell_top = (box.top() - origin_y) / gridsize + 1;
  *left = cell_left;
  *bottom = cell_bottom;
  const int pix_width = cell_right - cell_left + 1;
  const int pix_height = cell_top - cell_bottom + 1;
  return pixCreate(pix_width, pix_height, 1);
}
| void tesseract::HistogramRect | ( | Pix * | src_pix, |
| int | channel, | ||
| int | left, | ||
| int | top, | ||
| int | width, | ||
| int | height, | ||
| int * | histogram | ||
| ) |
Definition at line 158 of file otsuthr.cpp.
{
  // Fills histogram (kHistogramSize entries, zeroed first) with the counts of
  // the byte values of one channel inside the rectangle given by
  // left, top, width and height of src_pix.
  PERF_COUNT_START("HistogramRect")
  const int bytes_per_pixel = pixGetDepth(src_pix) / 8;
  // Clamp the requested channel to one the image actually has.
  channel = ClipToRange(channel, 0, bytes_per_pixel - 1);
  memset(histogram, 0, sizeof(*histogram) * kHistogramSize);
  const int words_per_line = pixGetWpl(src_pix);
  l_uint32* image_data = pixGetData(src_pix);
  const int row_limit = top + height;
  const int col_limit = left + width;
  for (int row = top; row < row_limit; ++row) {
    const l_uint32* row_data = image_data + row * words_per_line;
    for (int col = left; col < col_limit; ++col) {
      int value = GET_DATA_BYTE(const_cast<void*>(
                                    reinterpret_cast<const void *>(row_data)),
                                col * bytes_per_pixel + channel);
      ++histogram[value];
    }
  }
  PERF_COUNT_END
}
| STRING tesseract::HOcrEscape | ( | const char * | text | ) |
Escape a char string - replace &<>"' with HTML codes.
| void tesseract::InitializeRowInfo | ( | bool | after_recognition, |
| const MutableIterator & | it, | ||
| RowInfo * | info | ||
| ) |
Definition at line 2414 of file paragraphs.cpp.
{
ROW *row = it.PageResIt()->row()->row;
info->pix_ldistance = row->lmargin();
info->pix_rdistance = row->rmargin();
info->average_interword_space =
row->space() > 0 ? row->space() : MAX(row->x_height(), 1);
info->pix_xheight = row->x_height();
info->has_leaders = false;
info->has_drop_cap = row->has_drop_cap();
info->ltr = true; // set below depending on word scripts
} else {
info->pix_ldistance = info->pix_rdistance = 0;
info->average_interword_space = 1;
info->pix_xheight = 1.0;
info->has_leaders = false;
info->has_drop_cap = false;
info->ltr = true;
}
info->num_words = 0;
info->lword_indicates_list_item = false;
info->lword_likely_starts_idea = false;
info->lword_likely_ends_idea = false;
info->rword_indicates_list_item = false;
info->rword_likely_starts_idea = false;
info->rword_likely_ends_idea = false;
info->has_leaders = false;
info->ltr = 1;
if (!after_recognition) {
InitializeTextAndBoxesPreRecognition(it, info);
return;
}
info->text = "";
char *text = it.GetUTF8Text(RIL_TEXTLINE);
int trailing_ws_idx = strlen(text); // strip trailing space
while (trailing_ws_idx > 0 &&
// isspace() only takes ASCII
((text[trailing_ws_idx - 1] & 0x80) == 0) &&
isspace(text[trailing_ws_idx - 1]))
trailing_ws_idx--;
if (trailing_ws_idx > 0) {
int lspaces = info->pix_ldistance / info->average_interword_space;
for (int i = 0; i < lspaces; i++)
info->text += ' ';
for (int i = 0; i < trailing_ws_idx; i++)
info->text += text[i];
}
delete []text;
if (info->text.size() == 0) {
return;
}
PAGE_RES_IT page_res_it = *it.PageResIt();
GenericVector<WERD_RES *> werds;
WERD_RES *word_res = page_res_it.restart_row();
ROW_RES *this_row = page_res_it.row();
int num_leaders = 0;
int ltr = 0;
int rtl = 0;
do {
if (word_res && word_res->best_choice->unichar_string().length() > 0) {
werds.push_back(word_res);
ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
}
word_res = page_res_it.forward();
} while (page_res_it.row() == this_row);
info->ltr = ltr >= rtl;
info->has_leaders = num_leaders > 3;
info->num_words = werds.size();
if (werds.size() > 0) {
WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
info->lword_text = lword->best_choice->unichar_string().string();
info->rword_text = rword->best_choice->unichar_string().string();
info->lword_box = lword->word->bounding_box();
info->rword_box = rword->word->bounding_box();
LeftWordAttributes(lword->uch_set, lword->best_choice,
info->lword_text,
&info->lword_indicates_list_item,
&info->lword_likely_starts_idea,
&info->lword_likely_ends_idea);
RightWordAttributes(rword->uch_set, rword->best_choice,
info->rword_text,
&info->rword_indicates_list_item,
&info->rword_likely_starts_idea,
&info->rword_likely_ends_idea);
}
}
// This is called after rows have been identified and words are recognized.
// Much of this could be implemented before word recognition, but text helps
| void tesseract::InitializeTextAndBoxesPreRecognition | ( | const MutableIterator & | it, |
| RowInfo * | info | ||
| ) |
Definition at line 2362 of file paragraphs.cpp.
{
do {
fake_text += "x";
if (first_word) info->lword_text += "x";
info->rword_text += "x";
if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
fake_text += " ";
info->rword_text = "";
first_word = false;
}
} while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
pit.Next(RIL_SYMBOL));
}
if (fake_text.size() == 0) return;
int lspaces = info->pix_ldistance / info->average_interword_space;
for (int i = 0; i < lspaces; i++) {
info->text += ' ';
}
info->text += fake_text;
// Set up lword_box, rword_box, and num_words.
PAGE_RES_IT page_res_it = *it.PageResIt();
WERD_RES *word_res = page_res_it.restart_row();
ROW_RES *this_row = page_res_it.row();
WERD_RES *lword = NULL;
WERD_RES *rword = NULL;
info->num_words = 0;
do {
if (word_res) {
if (!lword) lword = word_res;
if (rword != word_res) info->num_words++;
rword = word_res;
}
word_res = page_res_it.forward();
} while (page_res_it.row() == this_row);
if (lword) info->lword_box = lword->word->bounding_box();
if (rword) info->rword_box = rword->word->bounding_box();
}
// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
| ParagraphModel tesseract::InternalParagraphModelByOutline | ( | const GenericVector< RowScratchRegisters > * | rows, |
| int | start, | ||
| int | end, | ||
| int | tolerance, | ||
| bool * | consistent | ||
| ) |
Definition at line 1695 of file paragraphs.cpp.
{
ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
}
bool ltr = (ltr_line_count >= (end - start) / 2);
*consistent = true;
if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
return ParagraphModel();
// Ensure the caller only passed us a region with a common rmargin and
// lmargin.
int lmargin = (*rows)[start].lmargin_;
int rmargin = (*rows)[start].rmargin_;
int lmin, lmax, rmin, rmax, cmin, cmax;
lmin = lmax = (*rows)[start + 1].lindent_;
rmin = rmax = (*rows)[start + 1].rindent_;
cmin = cmax = 0;
for (int i = start + 1; i < end; i++) {
if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
tprintf("Margins don't match! Software error.\n");
*consistent = false;
return ParagraphModel();
}
UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
}
int ldiff = lmax - lmin;
int rdiff = rmax - rmin;
int cdiff = cmax - cmin;
if (rdiff > tolerance && ldiff > tolerance) {
if (cdiff < tolerance * 2) {
if (end - start < 3)
return ParagraphModel();
return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
}
*consistent = false;
return ParagraphModel();
}
if (end - start < 3) // Don't return a model for two line paras.
return ParagraphModel();
// These booleans keep us from saying something is aligned left when the body
// left variance is too large.
bool body_admits_left_alignment = ldiff < tolerance;
bool body_admits_right_alignment = rdiff < tolerance;
ParagraphModel left_model =
ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
(lmin + lmax) / 2, tolerance);
ParagraphModel right_model =
ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
(rmin + rmax) / 2, tolerance);
// These booleans keep us from having an indent on the "wrong side" for the
// first line.
bool text_admits_left_alignment = ltr || left_model.is_flush();
bool text_admits_right_alignment = !ltr || right_model.is_flush();
// At least one of the edges is less than tolerance in variance.
// If the other is obviously ragged, it can't be the one aligned to.
// [Note the last line is included in this raggedness.]
if (tolerance < rdiff) {
if (body_admits_left_alignment && text_admits_left_alignment)
return left_model;
*consistent = false;
return ParagraphModel();
}
if (tolerance < ldiff) {
if (body_admits_right_alignment && text_admits_right_alignment)
return right_model;
*consistent = false;
return ParagraphModel();
}
// At this point, we know the body text doesn't vary much on either side.
// If the first line juts out oddly in one direction or the other,
// that likely indicates the side aligned to.
int first_left = (*rows)[start].lindent_;
int first_right = (*rows)[start].rindent_;
if (ltr && body_admits_left_alignment &&
(first_left < lmin || first_left > lmax))
return left_model;
if (!ltr && body_admits_right_alignment &&
(first_right < rmin || first_right > rmax))
return right_model;
*consistent = false;
return ParagraphModel();
}
// Examine rows[start, end) and try to determine what sort of ParagraphModel
// would fit them as a single paragraph. If nothing fits,
| int tesseract::InterwordSpace | ( | const GenericVector< RowScratchRegisters > & | rows, |
| int | row_start, | ||
| int | row_end | ||
| ) |
Definition at line 1601 of file paragraphs.cpp.
{
if (rows[i].ri_->num_words > 1) {
spacing_widths.add(rows[i].ri_->average_interword_space, 1);
}
}
int minimum_reasonable_space = word_height / 3;
if (minimum_reasonable_space < 2)
minimum_reasonable_space = 2;
int median = spacing_widths.median();
return (median > minimum_reasonable_space)
? median : minimum_reasonable_space;
}
// Return whether the first word on the after line can fit in the space at
// the end of the before line (knowing which way the text is aligned and read).
| bool tesseract::is_double_quote | ( | const char32 | ch | ) |
Definition at line 98 of file normstrngs.cpp.
{
'"',
0x201C, // left double quotation mark (English, others)
0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
0x201F, // double high-reversed-9 quotation mark (PropList.txt)
0x2033, // double prime
0x301D, // reversed double prime quotation mark (East Asian langs, horiz.)
0x301E, // close double prime (East Asian languages written horizontally)
0xFF02, // fullwidth quotation mark
};
for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
if (kDoubleQuoteUnicodes[i] == ch)
return true;
}
return false;
}
| bool tesseract::is_hyphen_punc | ( | const char32 | ch | ) |
Definition at line 59 of file normstrngs.cpp.
{
'-',
0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
0x207b, // superscript minus
0x208b, // subscript minus
0x2212, // minus sign
0xfe58, // small em dash
0xfe63, // small hyphen-minus
0xff0d, // fullwidth hyphen-minus
};
for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
if (kHyphenPuncUnicodes[i] == ch)
return true;
}
return false;
}
| bool tesseract::is_single_quote | ( | const char32 | ch | ) |
Definition at line 78 of file normstrngs.cpp.
{
'\'',
'`',
0x2018, // left single quotation mark (English, others)
0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
// We may have to introduce a comma set with 0x201a
0x201B, // single high-reveresed-9 quotation mark (PropList.txt)
0x2032, // prime
0x300C, // left corner bracket (East Asian languages)
0xFF07, // fullwidth apostrophe
};
for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
if (kSingleQuoteUnicodes[i] == ch)
return true;
}
return false;
}
| bool tesseract::IsDigitLike | ( | int | ch | ) |
Definition at line 200 of file paragraphs.cpp.
{
return strchr("'\"({[", ch) != NULL;
| bool tesseract::IsInterchangeValid | ( | const char32 | ch | ) |
Definition at line 209 of file normstrngs.cpp.
{
| bool tesseract::IsInterchangeValid7BitAscii | ( | const char32 | ch | ) |
Definition at line 233 of file normstrngs.cpp.
{
| bool tesseract::IsLatinLetter | ( | int | ch | ) |
Definition at line 196 of file paragraphs.cpp.
{
return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
| bool tesseract::IsLeftIndented | ( | const EquationDetect::IndentType | type | ) | [inline] |
Definition at line 95 of file equationdetect.cpp.
{
  // A line counts as left-indented when its indent type is LEFT_INDENT or
  // BOTH_INDENT; every other indent type yields false.
  switch (type) {
    case EquationDetect::LEFT_INDENT:
    case EquationDetect::BOTH_INDENT:
      return true;
    default:
      return false;
  }
}
Definition at line 167 of file normstrngs.cpp.
{
| bool tesseract::IsOpeningPunct | ( | int | ch | ) |
Definition at line 204 of file paragraphs.cpp.
{
return strchr(":'\".?!]})", ch) != NULL;
| bool tesseract::IsRightIndented | ( | const EquationDetect::IndentType | type | ) | [inline] |
Definition at line 100 of file equationdetect.cpp.
{
  // A line counts as right-indented when its indent type is RIGHT_INDENT or
  // BOTH_INDENT; every other indent type yields false.
  switch (type) {
    case EquationDetect::RIGHT_INDENT:
    case EquationDetect::BOTH_INDENT:
      return true;
    default:
      return false;
  }
}
| bool tesseract::IsTerminalPunct | ( | int | ch | ) |
Definition at line 208 of file paragraphs.cpp.
{
| bool tesseract::IsTextOrEquationType | ( | PolyBlockType | type | ) | [inline] |
Definition at line 91 of file equationdetect.cpp.
{
  // Accept anything PTIsTextType() accepts, and additionally PT_EQUATION.
  if (PTIsTextType(type)) {
    return true;
  }
  return type == PT_EQUATION;
}
| bool tesseract::IsUTF8Whitespace | ( | const char * | text | ) |
Definition at line 183 of file normstrngs.cpp.
{
| bool tesseract::IsValidCodepoint | ( | const char32 | ch | ) |
Definition at line 171 of file normstrngs.cpp.
{
| bool tesseract::IsWhitespace | ( | const char32 | ch | ) |
Definition at line 177 of file normstrngs.cpp.
: 0x%x\n", ch); return u_isUWhiteSpace(static_cast<UChar32>(ch)); }
| void tesseract::LeftoverSegments | ( | const GenericVector< RowScratchRegisters > & | rows, |
| GenericVector< Interval > * | to_fix, | ||
| int | row_start, | ||
| int | row_end | ||
| ) |
Definition at line 2184 of file paragraphs.cpp.
{
bool needs_fixing = false;
SetOfModels models;
SetOfModels models_w_crowns;
rows[i].StrongHypotheses(&models);
rows[i].NonNullHypotheses(&models_w_crowns);
if (models.empty() && models_w_crowns.size() > 0) {
// Crown paragraph. Is it followed by a modeled line?
for (int end = i + 1; end < rows.size(); end++) {
SetOfModels end_models;
SetOfModels strong_end_models;
rows[end].NonNullHypotheses(&end_models);
rows[end].StrongHypotheses(&strong_end_models);
if (end_models.size() == 0) {
needs_fixing = true;
break;
} else if (strong_end_models.size() > 0) {
needs_fixing = false;
break;
}
}
} else if (models.empty() && rows[i].ri_->num_words > 0) {
// No models at all.
needs_fixing = true;
}
if (!needs_fixing && !models.empty()) {
needs_fixing = RowIsStranded(rows, i);
}
if (needs_fixing) {
if (!to_fix->empty() && to_fix->back().end == i - 1)
to_fix->back().end = i;
else
to_fix->push_back(Interval(i, i));
}
}
// Convert inclusive intervals to half-open intervals.
for (int i = 0; i < to_fix->size(); i++) {
(*to_fix)[i].end = (*to_fix)[i].end + 1;
}
}
// Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
// normalize each row_owner to point to an actual PARA, and output the
| void tesseract::LeftWordAttributes | ( | const UNICHARSET * | unicharset, |
| const WERD_CHOICE * | werd, | ||
| const STRING & | utf8, | ||
| bool * | is_list, | ||
| bool * | starts_idea, | ||
| bool * | ends_idea | ||
| ) |
Definition at line 397 of file paragraphs.cpp.
{ // Empty
*ends_idea = true;
return;
}
if (unicharset && werd) { // We have a proper werd and unicharset so use it.
if (UniLikelyListItem(unicharset, werd)) {
*is_list = true;
*starts_idea = true;
*ends_idea = true;
}
if (unicharset->get_isupper(werd->unichar_id(0))) {
*starts_idea = true;
}
if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
*starts_idea = true;
*ends_idea = true;
}
} else { // Assume utf8 is mostly ASCII
if (AsciiLikelyListItem(utf8)) {
*is_list = true;
*starts_idea = true;
}
int start_letter = utf8[0];
if (IsOpeningPunct(start_letter)) {
*starts_idea = true;
}
if (IsTerminalPunct(start_letter)) {
*ends_idea = true;
}
if (start_letter >= 'A' && start_letter <= 'Z') {
*starts_idea = true;
}
}
}
// Given the rightmost word of a line either as a Tesseract unicharset + werd
// or a utf8 string, set the following attributes for it:
| bool tesseract::LikelyListMark | ( | const STRING & | word | ) |
Definition at line 265 of file paragraphs.cpp.
{
return LikelyListMark(word) || LikelyListNumeral(word);
| bool tesseract::LikelyListMarkUnicode | ( | int | ch | ) |
Definition at line 331 of file paragraphs.cpp.
{
// TODO(eger) expand this list of unicodes as needed.
case 0x00B0: // degree sign
case 0x2022: // bullet
case 0x25E6: // white bullet
case 0x00B7: // middle dot
case 0x25A1: // white square
case 0x25A0: // black square
case 0x25AA: // black small square
case 0x2B1D: // black very small square
case 0x25BA: // black right-pointing pointer
case 0x25CF: // black circle
case 0x25CB: // white circle
return true;
default:
break; // fall through
}
return false;
}
// Return whether it is very likely that this is a numeral marker that could
// start a list item. Some examples include:
| bool tesseract::LikelyListNumeral | ( | const STRING & | word | ) |
Definition at line 231 of file paragraphs.cpp.
{(";
const char *kSep = ":;-.,";
const char *kClose = "]})";
int num_segments = 0;
const char *pos = word.string();
while (*pos != '\0' && num_segments < 3) {
// skip up to two open parens.
const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
const char *numeral_end = SkipChars(numeral_start, kRomans);
if (numeral_end != numeral_start) {
// Got Roman Numeral. Great.
} else {
numeral_end = SkipChars(numeral_start, kDigits);
if (numeral_end == numeral_start) {
// If there's a single latin letter, we can use that.
numeral_end = SkipChars(numeral_start, IsLatinLetter);
if (numeral_end - numeral_start != 1)
break;
}
}
// We got some sort of numeral.
num_segments++;
// Skip any trailing parens or punctuation.
pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
if (pos == numeral_end)
break;
}
return *pos == '\0';
}
bool LikelyListMark(const STRING &word) {
const char *kListMarks = "0Oo*.,+.";
| bool tesseract::LikelyParagraphStart | ( | const RowScratchRegisters & | before, |
| const RowScratchRegisters & | after | ||
| ) |
Definition at line 1675 of file paragraphs.cpp.
| bool tesseract::LikelyParagraphStart | ( | const RowScratchRegisters & | before, |
| const RowScratchRegisters & | after, | ||
| tesseract::ParagraphJustification | j | ||
| ) |
Definition at line 1682 of file paragraphs.cpp.
{
| bool tesseract::LoadDataFromFile | ( | const STRING & | filename, |
| GenericVector< char > * | data | ||
| ) | [inline] |
Definition at line 356 of file genericvector.h.
{
  // Reads the entire contents of `filename` into *data, followed by one
  // extra 0 byte. Returns true only if the file could be opened, its size
  // could be determined, and every byte was read. An empty file succeeds and
  // leaves *data holding just the padding byte.
  FILE* fp = fopen(filename.string(), "rb");
  if (fp == NULL) return false;

  // Measure the file by seeking to its end. ftell() reports failure as -1L;
  // keep the result signed so that failure is detected instead of being
  // turned into an enormous unsigned size.
  long size = -1;
  if (fseek(fp, 0, SEEK_END) == 0) size = ftell(fp);
  if (size < 0 || fseek(fp, 0, SEEK_SET) != 0) {
    fclose(fp);
    return false;
  }

  // The buffer holds size + 1 bytes and init_to_size() takes an int, so
  // reject sizes whose padded length does not survive the conversion
  // (files of 2GB or more, or a bogus size reported for a non-regular file).
  const unsigned long padded_size = static_cast<unsigned long>(size) + 1;
  const int alloc_size = static_cast<int>(padded_size);
  if (alloc_size <= 0 ||
      static_cast<unsigned long>(alloc_size) != padded_size) {
    fclose(fp);
    return false;
  }

  // Pad with a 0, just in case we treat the result as a string.
  data->init_to_size(alloc_size, 0);
  const size_t bytes_wanted = static_cast<size_t>(size);
  bool result = fread(&(*data)[0], 1, bytes_wanted, fp) == bytes_wanted;
  fclose(fp);
  return result;
}
| ShapeTable * tesseract::LoadShapeTable | ( | const STRING & | file_prefix | ) |
Definition at line 118 of file commontraining.cpp.
{
  // Attempts to read a ShapeTable from "<file_prefix><kShapeTableFileSuffix>".
  // Returns a newly allocated table on success, or NULL when the file cannot
  // be opened or fails to deserialize. The outcome is reported via tprintf.
  STRING table_path = file_prefix;
  table_path += kShapeTableFileSuffix;

  FILE* table_fp = fopen(table_path.string(), "rb");
  if (table_fp == NULL) {
    tprintf("Warning: No shape table file present: %s\n",
            table_path.string());
    return NULL;
  }

  ShapeTable* table = new ShapeTable;
  if (table->DeSerialize(false, table_fp)) {
    int num_shapes = table->NumShapes();
    tprintf("Read shape table %s of %d shapes\n",
            table_path.string(), num_shapes);
  } else {
    // A table that failed to load is discarded.
    delete table;
    table = NULL;
    tprintf("Error: Failed to read shape table %s\n",
            table_path.string());
  }
  fclose(table_fp);
  return table;
}
| MasterTrainer * tesseract::LoadTrainingData | ( | int | argc, |
| const char *const * | argv, | ||
| bool | replication, | ||
| ShapeTable ** | shape_table, | ||
| STRING * | file_prefix | ||
| ) |
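Creates a MasterTrainer and loads the training data into it: Initializes feature_defs and IntegerFX. Loads the shape_table if shape_table != NULL. Loads initial unicharset from -U command-line option. If FLAGS_T is set, loads the majority of data from there, else: loads font info from the -F option, loads x-heights from the -X option, loads samples from the .tr files named in the remaining command-line arguments, runs the post-load cleanup, and, if FLAGS_output_trainer is set, saves the trainer to that file.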
Definition at line 175 of file commontraining.cpp.
{
// Builds a MasterTrainer either from .tr files named on the command line
// (FLAGS_T empty) or from a previously serialized trainer (FLAGS_T set).
// Returns the new trainer, or NULL after deleting it if a required load or
// the unicharset save fails. *file_prefix is set to "<FLAGS_D>/" or "".
// When shape_table is non-NULL, *shape_table is filled with a loaded table
// or a freshly built flat one.
InitFeatureDefs(&feature_defs);
InitIntegerFX();
*file_prefix = "";
if (!FLAGS_D.empty()) {
*file_prefix += FLAGS_D.c_str();
*file_prefix += "/";
}
// If we are shape clustering (NULL shape_table) or we successfully load
// a shape_table written by a previous shape clustering, then
// shape_analysis will be true, meaning that the MasterTrainer will replace
// some members of the unicharset with their fragments.
bool shape_analysis = false;
if (shape_table != NULL) {
*shape_table = LoadShapeTable(*file_prefix);
if (*shape_table != NULL)
shape_analysis = true;
} else {
shape_analysis = true;
}
MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
shape_analysis,
replication,
FLAGS_debug_level);
IntFeatureSpace fs;
fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
if (FLAGS_T.empty()) {
// No serialized trainer given: assemble everything from individual inputs.
trainer->LoadUnicharset(FLAGS_U.c_str());
// Get basic font information from font_properties.
if (!FLAGS_F.empty()) {
if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
delete trainer;
return NULL;
}
}
// The x-heights file (-X) is optional, but a failed load is fatal.
if (!FLAGS_X.empty()) {
if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
delete trainer;
return NULL;
}
}
trainer->SetFeatureSpace(fs);
const char* page_name;
// Load training data from .tr files on the command line.
while ((page_name = GetNextFilename(argc, argv)) != NULL) {
tprintf("Reading %s ...\n", page_name);
trainer->ReadTrainingSamples(page_name, feature_defs, false);
// If there is a file with [lang].[fontname].exp[num].fontinfo present,
// read font spacing information in to fontinfo_table.
// The buffer holds the name minus its last 2 chars, plus "fontinfo" (8)
// and the terminating NUL: (len - 2) + 8 + 1 == len + 7.
// NOTE(review): assumes page_name has at least 2 characters (ends in "tr");
// a shorter name would make pagename_len - 2 negative - confirm callers.
int pagename_len = strlen(page_name);
char *fontinfo_file_name = new char[pagename_len + 7];
strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
trainer->AddSpacingInfo(fontinfo_file_name);
delete[] fontinfo_file_name;
// Load the images into memory if required by the classifier.
if (FLAGS_load_images) {
STRING image_name = page_name;
// Chop off the tr and replace with tif. Extension must be tif!
image_name.truncate_at(image_name.length() - 2);
image_name += "tif";
trainer->LoadPageImages(image_name.string());
}
}
trainer->PostLoadCleanup();
// Write the master trainer if required.
// Failure to create the output file is only reported; loading continues.
if (!FLAGS_output_trainer.empty()) {
FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
if (fp == NULL) {
tprintf("Can't create saved trainer data!\n");
} else {
trainer->Serialize(fp);
fclose(fp);
}
}
} else {
// A serialized trainer was given with -T: restore it instead of reading
// the individual inputs. Both an unreadable file and a failed DeSerialize
// leave success false and abort the load.
bool success = false;
tprintf("Loading master trainer from file:%s\n",
FLAGS_T.c_str());
FILE* fp = fopen(FLAGS_T.c_str(), "rb");
if (fp == NULL) {
tprintf("Can't read file %s to initialize master trainer\n",
FLAGS_T.c_str());
} else {
success = trainer->DeSerialize(false, fp);
fclose(fp);
}
if (!success) {
tprintf("Deserialize of master trainer failed!\n");
delete trainer;
return NULL;
}
trainer->SetFeatureSpace(fs);
}
trainer->PreTrainingSetup();
// Optionally save the trainer's unicharset to the -O file; failure is fatal.
if (!FLAGS_O.empty() &&
!trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
delete trainer;
return NULL;
}
if (shape_table != NULL) {
// If we previously failed to load a shapetable, then shape clustering
// wasn't run so make a flat one now.
if (*shape_table == NULL) {
*shape_table = new ShapeTable;
trainer->SetupFlatShapeTable(*shape_table);
tprintf("Flat shape table summary: %s\n",
(*shape_table)->SummaryStr().string());
}
(*shape_table)->set_unicharset(trainer->unicharset());
}
return trainer;
}
| bool tesseract::MakeIndividualGlyphs | ( | Pix * | pix, |
| const vector< BoxChar * > & | vbox, | ||
| const int | input_tiff_page | ||
| ) |
Definition at line 309 of file text2image.cpp.
{
  // Writes one JPEG per usable character box on page `input_tiff_page`:
  // each box is clipped from `pix`, scaled to a square of
  // FLAGS_glyph_resized_size, padded with a border of
  // FLAGS_glyph_num_border_pixels_to_pad, converted to 8 bits and saved as
  // "<FLAGS_outputbase>_<n>.jpg", where n is a counter that persists across
  // calls. Returns true if at least one glyph was saved, false otherwise
  // (including when the up-front argument/flag checks fail).
  //
  // If checks fail, return false without exiting text2image
  if (!pix) {
    tprintf("ERROR: MakeIndividualGlyphs(): Input Pix* is NULL\n");
    return false;
  } else if (FLAGS_glyph_resized_size <= 0) {
    tprintf("ERROR: --glyph_resized_size must be positive\n");
    return false;
  } else if (FLAGS_glyph_num_border_pixels_to_pad < 0) {
    tprintf("ERROR: --glyph_num_border_pixels_to_pad must be 0 or positive\n");
    return false;
  }

  const int n_boxes = vbox.size();
  int n_boxes_saved = 0;
  int current_tiff_page = 0;
  int y_previous = 0;
  static int glyph_count = 0;  // Keeps output file names unique across calls.

  for (int i = 0; i < n_boxes; i++) {
    // Get one bounding box
    Box* b = vbox[i]->mutable_box();
    if (!b) continue;
    const int x = b->x;
    const int y = b->y;
    const int w = b->w;
    const int h = b->h;
    // Check present tiff page (for multipage tiff)
    // A y that jumps back by more than 10% of the image height relative to
    // the last saved glyph is taken as the start of the next page.
    // NOTE(review): y_previous is only updated after a glyph is saved, so it
    // is not advanced while boxes of earlier pages are skipped - confirm that
    // page detection works as intended for input_tiff_page > 0.
    if (y < y_previous-pixGetHeight(pix)/10) {
      tprintf("ERROR: Wrap-around encountered, at i=%d\n", i);
      current_tiff_page++;
    }
    if (current_tiff_page < input_tiff_page) continue;
    else if (current_tiff_page > input_tiff_page) break;
    // Check box validity
    if (x < 0 || y < 0 ||
        (x+w-1) >= pixGetWidth(pix) ||
        (y+h-1) >= pixGetHeight(pix)) {
      tprintf("ERROR: MakeIndividualGlyphs(): Index out of range, at i=%d"
              " (x=%d, y=%d, w=%d, h=%d)\n", i, x, y, w, h);
      continue;
    } else if (w < FLAGS_glyph_num_border_pixels_to_pad &&
               h < FLAGS_glyph_num_border_pixels_to_pad) {
      tprintf("ERROR: Input image too small to be a character, at i=%d\n", i);
      continue;
    }
    // Crop the boxed character
    Pix* pix_glyph = pixClipRectangle(pix, b, NULL);
    if (!pix_glyph) {
      tprintf("ERROR: MakeIndividualGlyphs(): Failed to clip, at i=%d\n", i);
      continue;
    }
    // Resize to square
    Pix* pix_glyph_sq = pixScaleToSize(pix_glyph,
                                       FLAGS_glyph_resized_size,
                                       FLAGS_glyph_resized_size);
    if (!pix_glyph_sq) {
      tprintf("ERROR: MakeIndividualGlyphs(): Failed to resize, at i=%d\n", i);
      pixDestroy(&pix_glyph);  // Don't leak the clipped image.
      continue;
    }
    // Zero-pad
    Pix* pix_glyph_sq_pad = pixAddBorder(pix_glyph_sq,
                                         FLAGS_glyph_num_border_pixels_to_pad,
                                         0);
    if (!pix_glyph_sq_pad) {
      tprintf("ERROR: MakeIndividualGlyphs(): Failed to zero-pad, at i=%d\n",
              i);
      pixDestroy(&pix_glyph);  // Don't leak the earlier intermediates.
      pixDestroy(&pix_glyph_sq);
      continue;
    }
    // Write out
    Pix* pix_glyph_sq_pad_8 = pixConvertTo8(pix_glyph_sq_pad, false);
    char filename[1024];
    snprintf(filename, 1024, "%s_%d.jpg", FLAGS_outputbase.c_str(),
             glyph_count++);
    // pixWriteJpeg() returns non-zero on failure.
    const bool write_failed =
        pixWriteJpeg(filename, pix_glyph_sq_pad_8, 100, 0) != 0;
    // Release every intermediate image whether or not the write succeeded;
    // pixDestroy() ignores a Pix* that is already NULL.
    pixDestroy(&pix_glyph);
    pixDestroy(&pix_glyph_sq);
    pixDestroy(&pix_glyph_sq_pad);
    pixDestroy(&pix_glyph_sq_pad_8);
    if (write_failed) {
      tprintf("ERROR: MakeIndividualGlyphs(): Failed to write JPEG to %s,"
              " at i=%d\n", filename, i);
      continue;
    }
    n_boxes_saved++;
    y_previous = y;
  }

  if (n_boxes_saved == 0) {
    return false;
  } else {
    tprintf("Total number of characters saved = %d\n", n_boxes_saved);
    return true;
  }
}
| void tesseract::MarkRowsWithModel | ( | GenericVector< RowScratchRegisters > * | rows, |
| int | row_start, | ||
| int | row_end, | ||
| const ParagraphModel * | model, | ||
| bool | ltr, | ||
| int | eop_threshold | ||
| ) |
Definition at line 810 of file paragraphs.cpp.
{
  // For each row in rows[row_start, row_end), records `model` on the row as
  // either a start-line or a body-line hypothesis, depending on which of
  // ValidFirstLine / ValidBodyLine accept the row. Rows accepted by neither
  // are left untouched.
  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
    return;

  for (int r = row_start; r < row_end; r++) {
    const bool fits_first = ValidFirstLine(rows, r, model);
    const bool fits_body = ValidBodyLine(rows, r, model);
    if (!fits_first && !fits_body)
      continue;  // Stray row: nothing to record.

    bool is_start;
    if (fits_first != fits_body) {
      // Exactly one test passed, so the classification is unambiguous.
      is_start = fits_first;
    } else if (r == row_start) {
      // Ambiguous, but the first row of the range is treated as a start.
      is_start = true;
    } else if (eop_threshold > 0) {
      // Ambiguous: compare the previous row's trailing-side indent (right
      // indent for left-justified models, left indent otherwise) against
      // the end-of-paragraph threshold.
      is_start = (model->justification() == JUSTIFICATION_LEFT)
                     ? (*rows)[r - 1].rindent_ > eop_threshold
                     : (*rows)[r - 1].lindent_ > eop_threshold;
    } else {
      // Ambiguous and no threshold given: fall back to the word-fit test
      // against the previous row.
      is_start = FirstWordWouldHaveFit((*rows)[r - 1], (*rows)[r],
                                       model->justification());
    }

    if (is_start) {
      (*rows)[r].AddStartLine(model);
    } else {
      (*rows)[r].AddBodyLine(model);
    }
  }
}
// GeometricClassifierState holds all of the information we'll use while
// trying to determine a paragraph model for the text lines in a block of
| void tesseract::MarkStrongEvidence | ( | GenericVector< RowScratchRegisters > * | rows, |
| int | row_start, | ||
| int | row_end | ||
| ) |
Definition at line 1833 of file paragraphs.cpp.
{
const RowScratchRegisters &prev = (*rows)[i - 1];
RowScratchRegisters &curr = (*rows)[i];
tesseract::ParagraphJustification typical_justification =
prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
if (!curr.ri_->rword_likely_starts_idea &&
!curr.ri_->lword_likely_starts_idea &&
!FirstWordWouldHaveFit(prev, curr, typical_justification)) {
curr.SetBodyLine();
}
}
// Record patently obvious start paragraph lines.
//
// It's an extremely good signal of the start of a paragraph that
// the first word would have fit on the end of the previous line.
// However, applying just that signal would have us mark random
// start lines of lineated text (poetry and source code) and some
// centered headings as paragraph start lines. Therefore, we use
// a second qualification for a paragraph start: Not only should
// the first word of this line have fit on the previous line,
// but also, this line should go full to the right of the block,
// disallowing a subsequent word from having fit on this line.
// First row:
{
RowScratchRegisters &curr = (*rows)[row_start];
RowScratchRegisters &next = (*rows)[row_start + 1];
tesseract::ParagraphJustification j =
curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
if (curr.GetLineType() == LT_UNKNOWN &&
!FirstWordWouldHaveFit(curr, next, j) &&
(curr.ri_->lword_likely_starts_idea ||
curr.ri_->rword_likely_starts_idea)) {
curr.SetStartLine();
}
}
// Middle rows
for (int i = row_start + 1; i < row_end - 1; i++) {
RowScratchRegisters &prev = (*rows)[i - 1];
RowScratchRegisters &curr = (*rows)[i];
RowScratchRegisters &next = (*rows)[i + 1];
tesseract::ParagraphJustification j =
curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
if (curr.GetLineType() == LT_UNKNOWN &&
!FirstWordWouldHaveFit(curr, next, j) &&
LikelyParagraphStart(prev, curr, j)) {
curr.SetStartLine();
}
}
// Last row
{ // the short circuit at the top means we have at least two lines.
RowScratchRegisters &prev = (*rows)[row_end - 2];
RowScratchRegisters &curr = (*rows)[row_end - 1];
tesseract::ParagraphJustification j =
curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
if (curr.GetLineType() == LT_UNKNOWN &&
!FirstWordWouldHaveFit(curr, curr, j) &&
LikelyParagraphStart(prev, curr, j)) {
curr.SetStartLine();
}
}
}
// Look for sequences of a start line followed by some body lines in
// rows[row_start, row_end) and create ParagraphModels for them if
| void tesseract::ModelStrongEvidence | ( | int | debug_level, |
| GenericVector< RowScratchRegisters > * | rows, | ||
| int | row_start, | ||
| int | row_end, | ||
| bool | allow_flush_models, | ||
| ParagraphTheory * | theory | ||
| ) |
Definition at line 1903 of file paragraphs.cpp.
{
// Walks rows[row_start, row_end) looking for a row typed LT_START followed
// by rows that look like its body, builds a ParagraphModel for each such run,
// and records the model on the rows (start line first, body lines after).
// Non-flush models are always added to `theory`; flush models are added only
// when allow_flush_models is set, except that a flush run beginning at
// row_start is marked with kCrownLeft / kCrownRight instead.
// NOTE(review): AcceptableRowArgs is defined elsewhere; presumably it
// validates the range and requires at least 2 rows here - confirm.
if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
return;
int start = row_start;
while (start < row_end) {
// Advance to the next row already classified as a paragraph start.
while (start < row_end && (*rows)[start].GetLineType() != LT_START)
start++;
// Stop when no start row remains that has at least one row after it.
if (start >= row_end - 1)
break;
// Tolerance is derived from the interword space of the row after the start.
int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
int end = start;
ParagraphModel last_model;
bool next_consistent;
// Grow the candidate paragraph rows[start, end) one row at a time.
do {
++end;
// rows[row, end) was consistent.
// If rows[row, end + 1) is not consistent,
// just model rows[row, end)
// Only rows before index row_end - 1 are tested as continuations, so the
// final row of the range is never absorbed by this loop.
if (end < row_end - 1) {
RowScratchRegisters &next = (*rows)[end];
LineType lt = next.GetLineType();
// A row continues the paragraph if it is a known body line, or if it is
// unclassified and FirstWordWouldHaveFit() is false for the previous row
// and this one.
next_consistent = lt == LT_BODY ||
(lt == LT_UNKNOWN &&
!FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
} else {
next_consistent = false;
}
if (next_consistent) {
// Check that the outline of rows[start, end + 1) still admits a model;
// InternalParagraphModelByOutline may reset next_consistent.
ParagraphModel next_model = InternalParagraphModelByOutline(
rows, start, end + 1, tolerance, &next_consistent);
// Stop extending if the previous model was justified on the side matching
// the start row's text direction (left for ltr, right otherwise) and adding
// this row changes that justification.
if (((*rows)[start].ri_->ltr &&
last_model.justification() == JUSTIFICATION_LEFT &&
next_model.justification() != JUSTIFICATION_LEFT) ||
(!(*rows)[start].ri_->ltr &&
last_model.justification() == JUSTIFICATION_RIGHT &&
next_model.justification() != JUSTIFICATION_RIGHT)) {
next_consistent = false;
}
last_model = next_model;
} else {
next_consistent = false;
}
} while (next_consistent && end < row_end);
// At this point, rows[start, end) looked like it could have been a
// single paragraph. If we can make a good ParagraphModel for it,
// do so and mark this sequence with that model.
if (end > start + 1) {
// emit a new paragraph if we have more than one line.
const ParagraphModel *model = NULL;
// Recompute the model over the final range, with a tolerance based on the
// interword space of the whole run.
ParagraphModel new_model = ParagraphModelByOutline(
debug_level, rows, start, end,
Epsilon(InterwordSpace(*rows, start, end)));
if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
// couldn't create a good model, oh well.
} else if (new_model.is_flush()) {
if (end == start + 2) {
// It's very likely we just got two paragraph starts in a row.
// Shrink the run to the start row only; no model is recorded, and the
// second row is re-examined on the next outer iteration.
end = start + 1;
} else if (start == row_start) {
// Mark this as a Crown.
if (new_model.justification() == JUSTIFICATION_LEFT) {
model = kCrownLeft;
} else {
model = kCrownRight;
}
} else if (allow_flush_models) {
model = theory->AddModel(new_model);
}
} else {
model = theory->AddModel(new_model);
}
// Record the chosen model: first row as the start line, the rest as body.
if (model) {
(*rows)[start].AddStartLine(model);
for (int i = start + 1; i < end; i++) {
(*rows)[i].AddBodyLine(model);
}
}
}
// Resume scanning at the first row not consumed by this run.
start = end;
}
}
// We examine rows[row_start, row_end) and do the following:
// (1) Clear all existing hypotheses for the rows being considered.
| void tesseract::NormalizeChar32 | ( | char32 | ch, |
| GenericVector< char32 > * | str | ||
| ) |
Definition at line 132 of file normstrngs.cpp.
{
// If any spaces were added by NFKC, pretend normalization is a nop.
if (norm_str[i] == ' ') {
str->clear();
str->push_back(ch);
break;
} else {
str->push_back(OCRNormalize(static_cast<char32>(norm_str[i])));
}
}
}
| uinT8 tesseract::NormalizeDirection | ( | uinT8 | dir, |
| const FCOORD & | unnormed_pos, | ||
| const DENORM & | denorm, | ||
| const DENORM * | root_denorm | ||
| ) |
Definition at line 171 of file intfx.cpp.
{
  // Maps a direction code through the normalization transform: the direction
  // is turned into a point offset from unnormed_pos, both points are passed
  // through denorm.NormTransform(), and the direction of their difference is
  // returned.
  FCOORD tip;
  tip.from_direction(dir);  // Convert direction to a vector.
  tip += unnormed_pos;

  FCOORD origin_normed;
  FCOORD tip_normed;
  denorm.NormTransform(root_denorm, unnormed_pos, &origin_normed);
  denorm.NormTransform(root_denorm, tip, &tip_normed);

  tip_normed -= origin_normed;
  return tip_normed.to_direction();
}
| STRING tesseract::NormalizeUTF8String | ( | const char * | str8 | ) |
Definition at line 117 of file normstrngs.cpp.
{
norm_str.clear();
NormalizeChar32(str32[i], &norm_str);
for (int j = 0; j < norm_str.length(); ++j) {
out_str32.push_back(norm_str[j]);
}
}
STRING out_str8;
UTF32ToUTF8(out_str32, &out_str8);
return out_str8;
}
Definition at line 157 of file normstrngs.cpp.
{
| int tesseract::OtsuStats | ( | const int * | histogram, |
| int * | H_out, | ||
| int * | omega0_out | ||
| ) |
Definition at line 183 of file otsuthr.cpp.
{
H += histogram[i];
mu_T += static_cast<double>(i) * histogram[i];
}
// Now maximize sig_sq_B over t.
// http://www.ctie.monash.edu.au/hargreave/Cornall_Terry_328.pdf
int best_t = -1;
int omega_0, omega_1;
int best_omega_0 = 0;
double best_sig_sq_B = 0.0;
double mu_0, mu_1, mu_t;
omega_0 = 0;
mu_t = 0.0;
for (int t = 0; t < kHistogramSize - 1; ++t) {
omega_0 += histogram[t];
mu_t += t * static_cast<double>(histogram[t]);
if (omega_0 == 0)
continue;
omega_1 = H - omega_0;
if (omega_1 == 0)
break;
mu_0 = mu_t / omega_0;
mu_1 = (mu_T - mu_t) / omega_1;
double sig_sq_B = mu_1 - mu_0;
sig_sq_B *= sig_sq_B * omega_0 * omega_1;
if (best_t < 0 || sig_sq_B > best_sig_sq_B) {
best_sig_sq_B = sig_sq_B;
best_t = t;
best_omega_0 = omega_0;
}
}
if (H_out != NULL) *H_out = H;
if (omega0_out != NULL) *omega0_out = best_omega_0;
return best_t;
}
| int tesseract::OtsuThreshold | ( | Pix * | src_pix, |
| int | left, | ||
| int | top, | ||
| int | width, | ||
| int | height, | ||
| int ** | thresholds, | ||
| int ** | hi_values | ||
| ) |
Definition at line 40 of file otsuthr.cpp.
{
// Computes a per-channel threshold for the rectangle (left, top, width,
// height) of src_pix using OtsuStats on each channel's histogram.
// Outputs:
//   *thresholds - new[]-allocated array, one entry per channel; an entry
//                 stays -1 when the channel is reported as empty.
//   *hi_values  - new[]-allocated array, one entry per channel; 0 or 1 when
//                 the channel gives a convincing answer, otherwise -1.
// Returns the number of channels (pixel depth / 8).
// NOTE(review): neither output array is freed here, so presumably the caller
// must delete[] both - confirm against callers.
int num_channels = pixGetDepth(src_pix) / 8;
// Of all channels with no good hi_value, keep the best so we can always
// produce at least one answer.
PERF_COUNT_START("OtsuThreshold")
int best_hi_value = 1;
int best_hi_index = 0;
bool any_good_hivalue = false;
double best_hi_dist = 0.0;
*thresholds = new int[num_channels];
*hi_values = new int[num_channels];
// all of channel 0 then all of channel 1...
// Only the OpenCL path below fills this buffer; it is allocated and freed
// unconditionally.
int *histogramAllChannels = new int[kHistogramSize * num_channels];
// only use opencl if compiled w/ OpenCL and selected device is opencl
#ifdef USE_OPENCL
// Calculate Histogram on GPU
OpenclDevice od;
// The GPU path is only taken for 1- or 4-channel images whose rectangle
// starts at the image origin.
if (od.selectedDeviceIsOpenCL() &&
(num_channels == 1 || num_channels == 4) && top == 0 && left == 0 ) {
od.HistogramRectOCL(
(const unsigned char*)pixGetData(src_pix),
num_channels,
pixGetWpl(src_pix) * 4,
left,
top,
width,
height,
kHistogramSize,
histogramAllChannels);
// Calculate Threshold from Histogram on cpu
for (int ch = 0; ch < num_channels; ++ch) {
(*thresholds)[ch] = -1;
(*hi_values)[ch] = -1;
int *histogram = &histogramAllChannels[kHistogramSize * ch];
int H;
int best_omega_0;
int best_t = OtsuStats(histogram, &H, &best_omega_0);
if (best_omega_0 == 0 || best_omega_0 == H) {
// This channel is empty.
continue;
}
// To be a convincing foreground we must have a small fraction of H
// or to be a convincing background we must have a large fraction of H.
// In between we assume this channel contains no thresholding information.
int hi_value = best_omega_0 < H * 0.5;
(*thresholds)[ch] = best_t;
if (best_omega_0 > H * 0.75) {
any_good_hivalue = true;
(*hi_values)[ch] = 0;
} else if (best_omega_0 < H * 0.25) {
any_good_hivalue = true;
(*hi_values)[ch] = 1;
} else {
// In case all channels are like this, keep the best of the bad lot.
double hi_dist = hi_value ? (H - best_omega_0) : best_omega_0;
if (hi_dist > best_hi_dist) {
best_hi_dist = hi_dist;
best_hi_value = hi_value;
best_hi_index = ch;
}
}
}
} else {
#endif
// CPU path: same per-channel decision logic as the OpenCL branch above,
// but each histogram is computed by HistogramRect into a local array.
for (int ch = 0; ch < num_channels; ++ch) {
(*thresholds)[ch] = -1;
(*hi_values)[ch] = -1;
// Compute the histogram of the image rectangle.
int histogram[kHistogramSize];
HistogramRect(src_pix, ch, left, top, width, height, histogram);
int H;
int best_omega_0;
int best_t = OtsuStats(histogram, &H, &best_omega_0);
if (best_omega_0 == 0 || best_omega_0 == H) {
// This channel is empty.
continue;
}
// To be a convincing foreground we must have a small fraction of H
// or to be a convincing background we must have a large fraction of H.
// In between we assume this channel contains no thresholding information.
int hi_value = best_omega_0 < H * 0.5;
(*thresholds)[ch] = best_t;
if (best_omega_0 > H * 0.75) {
any_good_hivalue = true;
(*hi_values)[ch] = 0;
} else if (best_omega_0 < H * 0.25) {
any_good_hivalue = true;
(*hi_values)[ch] = 1;
} else {
// In case all channels are like this, keep the best of the bad lot.
double hi_dist = hi_value ? (H - best_omega_0) : best_omega_0;
if (hi_dist > best_hi_dist) {
best_hi_dist = hi_dist;
best_hi_value = hi_value;
best_hi_index = ch;
}
}
}
#ifdef USE_OPENCL
}
#endif // USE_OPENCL
delete[] histogramAllChannels;
if (!any_good_hivalue) {
// Use the best of the ones that were not good enough.
// With no candidate recorded this writes the defaults: hi_value 1 on
// channel 0.
(*hi_values)[best_hi_index] = best_hi_value;
}
PERF_COUNT_END
return num_channels;
}
| ParagraphModel tesseract::ParagraphModelByOutline | ( | int | debug_level, |
| const GenericVector< RowScratchRegisters > * | rows, | ||
| int | start, | ||
| int | end, | ||
| int | tolerance | ||
| ) |
Definition at line 1796 of file paragraphs.cpp.
{
  // The internal helper also reports a consistency flag; this entry point
  // has no use for it, so it is received and dropped.
  bool consistency_ignored;
  ParagraphModel model = InternalParagraphModelByOutline(
      rows, start, end, tolerance, &consistency_ignored);
  // At debug level 2 and above, report rows for which no justification
  // could be determined.
  if (debug_level >= 2) {
    if (model.justification() == JUSTIFICATION_UNKNOWN) {
      tprintf("Could not determine a model for this paragraph:\n");
      PrintRowRange(*rows, start, end);
    }
  }
  return model;
}
// Do rows[start, end) form a single instance of the given paragraph model?
bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
| int tesseract::ParamsTrainingFeatureByName | ( | const char * | name | ) |
Definition at line 26 of file params_training_featdef.cpp.
{
  // Looks up name in kParamsTrainingFeatureTypeName and returns its index,
  // or -1 when name is NULL or matches no entry. NULL table entries are
  // skipped.
  const int num_names = sizeof(kParamsTrainingFeatureTypeName) /
                        sizeof(kParamsTrainingFeatureTypeName[0]);
  if (name != NULL) {
    for (int idx = 0; idx < num_names; ++idx) {
      const char* candidate = kParamsTrainingFeatureTypeName[idx];
      if (candidate != NULL && strcmp(name, candidate) == 0)
        return idx;
    }
  }
  return -1;
}
| void tesseract::ParseCommandLineFlags | ( | const char * | usage, |
| int * | argc, | ||
| char *** | argv, | ||
| const bool | remove_flags | ||
| ) |
Definition at line 312 of file commandlineflags.cpp.
{
// Thin wrapper: forwards all four arguments unchanged to InitGoogle().
// NOTE(review): the exact effect of remove_flags on *argc / *argv is defined
// by InitGoogle, which is not shown here - confirm there.
InitGoogle(usage, argc, argv, remove_flags);
}
| double tesseract::prec | ( | double | x | ) |
Definition at line 184 of file pdfrenderer.cpp.
{
  // Rounds x to three decimal places (scale by 1000, round, scale back).
  const double scale = 1000.0;
  const double rounded = round(x * scale) / scale;
  // A comparison with zero is true for both +0.0 and -0.0, so any zero
  // result is returned as plain positive zero.
  return (rounded == 0) ? 0 : rounded;
}
| bool tesseract::PSM_BLOCK_FIND_ENABLED | ( | int | pageseg_mode | ) | [inline] |
Definition at line 191 of file publictypes.h.
{
  // True for every mode in the closed range
  // [PSM_AUTO_OSD, PSM_SINGLE_COLUMN]; relies on the enum ordering.
  if (pageseg_mode < PSM_AUTO_OSD)
    return false;
  return pageseg_mode <= PSM_SINGLE_COLUMN;
}
| bool tesseract::PSM_COL_FIND_ENABLED | ( | int | pageseg_mode | ) | [inline] |
Definition at line 185 of file publictypes.h.
{
  // True for every mode in the closed range [PSM_AUTO_OSD, PSM_AUTO];
  // relies on the enum ordering.
  if (pageseg_mode < PSM_AUTO_OSD)
    return false;
  return pageseg_mode <= PSM_AUTO;
}
| bool tesseract::PSM_LINE_FIND_ENABLED | ( | int | pageseg_mode | ) | [inline] |
Definition at line 194 of file publictypes.h.
{
  // True for every mode in the closed range
  // [PSM_AUTO_OSD, PSM_SINGLE_BLOCK]; relies on the enum ordering.
  if (pageseg_mode < PSM_AUTO_OSD)
    return false;
  return pageseg_mode <= PSM_SINGLE_BLOCK;
}
| bool tesseract::PSM_ORIENTATION_ENABLED | ( | int | pageseg_mode | ) | [inline] |
Definition at line 182 of file publictypes.h.
{
  // PSM_SPARSE_TEXT_OSD is the one mode accepted explicitly; otherwise
  // every mode up to and including PSM_AUTO qualifies.
  if (pageseg_mode == PSM_SPARSE_TEXT_OSD)
    return true;
  return pageseg_mode <= PSM_AUTO;
}
| bool tesseract::PSM_OSD_ENABLED | ( | int | pageseg_mode | ) | [inline] |
Inline functions that act on a PageSegMode to determine whether components of layout analysis are enabled. *Depend critically on the order of elements of PageSegMode.* NOTE that arg is an int for compatibility with INT_PARAM.
Definition at line 179 of file publictypes.h.
{
  // PSM_SPARSE_TEXT_OSD is the one mode accepted explicitly; otherwise
  // every mode up to and including PSM_AUTO_OSD qualifies.
  if (pageseg_mode == PSM_SPARSE_TEXT_OSD)
    return true;
  return pageseg_mode <= PSM_AUTO_OSD;
}
| bool tesseract::PSM_SPARSE | ( | int | pageseg_mode | ) | [inline] |
Definition at line 188 of file publictypes.h.
{
  // Only the two sparse-text modes count as sparse.
  if (pageseg_mode == PSM_SPARSE_TEXT)
    return true;
  return pageseg_mode == PSM_SPARSE_TEXT_OSD;
}
| bool tesseract::PSM_WORD_FIND_ENABLED | ( | int | pageseg_mode | ) | [inline] |
Definition at line 197 of file publictypes.h.
{
  // Both sparse-text modes are accepted explicitly.
  if (pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD)
    return true;
  // Otherwise the mode must lie in [PSM_AUTO_OSD, PSM_SINGLE_LINE].
  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE;
}
| bool tesseract::read_info | ( | FILE * | f, |
| FontInfo * | fi, | ||
| bool | swap | ||
| ) |
Definition at line 152 of file fontinfo.cpp.
{
  // Reads one serialized font record from f into fi: a 32-bit name length,
  // that many name bytes (no terminator is stored in the file), then
  // fi->properties. When swap is true the integer fields are byte-reversed
  // after reading. Returns false on a short read or an invalid length.
  inT32 size;
  if (fread(&size, sizeof(size), 1, f) != 1) return false;
  if (swap)
    Reverse32(&size);
  // A negative length can only come from a corrupt (or wrongly swapped)
  // file; reject it instead of passing a bogus size to operator new[].
  if (size < 0) return false;
  char* font_name = new char[size + 1];
  // Terminate up front so the name is a valid C string even if the read
  // below comes up short.
  font_name[size] = '\0';
  // Stored in fi before reading, so the buffer stays reachable through fi
  // on the failure paths below.
  fi->name = font_name;
  if (static_cast<int>(fread(font_name, sizeof(*font_name), size, f)) != size)
    return false;
  if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
  if (swap)
    Reverse32(&fi->properties);
  return true;
}
| bool tesseract::read_set | ( | FILE * | f, |
| FontSet * | fs, | ||
| bool | swap | ||
| ) |
Definition at line 240 of file fontinfo.cpp.
{
  // Reads one serialized FontSet from f: the entry count followed by that
  // many config values. When swap is true each integer is byte-reversed
  // after reading. Returns false on a short read or an invalid count.
  if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
  if (swap)
    Reverse32(&fs->size);
  // A negative count can only come from a corrupt (or wrongly swapped)
  // file; reject it instead of passing a bogus size to operator new[].
  if (fs->size < 0) return false;
  fs->configs = new int[fs->size];
  for (int i = 0; i < fs->size; ++i) {
    if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
    if (swap)
      Reverse32(&fs->configs[i]);
  }
  return true;
}
| bool tesseract::read_spacing_info | ( | FILE * | f, |
| FontInfo * | fi, | ||
| bool | swap | ||
| ) |
Definition at line 177 of file fontinfo.cpp.
{
// Reads the spacing table for fi from f: a 32-bit entry count, then for each
// entry x_gap_before, x_gap_after and a 32-bit kern_size, followed (only
// when kern_size > 0) by the serialized kerned_unichar_ids and kerned_x_gaps
// vectors. When swap is true every value is byte-reversed after reading.
// Returns false on any short read or failed DeSerialize.
inT32 vec_size, kern_size;
if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
if (swap) Reverse32(&vec_size);
// A negative entry count is treated as a fatal error, not a read failure.
ASSERT_HOST(vec_size >= 0);
// An empty table is valid; fi's spacing is left untouched in that case.
if (vec_size == 0) return true;
fi->init_spacing(vec_size);
for (int i = 0; i < vec_size; ++i) {
// fs is deleted locally on every path that does not reach add_spacing.
FontSpacingInfo *fs = new FontSpacingInfo();
if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
delete fs;
return false;
}
if (swap) {
ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
Reverse32(&kern_size);
}
if (kern_size < 0) { // indication of a NULL entry in fi->spacing_vec
delete fs;
continue;
}
// kern_size == 0 means the entry has gaps but no kerning vectors on disk.
if (kern_size > 0 && (!fs->kerned_unichar_ids.DeSerialize(swap, f) ||
!fs->kerned_x_gaps.DeSerialize(swap, f))) {
delete fs;
return false;
}
// NOTE(review): presumably add_spacing takes ownership of fs - confirm.
fi->add_spacing(i, fs);
}
return true;
}
| bool tesseract::read_t | ( | PAGE_RES_IT * | page_res_it, |
| TBOX * | tbox | ||
| ) |
Definition at line 53 of file recogtraining.cpp.
{
  // Advance while the iterator still has a block but no current word.
  while (page_res_it->block() != NULL && page_res_it->word() == NULL)
    page_res_it->forward();
  // Nothing left to read: report failure without touching *tbox.
  if (page_res_it->word() == NULL)
    return false;
  *tbox = page_res_it->word()->word->bounding_box();
  // A negative left edge means the training image has vertical text, so the
  // page_res boxes are rotated 90 degrees counterclockwise. Rotate the box
  // back so that it can be compared with the boxes from the box file.
  if (tbox->left() < 0)
    tbox->rotate(FCOORD(0.0, -1.0));
  return true;
}
| void tesseract::RecomputeMarginsAndClearHypotheses | ( | GenericVector< RowScratchRegisters > * | rows, |
| int | start, | ||
| int | end, | ||
| int | percentile | ||
| ) |
Definition at line 1561 of file paragraphs.cpp.
{
RowScratchRegisters &sr = (*rows)[i];
sr.SetUnknown();
if (sr.ri_->num_words == 0)
continue;
UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
}
STATS lefts(lmin, lmax + 1);
STATS rights(rmin, rmax + 1);
for (int i = start; i < end; i++) {
RowScratchRegisters &sr = (*rows)[i];
if (sr.ri_->num_words == 0)
continue;
lefts.add(sr.lmargin_ + sr.lindent_, 1);
rights.add(sr.rmargin_ + sr.rindent_, 1);
}
int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
for (int i = start; i < end; i++) {
RowScratchRegisters &sr = (*rows)[i];
int ldelta = ignorable_left - sr.lmargin_;
sr.lmargin_ += ldelta;
sr.lindent_ -= ldelta;
int rdelta = ignorable_right - sr.rmargin_;
sr.rmargin_ += rdelta;
sr.rindent_ -= rdelta;
}
}
// Return the median inter-word space in rows[row_start, row_end).
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
| void tesseract::RightWordAttributes | ( | const UNICHARSET * | unicharset, |
| const WERD_CHOICE * | werd, | ||
| const STRING & | utf8, | ||
| bool * | is_list, | ||
| bool * | starts_idea, | ||
| bool * | ends_idea | ||
| ) |
Definition at line 444 of file paragraphs.cpp.
{ // Empty
*ends_idea = true;
return;
}
if (unicharset && werd) { // We have a proper werd and unicharset so use it.
if (UniLikelyListItem(unicharset, werd)) {
*is_list = true;
*starts_idea = true;
}
UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
if (unicharset->get_ispunctuation(last_letter)) {
*ends_idea = true;
}
} else { // Assume utf8 is mostly ASCII
if (AsciiLikelyListItem(utf8)) {
*is_list = true;
*starts_idea = true;
}
int last_letter = utf8[utf8.size() - 1];
if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
*ends_idea = true;
}
}
}
// =============== Implementation of RowScratchRegisters =====================
/* static */
| bool tesseract::RowIsStranded | ( | const GenericVector< RowScratchRegisters > & | rows, |
| int | row | ||
| ) |
Definition at line 2142 of file paragraphs.cpp.
{
bool all_starts = rows[row].GetLineType();
int run_length = 1;
bool continues = true;
for (int i = row - 1; i >= 0 && continues; i--) {
SetOfModels models;
rows[i].NonNullHypotheses(&models);
switch (rows[i].GetLineType(row_models[m])) {
case LT_START: run_length++; break;
case LT_MULTIPLE: // explicit fall-through
case LT_BODY: run_length++; all_starts = false; break;
case LT_UNKNOWN: // explicit fall-through
default: continues = false;
}
}
continues = true;
for (int i = row + 1; i < rows.size() && continues; i++) {
SetOfModels models;
rows[i].NonNullHypotheses(&models);
switch (rows[i].GetLineType(row_models[m])) {
case LT_START: run_length++; break;
case LT_MULTIPLE: // explicit fall-through
case LT_BODY: run_length++; all_starts = false; break;
case LT_UNKNOWN: // explicit fall-through
default: continues = false;
}
}
if (run_length > 2 || (!all_starts && run_length > 1)) return false;
}
return true;
}
// Go through rows[row_start, row_end) and gather up sequences that need better
// classification.
| bool tesseract::RowsFitModel | ( | const GenericVector< RowScratchRegisters > * | rows, |
| int | start, | ||
| int | end, | ||
| const ParagraphModel * | model | ||
| ) |
Definition at line 1811 of file paragraphs.cpp.
{
if (!ValidBodyLine(rows, i, model)) return false;
}
return true;
}
// Examine rows[row_start, row_end) as an independent section of text,
// and mark rows that are exceptionally clear as start-of-paragraph
Definition at line 124 of file paragraphs.cpp.
{
| bool tesseract::SaveDataToFile | ( | const GenericVector< char > & | data, |
| const STRING & | filename | ||
| ) | [inline] |
Definition at line 371 of file genericvector.h.
| const char * tesseract::ScriptPosToString | ( | enum ScriptPos | script_pos | ) |
Definition at line 180 of file ratngs.cpp.
{
  // Maps each known script position to a short display name; any other
  // value yields "SP_UNKNOWN".
  if (script_pos == SP_NORMAL) return "NORM";
  if (script_pos == SP_SUBSCRIPT) return "SUB";
  if (script_pos == SP_SUPERSCRIPT) return "SUPER";
  if (script_pos == SP_DROPCAP) return "DROPC";
  return "SP_UNKNOWN";
}
| void tesseract::SeparateSimpleLeaderLines | ( | GenericVector< RowScratchRegisters > * | rows, |
| int | row_start, | ||
| int | row_end, | ||
| ParagraphTheory * | theory | ||
| ) |
Definition at line 2028 of file paragraphs.cpp.
{
if ((*rows)[i - 1].ri_->has_leaders &&
(*rows)[i].ri_->has_leaders &&
(*rows)[i + 1].ri_->has_leaders) {
const ParagraphModel *model = theory->AddModel(
ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
(*rows)[i].AddStartLine(model);
}
}
}
// Collect sequences of unique hypotheses in row registers and create proper
// paragraphs for them, referencing the paragraphs in row_owners.
| void tesseract::SetBlobStrokeWidth | ( | Pix * | pix, |
| BLOBNBOX * | blob | ||
| ) |
Definition at line 58 of file tordmain.cpp.
{
// Estimates the horizontal and vertical stroke widths of blob from a
// distance-function image of the blob's rectangle in pix, and stores them in
// the blob via set_horz_stroke_width / set_vert_stroke_width. A width is
// stored as zero when there are too few samples to trust it.
// Cut the blob rectangle into a Pix.
int pix_height = pixGetHeight(pix);
const TBOX& box = blob->bounding_box();
int width = box.width();
int height = box.height();
// The y coordinate is flipped (pix_height - top) when building the clip box.
Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
width, height);
Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL);
boxDestroy(&blob_pix_box);
Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
pixDestroy(&pix_blob);
// Compute the stroke widths.
uinT32* data = pixGetData(dist_pix);
int wpl = pixGetWpl(dist_pix);
// Horizontal width of stroke.
STATS h_stats(0, width + 1);
for (int y = 0; y < height; ++y) {
uinT32* pixels = data + y*wpl;
// Sliding window along the row: prev_pixel, pixel (at x - 1), next_pixel
// (at x).
int prev_pixel = 0;
int pixel = GET_DATA_BYTE(pixels, 0);
for (int x = 1; x < width; ++x) {
int next_pixel = GET_DATA_BYTE(pixels, x);
// We are looking for a pixel that is equal to its vertical neighbours,
// yet greater than its left neighbour.
if (prev_pixel < pixel &&
(y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
(y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
if (pixel > next_pixel) {
// Single local max, so an odd width.
h_stats.add(pixel * 2 - 1, 1);
} else if (pixel == next_pixel && x + 1 < width &&
pixel > GET_DATA_BYTE(pixels, x + 1)) {
// Double local max, so an even width.
h_stats.add(pixel * 2, 1);
}
}
prev_pixel = pixel;
pixel = next_pixel;
}
}
// Vertical width of stroke.
STATS v_stats(0, height + 1);
for (int x = 0; x < width; ++x) {
// Sliding window down the column: pixel is at row y - 1, next_pixel at
// row y, so pixels - wpl addresses the row that holds pixel.
int prev_pixel = 0;
int pixel = GET_DATA_BYTE(data, x);
for (int y = 1; y < height; ++y) {
uinT32* pixels = data + y*wpl;
int next_pixel = GET_DATA_BYTE(pixels, x);
// We are looking for a pixel that is equal to its horizontal neighbours,
// yet greater than its upper neighbour.
if (prev_pixel < pixel &&
(x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
(x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
if (pixel > next_pixel) {
// Single local max, so an odd width.
v_stats.add(pixel * 2 - 1, 1);
} else if (pixel == next_pixel && y + 1 < height &&
pixel > GET_DATA_BYTE(pixels + wpl, x)) {
// Double local max, so an even width.
v_stats.add(pixel * 2, 1);
}
}
prev_pixel = pixel;
pixel = next_pixel;
}
}
pixDestroy(&dist_pix);
// Store the horizontal and vertical width in the blob, keeping both
// widths if there is enough information, otherwise only the one with
// the most samples.
// If there are insufficient samples, store zero, rather than using
// 2*area/perimeter, as the numbers that gives do not match the numbers
// from the distance method.
// "Enough" samples here means at least (width + height) / 4.
if (h_stats.get_total() >= (width + height) / 4) {
blob->set_horz_stroke_width(h_stats.ile(0.5f));
if (v_stats.get_total() >= (width + height) / 4)
blob->set_vert_stroke_width(v_stats.ile(0.5f));
else
blob->set_vert_stroke_width(0.0f);
} else {
if (v_stats.get_total() >= (width + height) / 4 ||
v_stats.get_total() > h_stats.get_total()) {
blob->set_horz_stroke_width(0.0f);
blob->set_vert_stroke_width(v_stats.ile(0.5f));
} else {
// Horizontal has at least as many samples as vertical, but still needs
// more than 2 samples to be used.
blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
: 0.0f);
blob->set_vert_stroke_width(0.0f);
}
}
}
| void tesseract::SetPropertiesForInputFile | ( | const string & | script_dir, |
| const string & | input_unicharset_file, | ||
| const string & | output_unicharset_file, | ||
| const string & | output_xheights_file | ||
| ) |
Definition at line 148 of file unicharset_training_utils.cpp.
{
// Loads the unicharset from input_unicharset_file, fills in its properties
// (basic properties, then per-script properties from script_dir), optionally
// writes the concatenated per-script xheights to output_xheights_file, and
// saves the result to output_unicharset_file.
UNICHARSET unicharset;
// Load the input unicharset
// NOTE(review): the result of this load_from_file call is not checked.
unicharset.load_from_file(input_unicharset_file.c_str());
tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
input_unicharset_file.c_str());
// Set unichar properties
tprintf("Setting unichar properties\n");
SetupBasicProperties(true, &unicharset);
string xheights_str;
for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
// Load the unicharset for the script if available.
string filename = script_dir + "/" +
unicharset.get_script_from_script_id(s) + ".unicharset";
UNICHARSET script_set;
// A missing or unreadable script file is silently skipped.
if (script_set.load_from_file(filename.c_str())) {
unicharset.SetPropertiesFromOther(script_set);
}
// Load the xheights for the script if available.
filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
".xheights";
string script_heights;
if (File::ReadFileToString(filename, &script_heights))
xheights_str += script_heights;
}
// The xheights file is only written when a file name was supplied.
if (!output_xheights_file.empty())
File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
// Warn about every non-special unichar whose properties are still incomplete.
for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) {
if (unicharset.PropertiesIncomplete(c)) {
tprintf("Warning: properties incomplete for index %d = %s\n",
c, unicharset.id_to_unichar(c));
}
}
// Write the output unicharset
tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
unicharset.save_to_file(output_unicharset_file.c_str());
}
| void tesseract::SetupBasicProperties | ( | bool | report_errors, |
| UNICHARSET * | unicharset | ||
| ) |
Definition at line 40 of file unicharset_training_utils.cpp.
{
// For every unichar in unicharset, derives and stores its basic properties:
// alpha/lower/upper/digit/punctuation flags, script, other-case id,
// direction, mirror id and normalized form. When report_errors is true,
// missing other-case or mirror entries are reported through tprintf.
// Finishes by calling unicharset->post_load_setup().
for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
// Convert any custom ligatures.
const char* unichar_str = unicharset->id_to_unichar(unichar_id);
// The table is terminated by an entry whose first string is NULL; a match
// on column 1 replaces the string with the corresponding column 0 entry.
for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
unichar_str = UNICHARSET::kCustomLigatures[i][0];
break;
}
}
// Convert the unichar to UTF32 representation
GenericVector<char32> uni_vector;
tesseract::UTF8ToUTF32(unichar_str, &uni_vector);
// Assume that if the property is true for any character in the string,
// then it holds for the whole "character".
bool unichar_isalpha = false;
bool unichar_islower = false;
bool unichar_isupper = false;
bool unichar_isdigit = false;
bool unichar_ispunct = false;
for (int i = 0; i < uni_vector.size(); ++i) {
if (u_isalpha(uni_vector[i]))
unichar_isalpha = true;
if (u_islower(uni_vector[i]))
unichar_islower = true;
if (u_isupper(uni_vector[i]))
unichar_isupper = true;
if (u_isdigit(uni_vector[i]))
unichar_isdigit = true;
if (u_ispunct(uni_vector[i]))
unichar_ispunct = true;
}
unicharset->set_isalpha(unichar_id, unichar_isalpha);
unicharset->set_islower(unichar_id, unichar_islower);
unicharset->set_isupper(unichar_id, unichar_isupper);
unicharset->set_isdigit(unichar_id, unichar_isdigit);
unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
// The script is taken from the first code point only.
// NOTE(review): uni_vector[0] assumes the conversion produced at least one
// code point - confirm that an empty unichar string cannot reach here.
tesseract::IcuErrorCode err;
unicharset->set_script(unichar_id, uscript_getName(
uscript_getScript(uni_vector[0], err)));
const int num_code_points = uni_vector.size();
// Obtain the lower/upper case if needed and record it in the properties.
// Default: a unichar is its own other case.
unicharset->set_other_case(unichar_id, unichar_id);
if (unichar_islower || unichar_isupper) {
GenericVector<char32> other_case(num_code_points, 0);
for (int i = 0; i < num_code_points; ++i) {
// TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
// However since they deal with UChars (so need a conversion function
// from char32 or UTF8string) and require a meaningful locale string,
// for now u_tolower()/u_toupper() are used.
other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
u_tolower(uni_vector[i]);
}
STRING other_case_uch;
tesseract::UTF32ToUTF8(other_case, &other_case_uch);
UNICHAR_ID other_case_id =
unicharset->unichar_to_id(other_case_uch.c_str());
if (other_case_id != INVALID_UNICHAR_ID) {
unicharset->set_other_case(unichar_id, other_case_id);
} else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
tprintf("Other case %s of %s is not in unicharset\n",
other_case_uch.c_str(), unichar_str);
}
}
// Set RTL property and obtain mirror unichar ID from ICU.
GenericVector<char32> mirrors(num_code_points, 0);
for (int i = 0; i < num_code_points; ++i) {
mirrors[i] = u_charMirror(uni_vector[i]);
if (i == 0) { // set directionality to that of the 1st code point
unicharset->set_direction(unichar_id,
static_cast<UNICHARSET::Direction>(
u_charDirection(uni_vector[i])));
}
}
STRING mirror_uch;
tesseract::UTF32ToUTF8(mirrors, &mirror_uch);
UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
if (mirror_uch_id != INVALID_UNICHAR_ID) {
unicharset->set_mirror(unichar_id, mirror_uch_id);
} else if (report_errors) {
tprintf("Mirror %s of %s is not in unicharset\n",
mirror_uch.c_str(), unichar_str);
}
// Record normalized version of this unichar.
// Unichar id 0, and any unichar whose normalized form is empty, keeps its
// own string as the normalized form.
STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
if (unichar_id != 0 && normed_str.length() > 0) {
unicharset->set_normed(unichar_id, normed_str.c_str());
} else {
unicharset->set_normed(unichar_id, unichar_str);
}
ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
}
unicharset->post_load_setup();
}
| const char* tesseract::SkipChars | ( | const char * | str, |
| bool(*)(int) | skip | ||
| ) |
Definition at line 218 of file paragraphs.cpp.
{
if (*str != '\0' && strchr(toskip, *str)) return str + 1;
| const char* tesseract::SkipChars | ( | const char * | str, |
| const char * | toskip | ||
| ) |
Definition at line 213 of file paragraphs.cpp.
{
while (*str != '\0' && skip(*str)) { str++; }
| const char* tesseract::SkipOne | ( | const char * | str, |
| const char * | toskip | ||
| ) |
Definition at line 223 of file paragraphs.cpp.
{
| int tesseract::sort_cmp | ( | const void * | t1, |
| const void * | t2 | ||
| ) |
Definition at line 391 of file genericvector.h.
{
  // qsort-style comparator for elements stored by value: the arguments point
  // straight at the T objects. Ordering uses only operator< of T.
  const T& lhs = *static_cast<const T*>(t1);
  const T& rhs = *static_cast<const T*>(t2);
  if (lhs < rhs) return -1;
  if (rhs < lhs) return 1;
  return 0;
}
| int tesseract::sort_ptr_cmp | ( | const void * | t1, |
| const void * | t2 | ||
| ) |
Definition at line 408 of file genericvector.h.
{
  // qsort-style comparator for arrays of T*: each argument points at a
  // pointer, so one level of indirection is removed before comparing the
  // pointed-to objects with operator< of T.
  const T& lhs = **reinterpret_cast<T* const*>(t1);
  const T& rhs = **reinterpret_cast<T* const*>(t2);
  if (lhs < rhs) return -1;
  if (rhs < lhs) return 1;
  return 0;
}
| int tesseract::SortByBoxBottom | ( | const void * | void1, |
| const void * | void2 | ||
| ) |
Definition at line 408 of file bbgrid.h.
{
  // The arguments point at BBC pointers; strip one level of indirection.
  const BBC* first = *reinterpret_cast<const BBC* const*>(void1);
  const BBC* second = *reinterpret_cast<const BBC* const*>(void2);
  // Compare by bottom, then top, then left, then right; each later key is
  // consulted only while all earlier keys tie.
  int diff = first->bounding_box().bottom() - second->bounding_box().bottom();
  if (diff == 0)
    diff = first->bounding_box().top() - second->bounding_box().top();
  if (diff == 0)
    diff = first->bounding_box().left() - second->bounding_box().left();
  if (diff == 0)
    diff = first->bounding_box().right() - second->bounding_box().right();
  return diff;
}
| int tesseract::SortByBoxLeft | ( | const void * | void1, |
| const void * | void2 | ||
| ) |
Definition at line 372 of file bbgrid.h.
{
  // The arguments point at BBC pointers; strip one level of indirection.
  const BBC* first = *reinterpret_cast<const BBC* const*>(void1);
  const BBC* second = *reinterpret_cast<const BBC* const*>(void2);
  // Compare by left, then right, then bottom, then top; each later key is
  // consulted only while all earlier keys tie.
  int diff = first->bounding_box().left() - second->bounding_box().left();
  if (diff == 0)
    diff = first->bounding_box().right() - second->bounding_box().right();
  if (diff == 0)
    diff = first->bounding_box().bottom() - second->bounding_box().bottom();
  if (diff == 0)
    diff = first->bounding_box().top() - second->bounding_box().top();
  return diff;
}
| int tesseract::SortByRating | ( | const void * | void1, |
| const void * | void2 | ||
| ) |
Definition at line 86 of file pieces.cpp.
{
  // qsort-style comparator over arrays of BLOB_CHOICE*: orders choices by
  // descending rating (a lower rating sorts later).
  // The arguments point at pointers, so strip one level of indirection.
  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);
  if (p1->rating() < p2->rating())
    return 1;
  if (p2->rating() < p1->rating())
    return -1;
  // Equal ratings must compare equal: answering -1 for both (a, b) and
  // (b, a) gives qsort an inconsistent ordering.
  return 0;
}
| int tesseract::SortByUnicharID | ( | const void * | void1, |
| const void * | void2 | ||
| ) |
Definition at line 78 of file pieces.cpp.
{
  // qsort-style comparator over arrays of BLOB_CHOICE*: ascending unichar id.
  // Each argument points at a pointer to the choice.
  const BLOB_CHOICE* const* lhs =
      reinterpret_cast<const BLOB_CHOICE* const*>(void1);
  const BLOB_CHOICE* const* rhs =
      reinterpret_cast<const BLOB_CHOICE* const*>(void2);
  return (*lhs)->unichar_id() - (*rhs)->unichar_id();
}
| int tesseract::SortRightToLeft | ( | const void * | void1, |
| const void * | void2 | ||
| ) |
Definition at line 390 of file bbgrid.h.
{
  // The arguments point at BBC pointers; strip one level of indirection.
  const BBC* first = *reinterpret_cast<const BBC* const*>(void1);
  const BBC* second = *reinterpret_cast<const BBC* const*>(void2);
  // Horizontal keys are compared in reverse (second minus first) so larger
  // right/left edges sort first; ties fall through to bottom and then top
  // in the normal direction.
  int diff = second->bounding_box().right() - first->bounding_box().right();
  if (diff == 0)
    diff = second->bounding_box().left() - first->bounding_box().left();
  if (diff == 0)
    diff = first->bounding_box().bottom() - second->bounding_box().bottom();
  if (diff == 0)
    diff = first->bounding_box().top() - second->bounding_box().top();
  return diff;
}
| int tesseract::SpanUTF8NotWhitespace | ( | const char * | text | ) |
Definition at line 198 of file normstrngs.cpp.
{
if (IsWhitespace(*it)) break;
n_notwhite += it.utf8_len();
}
return n_notwhite;
}
| int tesseract::SpanUTF8Whitespace | ( | const char * | text | ) |
Definition at line 187 of file normstrngs.cpp.
{
if (!IsWhitespace(*it)) break;
n_white += it.utf8_len();
}
return n_white;
}
| void tesseract::StrongEvidenceClassify | ( | int | debug_level, |
| GenericVector< RowScratchRegisters > * | rows, | ||
| int | row_start, | ||
| int | row_end, | ||
| ParagraphTheory * | theory | ||
| ) |
Definition at line 1998 of file paragraphs.cpp.
{
// Classifies rows[row_start, row_end) in stages: reset margins and
// hypotheses, mark strong evidence, build paragraph models from it, then
// smear the resulting hypotheses across the range. Debug output is printed
// when debug_level > 1, and state dumps are enabled when debug_level > 2.
// Bail out early if the row arguments are rejected by AcceptableRowArgs.
if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
return;
if (debug_level > 1) {
tprintf("#############################################\n");
tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
tprintf("#############################################\n");
}
// The final argument (10) is the percentile passed to the margin
// recomputation.
RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
MarkStrongEvidence(rows, row_start, row_end);
DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
// Create paragraph models.
ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
// At this point, some rows are marked up as paragraphs with model numbers,
// and some rows are marked up as either LT_START or LT_BODY. Now let's
// smear any good paragraph hypotheses forward and backward.
ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
smearer.Smear();
}
void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
int row_start, int row_end,
| bool tesseract::StrongModel | ( | const ParagraphModel * | model | ) | [inline] |
Definition at line 75 of file paragraphs_internal.h.
{
  // A model is strong when it exists and is neither of the two crown
  // placeholder models.
  if (model == NULL)
    return false;
  return !(model == kCrownLeft || model == kCrownRight);
}
| bool tesseract::TextSupportsBreak | ( | const RowScratchRegisters & | before, |
| const RowScratchRegisters & | after | ||
| ) |
Definition at line 1664 of file paragraphs.cpp.
{
  // A break needs both signals: the flag on the "before" row must say its
  // word likely ends an idea, and the flag on the "after" row must say its
  // word likely starts one.
  if (!before.ri_->lword_likely_ends_idea)
    return false;
  return after.ri_->rword_likely_starts_idea;
}
}
bool LikelyParagraphStart(const RowScratchRegisters &before,
const RowScratchRegisters &after) {
| Pix * tesseract::TraceBlockOnReducedPix | ( | BLOCK * | block, |
| int | gridsize, | ||
| ICOORD | bleft, | ||
| int * | left, | ||
| int * | bottom | ||
| ) |
Definition at line 258 of file bbgrid.cpp.
{
// Draws the outline of block's polygon into a reduced-resolution Pix made
// by GridReducedPix for the block's bounding box, and returns that Pix.
// *left and *bottom are filled in by GridReducedPix and then used as the
// grid offsets of the Pix.
// NOTE(review): the returned Pix is not destroyed here, so presumably the
// caller owns it - confirm.
TBOX box = block->bounding_box();
Pix* pix = GridReducedPix(box, gridsize, bleft, left, bottom);
int wpl = pixGetWpl(pix);
l_uint32* data = pixGetData(pix);
ICOORDELT_IT it(block->poly_block()->points());
// The iterator is advanced inside the loop body (it.forward()), which is why
// the for statement has no increment expression.
for (it.mark_cycle_pt(); !it.cycled_list();) {
ICOORD pos = *it.data();
it.forward();
ICOORD next_pos = *it.data();
ICOORD line_vector = next_pos - pos;
int major, minor;
ICOORD major_step, minor_step;
line_vector.setup_render(&major_step, &minor_step, &major, &minor);
// Step from pos to next_pos: always take a major step, and take a minor
// step whenever the accumulated minor count reaches major.
int accumulator = major / 2;
while (pos != next_pos) {
// Convert the image position to a cell of the reduced grid and set it.
int grid_x = (pos.x() - bleft.x()) / gridsize - *left;
int grid_y = (pos.y() - bleft.y()) / gridsize - *bottom;
SET_DATA_BIT(data + grid_y * wpl, grid_x);
pos += major_step;
accumulator += minor;
if (accumulator >= major) {
accumulator -= major;
pos += minor_step;
}
}
}
return pix;
}
| Pix * tesseract::TraceOutlineOnReducedPix | ( | C_OUTLINE * | outline, |
| int | gridsize, | ||
| ICOORD | bleft, | ||
| int * | left, | ||
| int * | bottom | ||
| ) |
Definition at line 232 of file bbgrid.cpp.
{
  // Create the reduced-resolution image covering the outline's bounding box;
  // GridReducedPix also fills in *left and *bottom.
  TBOX outline_box = outline->bounding_box();
  Pix* reduced = GridReducedPix(outline_box, gridsize, bleft, left, bottom);
  int words_per_line = pixGetWpl(reduced);
  l_uint32* raster = pixGetData(reduced);
  // Walk the outline one step at a time, setting the grid cell under each
  // visited position.
  int num_steps = outline->pathlength();
  ICOORD cursor = outline->start_pos();
  int step_index = 0;
  while (step_index < num_steps) {
    int cell_x = (cursor.x() - bleft.x()) / gridsize - *left;
    int cell_y = (cursor.y() - bleft.y()) / gridsize - *bottom;
    SET_DATA_BIT(raster + cell_y * words_per_line, cell_x);
    cursor += outline->step(step_index);
    ++step_index;
  }
  return reduced;
}
| int tesseract::UnicodeFor | ( | const UNICHARSET * | u, |
| const WERD_CHOICE * | werd, | ||
| int | pos | ||
| ) |
Definition at line 277 of file paragraphs.cpp.
{
| bool tesseract::UniLikelyListItem | ( | const UNICHARSET * | u, |
| const WERD_CHOICE * | werd | ||
| ) |
Definition at line 360 of file paragraphs.cpp.
{
int numeral_start = m.SkipPunc(pos);
if (numeral_start > pos + 1) break;
int numeral_end = m.SkipRomans(numeral_start);
if (numeral_end == numeral_start) {
numeral_end = m.SkipDigits(numeral_start);
if (numeral_end == numeral_start) {
// If there's a single latin letter, we can use that.
numeral_end = m.SkipAlpha(numeral_start);
if (numeral_end - numeral_start != 1)
break;
}
}
// We got some sort of numeral.
num_segments++;
// Skip any trailing punctuation.
pos = m.SkipPunc(numeral_end);
if (pos == numeral_end)
break;
}
return pos == werd->length();
}
// ========= Brain Dead Language Model (combined entry points) ================
| void tesseract::UTF32ToUTF8 | ( | const GenericVector< char32 > & | str32, |
| STRING * | utf8_str | ||
| ) |
Definition at line 46 of file normstrngs.cpp.
{
UNICHAR uni_ch(str32[i]);
char *utf8 = uni_ch.utf8_str();
if (utf8 != NULL) {
(*utf8_str) += utf8;
delete[] utf8;
}
}
}
| void tesseract::UTF8ToUTF32 | ( | const char * | utf8_str, |
| GenericVector< char32 > * | str32 | ||
| ) |
Definition at line 32 of file normstrngs.cpp.
{
step = UNICHAR::utf8_step(utf8_str + ch);
if (step > 0) {
UNICHAR uni_ch(utf8_str + ch, step);
(*str32) += uni_ch.first_uni();
}
}
}
| bool tesseract::ValidBodyLine | ( | const GenericVector< RowScratchRegisters > * | rows, |
| int | row, | ||
| const ParagraphModel * | model | ||
| ) |
Definition at line 1280 of file paragraphs.cpp.
{
| bool tesseract::ValidFirstLine | ( | const GenericVector< RowScratchRegisters > * | rows, |
| int | row, | ||
| const ParagraphModel * | model | ||
| ) |
Definition at line 1269 of file paragraphs.cpp.
{
| bool tesseract::write_info | ( | FILE * | f, |
| const FontInfo & | fi | ||
| ) |
Definition at line 168 of file fontinfo.cpp.
{
// Writes fi to f as: a 32-bit name length, the name bytes (no terminator),
// then the raw properties field. Returns false as soon as any write is short.
inT32 size = strlen(fi.name);
if (fwrite(&size, sizeof(size), 1, f) != 1) return false;
// fwrite returns the number of elements written; it must equal the name length.
if (static_cast<int>(fwrite(fi.name, sizeof(*fi.name), size, f)) != size)
return false;
if (fwrite(&fi.properties, sizeof(fi.properties), 1, f) != 1) return false;
return true;
}
| bool tesseract::write_set | ( | FILE * | f, |
| const FontSet & | fs | ||
| ) |
Definition at line 253 of file fontinfo.cpp.
{
// Writes fs to f as: the size field, followed by fs.size config entries
// written one at a time. Returns false as soon as any write fails.
if (fwrite(&fs.size, sizeof(fs.size), 1, f) != 1) return false;
for (int i = 0; i < fs.size; ++i) {
if (fwrite(&fs.configs[i], sizeof(fs.configs[i]), 1, f) != 1) return false;
}
return true;
}
| bool tesseract::write_spacing_info | ( | FILE * | f, |
| const FontInfo & | fi | ||
| ) |
Definition at line 211 of file fontinfo.cpp.
{
// Writes the spacing vector of fi to f. The layout is a 32-bit entry count,
// then for each entry: x_gap_before, x_gap_after, a 32-bit kern_size and,
// only when kern_size > 0, the serialized kerned_unichar_ids and
// kerned_x_gaps. Returns false as soon as any write or Serialize call fails.
// A NULL spacing_vec is written as a count of 0.
inT32 vec_size = (fi.spacing_vec == NULL) ? 0 : fi.spacing_vec->size();
if (fwrite(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
// Placeholder written for both gaps of a NULL entry.
inT16 x_gap_invalid = -1;
for (int i = 0; i < vec_size; ++i) {
FontSpacingInfo *fs = fi.spacing_vec->get(i);
// A NULL entry is recorded with kern_size == -1.
inT32 kern_size = (fs == NULL) ? -1 : fs->kerned_x_gaps.size();
if (fs == NULL) {
// Valid to have the identical fwrites. Writing invalid x-gaps.
if (fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
return false;
}
} else {
if (fwrite(&(fs->x_gap_before), sizeof(fs->x_gap_before), 1, f) != 1 ||
fwrite(&(fs->x_gap_after), sizeof(fs->x_gap_after), 1, f) != 1 ||
fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
return false;
}
}
// kern_size > 0 implies fs is non-NULL (a NULL entry has kern_size == -1),
// so dereferencing fs here is safe.
if (kern_size > 0 && (!fs->kerned_unichar_ids.Serialize(f) ||
!fs->kerned_x_gaps.Serialize(f))) {
return false;
}
}
return true;
}
| void tesseract::WriteShapeTable | ( | const STRING & | file_prefix, |
| const ShapeTable & | shape_table | ||
| ) |
Definition at line 144 of file commontraining.cpp.
{
// Serializes shape_table to the file named file_prefix followed by
// kShapeTableFileSuffix. The function returns void, so failures to create or
// write the file are only reported on stderr.
STRING shape_table_file = file_prefix;
shape_table_file += kShapeTableFileSuffix;
// Open in binary mode for writing.
FILE* fp = fopen(shape_table_file.string(), "wb");
if (fp != NULL) {
if (!shape_table.Serialize(fp)) {
fprintf(stderr, "Error writing shape table: %s\n",
shape_table_file.string());
}
// The file is closed whether or not serialization succeeded.
fclose(fp);
} else {
fprintf(stderr, "Error creating shape table: %s\n",
shape_table_file.string());
}
}
| void tesseract::YOutlierPieces | ( | WERD_RES * | word, |
| int | rebuilt_blob_index, | ||
| int | super_y_bottom, | ||
| int | sub_y_top, | ||
| ScriptPos * | leading_pos, | ||
| int * | num_leading_outliers, | ||
| ScriptPos * | trailing_pos, | ||
| int * | num_trailing_outliers | ||
| ) |
Given a recognized blob, see if a contiguous collection of sub-pieces (chopped blobs) starting at its left might qualify as being a subscript or superscript letter based only on y position. Also do this for the right side.
Definition at line 46 of file superscript.cpp.
{
// Every output pointer is optional: a NULL pointer is redirected to a local
// dummy so the rest of the function can write through it unconditionally.
ScriptPos sp_unused1, sp_unused2;
int unused1, unused2;
if (!leading_pos) leading_pos = &sp_unused1;
if (!num_leading_outliers) num_leading_outliers = &unused1;
if (!trailing_pos) trailing_pos = &sp_unused2;
if (!num_trailing_outliers) num_trailing_outliers = &unused2;
// Default result: no outliers on either side.
*num_leading_outliers = *num_trailing_outliers = 0;
*leading_pos = *trailing_pos = SP_NORMAL;
// Index of the first chopped piece of this blob, and how many chopped pieces
// the blob is made of.
int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
int num_chopped_pieces = word->best_state[rebuilt_blob_index];
ScriptPos last_pos = SP_NORMAL;
// Length of the current run of consecutive pieces sharing the same
// non-normal position.
int trailing_outliers = 0;
for (int i = 0; i < num_chopped_pieces; i++) {
TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
// Classify the piece by y position only: entirely at or above
// super_y_bottom is a superscript, entirely at or below sub_y_top is a
// subscript, anything else is normal.
ScriptPos pos = SP_NORMAL;
if (box.bottom() >= super_y_bottom) {
pos = SP_SUPERSCRIPT;
} else if (box.top() <= sub_y_top) {
pos = SP_SUBSCRIPT;
}
if (pos == SP_NORMAL) {
// The run length equals i only if every piece before this one belongs to
// a single run starting at the left edge; that run is the leading
// outlier group.
if (trailing_outliers == i) {
*num_leading_outliers = trailing_outliers;
*leading_pos = last_pos;
}
// A normal piece ends any run in progress.
trailing_outliers = 0;
} else {
if (pos == last_pos) {
trailing_outliers++;
} else {
// The position changed, so a new run starts with this piece.
trailing_outliers = 1;
}
}
last_pos = pos;
}
// Whatever run is still open at the right edge is the trailing group. If no
// normal piece was seen, the leading outputs keep their defaults and the
// final run is reported here.
*num_trailing_outliers = trailing_outliers;
*trailing_pos = last_pos;
}
| const int tesseract::case_state_table[6][4] |
{ {
0, 1, 5, 4
},
{
0, 3, 2, 4
},
{
0, -1, 2, -1
},
{
0, 3, -1, 4
},
{
0, -1, -1, 4
},
{
5, -1, 2, -1
},
}
Definition at line 36 of file context.cpp.
| const int tesseract::kAdjacentLeaderSearchPadding = 2 |
Definition at line 126 of file tablefind.cpp.
| const double tesseract::kAlignedFraction = 0.03125 |
Definition at line 39 of file alignedblob.cpp.
| const double tesseract::kAlignedGapFraction = 0.75 |
Definition at line 43 of file alignedblob.cpp.
| const char* tesseract::kAlignmentNames[] |
{
"Left Aligned",
"Left Ragged",
"Center",
"Right Aligned",
"Right Ragged",
"Separator"
}
Definition at line 515 of file tabvector.cpp.
| const double tesseract::kAllowBlobArea = 0.05 |
Definition at line 62 of file tablefind.cpp.
| const double tesseract::kAllowBlobHeight = 0.3 |
Definition at line 60 of file tablefind.cpp.
| const double tesseract::kAllowBlobWidth = 0.4 |
Definition at line 61 of file tablefind.cpp.
| const double tesseract::kAllowTextArea = 0.8 |
Definition at line 55 of file tablefind.cpp.
| const double tesseract::kAllowTextHeight = 0.5 |
Definition at line 53 of file tablefind.cpp.
| const double tesseract::kAllowTextWidth = 0.6 |
Definition at line 54 of file tablefind.cpp.
| const char * tesseract::kApostropheLikeUTF8 |
{
"'",
"`",
"\u2018",
"\u2019",
"\u2032",
NULL,
}
Definition at line 49 of file unicodes.cpp.
| const int tesseract::kBasicBufSize = 2048 |
Definition at line 155 of file pdfrenderer.cpp.
| const double tesseract::kBigPartSizeRatio = 1.75 |
Definition at line 47 of file colpartitiongrid.cpp.
| const int tesseract::kBoxClipTolerance = 2 |
Definition at line 31 of file boxword.cpp.
| const double tesseract::kBrokenCJKIterationFraction = 0.125 |
Definition at line 71 of file strokewidth.cpp.
| const int tesseract::kCellSplitColumnThreshold = 0 |
Definition at line 40 of file tablerecog.cpp.
| const int tesseract::kCellSplitRowThreshold = 0 |
Definition at line 39 of file tablerecog.cpp.
| const double tesseract::kCharVerticalOverlapFraction = 0.375 |
Definition at line 62 of file tabfind.cpp.
| const int tesseract::kCharWidth = 2 |
Definition at line 158 of file pdfrenderer.cpp.
| const double tesseract::kCJKAspectRatio = 1.25 |
Definition at line 65 of file strokewidth.cpp.
| const double tesseract::kCJKAspectRatioIncrease = 1.0625 |
Definition at line 67 of file strokewidth.cpp.
| const double tesseract::kCJKBrokenDistanceFraction = 0.25 |
Definition at line 61 of file strokewidth.cpp.
| const int tesseract::kCJKMaxComponents = 8 |
Definition at line 63 of file strokewidth.cpp.
| const int tesseract::kCJKRadius = 2 |
Definition at line 59 of file strokewidth.cpp.
| const int tesseract::kColumnWidthFactor = 20 |
| const double tesseract::kCosMaxSkewAngle = 0.866025 |
Definition at line 81 of file tabfind.cpp.
| const int tesseract::kCrackSpacing = 100 |
Spacing of cracks across the page to break up tall vertical lines.
Definition at line 45 of file linefind.cpp.
| const ParagraphModel * tesseract::kCrownLeft = reinterpret_cast<ParagraphModel *>(0xDEAD111F) |
Definition at line 48 of file paragraphs.cpp.
| const ParagraphModel * tesseract::kCrownRight = reinterpret_cast<ParagraphModel *>(0xDEAD888F) |
Definition at line 50 of file paragraphs.cpp.
| const int tesseract::kDefaultResolution = 300 |
Default resolution used if input is not believable.
Definition at line 60 of file pagesegmain.cpp.
| const double tesseract::kDiacriticXPadRatio = 7.0 |
Definition at line 74 of file strokewidth.cpp.
| const double tesseract::kDiacriticYPadRatio = 1.75 |
Definition at line 77 of file strokewidth.cpp.
| const char tesseract::kDoNotReverse[] = "RRP_DO_NO_REVERSE" |
| const int tesseract::kExposureFactor = 16 |
Definition at line 33 of file degradeimage.cpp.
| const int tesseract::kFeaturePadding = 2 |
Definition at line 34 of file imagedata.h.
| const float tesseract::kFontMergeDistance = 0.025 |
Definition at line 53 of file mastertrainer.cpp.
| const char tesseract::kForceReverse[] = "RRP_FORCE_REVERSE" |
| const double tesseract::kGoodRowNumberOfColumnsLarge = 0.7 |
Definition at line 58 of file tablerecog.cpp.
| const double tesseract::kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 } |
Definition at line 54 of file tablerecog.cpp.
sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1
Definition at line 55 of file tablerecog.cpp.
| const int tesseract::kGutterMultiple = 4 |
Definition at line 38 of file tabvector.cpp.
| const int tesseract::kGutterToNeighbourRatio = 3 |
Definition at line 40 of file tabvector.cpp.
| const int tesseract::kHistogramSize = 256 |
| const double tesseract::kHorizontalGapMergeFraction = 0.5 |
Definition at line 61 of file colfind.cpp.
| const double tesseract::kHorizontalSpacing = 0.30 |
Definition at line 33 of file tablerecog.cpp.
| const int tesseract::kHorzStrongTextlineAspect = 5 |
Definition at line 69 of file colpartition.cpp.
| const int tesseract::kHorzStrongTextlineCount = 8 |
Definition at line 65 of file colpartition.cpp.
| const int tesseract::kHorzStrongTextlineHeight = 10 |
Definition at line 67 of file colpartition.cpp.
| const char * tesseract::kHyphenLikeUTF8 |
{
"-",
"\u05BE",
"\u2010",
"\u2011",
"\u2012",
"\u2013",
"\u2014",
"\u2015",
"\u2212",
"\uFE58",
"\uFE63",
"\uFF0D",
NULL,
}
The following are confusable internal word punctuation symbols which we normalize to the first variant when matching in dawgs.
Definition at line 33 of file unicodes.cpp.
| const int tesseract::kImagePadding = 4 |
Definition at line 36 of file imagedata.h.
| const float tesseract::kInfiniteDist = 999.0f |
Definition at line 912 of file mastertrainer.cpp.
| const double tesseract::kLargeTableProjectionThreshold = 0.45 |
Definition at line 111 of file tablefind.cpp.
| const int tesseract::kLargeTableRowCount = 6 |
Definition at line 113 of file tablefind.cpp.
| const int tesseract::kLeftIndentAlignmentCountTh = 1 |
Definition at line 88 of file equationdetect.cpp.
| const double tesseract::kLineCountReciprocal = 4.0 |
Definition at line 51 of file tabvector.cpp.
| const int tesseract::kLinedTableMinHorizontalLines = 3 |
Definition at line 43 of file tablerecog.cpp.
| const int tesseract::kLinedTableMinVerticalLines = 3 |
Definition at line 42 of file tablerecog.cpp.
| const int tesseract::kLineFindGridSize = 50 |
Grid size used by line finder. Not very critical.
Definition at line 47 of file linefind.cpp.
| const double tesseract::kLineFragmentAspectRatio = 10.0 |
Definition at line 56 of file tabfind.cpp.
| const double tesseract::kLineResidueAspectRatio = 8.0 |
Definition at line 100 of file strokewidth.cpp.
| const int tesseract::kLineResiduePadRatio = 3 |
Definition at line 102 of file strokewidth.cpp.
| const double tesseract::kLineResidueSizeRatio = 1.75 |
Definition at line 104 of file strokewidth.cpp.
| const int tesseract::kLineTrapLongest = 4 |
Definition at line 93 of file strokewidth.cpp.
| const int tesseract::kLineTrapShortest = 2 |
Definition at line 95 of file strokewidth.cpp.
| const char * tesseract::kLRM = "\u200E" |
Definition at line 28 of file unicodes.cpp.
| const double tesseract::kMarginFactor = 1.1 |
Definition at line 48 of file tablerecog.cpp.
| const double tesseract::kMarginOverlapFraction = 0.25 |
Definition at line 58 of file colfind.cpp.
| const float tesseract::kMathDigitDensityTh1 = 0.25 |
Definition at line 83 of file equationdetect.cpp.
| const float tesseract::kMathDigitDensityTh2 = 0.1 |
Definition at line 84 of file equationdetect.cpp.
| const float tesseract::kMathItalicDensityTh = 0.5 |
Definition at line 85 of file equationdetect.cpp.
| const int tesseract::kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) |
Definition at line 42 of file ambigs.cpp.
| const double tesseract::kMaxBaselineError = 0.4375 |
Definition at line 72 of file colpartition.cpp.
| const double tesseract::kMaxBlobOverlapFactor = 4.0 |
Definition at line 81 of file tablefind.cpp.
| const int tesseract::kMaxBlobWidth = 500 |
Definition at line 44 of file tablefind.cpp.
| const inT16 tesseract::kMaxBoxEdgeDiff = 2 |
Definition at line 32 of file recogtraining.cpp.
| const int tesseract::kMaxBoxesInDataPartition = 20 |
Definition at line 70 of file tablefind.cpp.
| const int tesseract::kMaxCaptionLines = 7 |
Definition at line 39 of file colpartitiongrid.cpp.
| const int tesseract::kMaxCharTopRange = 48 |
Definition at line 66 of file fixxht.cpp.
| const int tesseract::kMaxCircleErosions = 8 |
Definition at line 62 of file pagesegmain.cpp.
| const int tesseract::kMaxCJKSizeRatio = 5 |
Definition at line 69 of file strokewidth.cpp.
| const int tesseract::kMaxColorDistance = 900 |
Definition at line 79 of file colpartition.cpp.
| const int tesseract::kMaxColumnHeaderDistance = 4 |
Definition at line 89 of file tablefind.cpp.
| const double tesseract::kMaxDiacriticDistanceRatio = 1.25 |
Definition at line 83 of file strokewidth.cpp.
| const double tesseract::kMaxDiacriticGapToBaseCharHeight = 1.0 |
Definition at line 86 of file strokewidth.cpp.
| const double tesseract::kMaxDistToPartSizeRatio = 1.5 |
Definition at line 68 of file colfind.cpp.
| const int tesseract::kMaxFillinMultiple = 11 |
Definition at line 47 of file tabvector.cpp.
| const double tesseract::kMaxGapInTextPartition = 4.0 |
Definition at line 73 of file tablefind.cpp.
| const double tesseract::kMaxGutterWidthAbsolute = 2.00 |
Definition at line 51 of file tabfind.cpp.
| const double tesseract::kMaxHorizontalGap = 3.0 |
Definition at line 64 of file tabfind.cpp.
| const int tesseract::kMaxIncompatibleColumnCount = 2 |
Definition at line 56 of file colfind.cpp.
| const int tesseract::kMaxLargeOverlaps = 3 |
Definition at line 109 of file strokewidth.cpp.
| const int tesseract::kMaxLargeOverlapsWithMedium = 12 |
Definition at line 47 of file ccnontextdetect.cpp.
| const int tesseract::kMaxLargeOverlapsWithSmall = 3 |
Definition at line 38 of file ccnontextdetect.cpp.
| const double tesseract::kMaxLeaderGapFractionOfMax = 0.25 |
Definition at line 55 of file colpartition.cpp.
| const double tesseract::kMaxLeaderGapFractionOfMin = 0.5 |
Definition at line 57 of file colpartition.cpp.
| const int tesseract::kMaxLigature = 0xfb17 |
Definition at line 49 of file ligature_table.cpp.
| const int tesseract::kMaxLineLength = 1024 |
Definition at line 290 of file boxchar.cpp.
| const int tesseract::kMaxLineResidue = 6 |
Definition at line 53 of file linefind.cpp.
| const int tesseract::kMaxMediumOverlapsWithSmall = 12 |
Definition at line 43 of file ccnontextdetect.cpp.
| const int tesseract::kMaxNeighbourDistFactor = 4 |
Definition at line 37 of file colpartitiongrid.cpp.
| const double tesseract::kMaxNonLineDensity = 0.25 |
Definition at line 58 of file linefind.cpp.
| const int tesseract::kMaxOffsetDist = 32 |
Definition at line 32 of file intfeaturemap.cpp.
| const int tesseract::kMaxPadFactor = 6 |
Definition at line 34 of file colpartitiongrid.cpp.
| const double tesseract::kMaxParagraphEndingLeftSpaceMultiple = 3.0 |
Definition at line 135 of file tablefind.cpp.
| const double tesseract::kMaxPartitionSpacing = 1.75 |
Definition at line 62 of file colpartitiongrid.cpp.
| const int tesseract::kMaxRaggedSearch = 25 |
Definition at line 39 of file tabfind.cpp.
| const int tesseract::kMaxRealDistance = 2.0 |
Definition at line 38 of file detlinefit.cpp.
| const double tesseract::kMaxRectangularFraction = 0.75 |
Definition at line 46 of file imagefind.cpp.
| const double tesseract::kMaxRectangularGradient = 0.1 |
Definition at line 49 of file imagefind.cpp.
| const int tesseract::kMaxRMSColorNoise = 128 |
Definition at line 76 of file colpartition.cpp.
| const double tesseract::kMaxRowSize = 2.5 |
Definition at line 51 of file tablerecog.cpp.
| const double tesseract::kMaxSameBlockLineSpacing = 3 |
Definition at line 51 of file colpartition.cpp.
| const double tesseract::kMaxSizeRatio = 1.5 |
Definition at line 53 of file colpartition.cpp.
| const int tesseract::kMaxSkewFactor = 15 |
Definition at line 65 of file alignedblob.cpp.
| const double tesseract::kMaxSmallNeighboursPerPix = 1.0 / 32 |
Definition at line 35 of file ccnontextdetect.cpp.
| const double tesseract::kMaxSpacingDrift = 1.0 / 72 |
Definition at line 45 of file colpartition.cpp.
| const double tesseract::kMaxStaveHeight = 1.0 |
Definition at line 60 of file linefind.cpp.
| const double tesseract::kMaxTableCellXheight = 2.0 |
Definition at line 85 of file tablefind.cpp.
| const int tesseract::kMaxTextLineBlobRatio = 5 |
Definition at line 72 of file tabfind.cpp.
| const double tesseract::kMaxTopSpacingFraction = 0.25 |
Definition at line 48 of file colpartition.cpp.
| const int tesseract::kMaxUnicharsPerCluster = 2000 |
Definition at line 51 of file mastertrainer.cpp.
| const int tesseract::kMaxVerticalSearch = 12 |
Definition at line 38 of file tabfind.cpp.
| const int tesseract::kMaxVerticalSpacing = 500 |
Definition at line 42 of file tablefind.cpp.
| const double tesseract::kMaxXProjectionGapFactor = 2.0 |
Definition at line 145 of file tablefind.cpp.
| const double tesseract::kMinAlignedGutter = 0.25 |
Definition at line 53 of file tabvector.cpp.
| const int tesseract::kMinAlignedTabs = 4 |
Definition at line 55 of file alignedblob.cpp.
| const double tesseract::kMinBaselineCoverage = 0.5 |
Definition at line 74 of file colpartition.cpp.
| const int tesseract::kMinBoxesInTextPartition = 10 |
Definition at line 67 of file tablefind.cpp.
| const double tesseract::kMinCaptionGapHeightRatio = 0.5 |
Definition at line 43 of file colpartitiongrid.cpp.
| const double tesseract::kMinCaptionGapRatio = 2.0 |
Definition at line 41 of file colpartitiongrid.cpp.
| const int tesseract::kMinChainTextValue = 3 |
Definition at line 63 of file colpartition.cpp.
| const int tesseract::kMinClusteredShapes = 1 |
Definition at line 49 of file mastertrainer.cpp.
| const int tesseract::kMinColorDifference = 16 |
Definition at line 55 of file imagefind.cpp.
| const int tesseract::kMinColumnWidth = 100 |
Definition at line 53 of file colfind.cpp.
| const int tesseract::kMinCredibleResolution = 70 |
Minimum believable resolution.
Definition at line 58 of file pagesegmain.cpp.
| const double tesseract::kMinDiacriticSizeRatio = 1.0625 |
Definition at line 80 of file strokewidth.cpp.
| const int tesseract::kMinEvaluatedTabs = 3 |
Definition at line 69 of file tabfind.cpp.
| const double tesseract::kMinFilledArea = 0.35 |
Definition at line 61 of file tablerecog.cpp.
| const double tesseract::kMinFractionalLinesInColumn = 0.125 |
Definition at line 45 of file tabfind.cpp.
| const double tesseract::kMinGoodTextPARatio = 1.5 |
Definition at line 63 of file ccnontextdetect.cpp.
| const double tesseract::kMinGutterFraction = 0.5 |
Definition at line 49 of file tabvector.cpp.
| const double tesseract::kMinGutterWidthAbsolute = 0.02 |
Definition at line 49 of file tabfind.cpp.
| const double tesseract::kMinGutterWidthGrid = 0.5 |
Definition at line 65 of file colfind.cpp.
| const double tesseract::kMinImageArea = 0.5 |
Definition at line 77 of file tabfind.cpp.
| const int tesseract::kMinImageFindSize = 100 |
Definition at line 51 of file imagefind.cpp.
| const int tesseract::kMinLeaderCount = 5 |
Definition at line 59 of file colpartition.cpp.
| const int tesseract::kMinLigature = 0xfb00 |
Definition at line 48 of file ligature_table.cpp.
| const int tesseract::kMinLineLengthFraction = 4 |
Denominator applied to the resolution to get the minimum length, in pixels, demanded of a line.
Definition at line 43 of file linefind.cpp.
| const int tesseract::kMinLinesInColumn = 10 |
Definition at line 41 of file tabfind.cpp.
| const double tesseract::kMinMaxGapInTextPartition = 0.5 |
Definition at line 77 of file tablefind.cpp.
| const double tesseract::kMinMusicPixelFraction = 0.75 |
Definition at line 62 of file linefind.cpp.
| const double tesseract::kMinNonNoiseFraction = 0.5 |
Definition at line 63 of file colfind.cpp.
| const int tesseract::kMinOutlierSamples = 5 |
Definition at line 37 of file trainingsampleset.cpp.
| const double tesseract::kMinOverlapWithTable = 0.6 |
Definition at line 101 of file tablefind.cpp.
| const double tesseract::kMinParagraphEndingTextToWhitespaceRatio = 3.0 |
Definition at line 141 of file tablefind.cpp.
| const double tesseract::kMinPCLengthIncrease = 1.0 / 1024 |
Definition at line 33 of file intfeaturemap.cpp.
| const int tesseract::kMinPointsForErrorCount = 16 |
Definition at line 35 of file detlinefit.cpp.
| const double tesseract::kMinRaggedGutter = 1.5 |
Definition at line 55 of file tabvector.cpp.
| const int tesseract::kMinRaggedTabs = 5 |
Definition at line 53 of file alignedblob.cpp.
| const int tesseract::kMinRampSize = 1000 |
Definition at line 37 of file degradeimage.cpp.
| const double tesseract::kMinRectangularFraction = 0.125 |
Definition at line 44 of file imagefind.cpp.
| const int tesseract::kMinRowsInTable = 3 |
Definition at line 116 of file tablefind.cpp.
| const int tesseract::kMinStrongTextValue = 6 |
Definition at line 61 of file colpartition.cpp.
| const double tesseract::kMinTabGradient = 4.0 |
Definition at line 61 of file alignedblob.cpp.
| const int tesseract::kMinTextLineBlobRatio = 3 |
Definition at line 75 of file tabfind.cpp.
| const int tesseract::kMinThickLineWidth = 12 |
Definition at line 49 of file linefind.cpp.
| const int tesseract::kMinVerticalSearch = 3 |
Definition at line 37 of file tabfind.cpp.
| const int tesseract::kMostlyOneDirRatio = 3 |
Definition at line 98 of file strokewidth.cpp.
| const double tesseract::kNeighbourSearchFactor = 2.5 |
Definition at line 111 of file strokewidth.cpp.
| const double tesseract::kNoiseOverlapAreaFactor = 1.0 / 512 |
Definition at line 116 of file strokewidth.cpp.
| const double tesseract::kNoiseOverlapGrowthFactor = 4.0 |
Definition at line 113 of file strokewidth.cpp.
| const int tesseract::kNoisePadding = 4 |
Definition at line 54 of file ccnontextdetect.cpp.
| const int tesseract::kNumEndPoints = 3 |
Definition at line 29 of file detlinefit.cpp.
| const int tesseract::kNumLiteralCnt = 5 |
Definition at line 36 of file tess_lang_model.h.
| const int tesseract::kNumPagesPerMiniBatch = 100 |
Definition at line 38 of file imagedata.h.
| const int tesseract::kOriginalNoiseMultiple = 8 |
Definition at line 50 of file ccnontextdetect.cpp.
| const double tesseract::kParagraphEndingPreviousLineRatio = 1.3 |
Definition at line 131 of file tablefind.cpp.
| const char * tesseract::kPDF = "\u202C" |
Definition at line 31 of file unicodes.cpp.
| const double tesseract::kPhotoOffsetFraction = 0.375 |
Definition at line 57 of file ccnontextdetect.cpp.
| const int tesseract::kPrime1 = 17 |
Definition at line 34 of file trainingsampleset.cpp.
| const int tesseract::kPrime2 = 13 |
Definition at line 35 of file trainingsampleset.cpp.
| const double tesseract::kRaggedFraction = 2.5 |
Definition at line 41 of file alignedblob.cpp.
| const double tesseract::kRaggedGapFraction = 1.0 |
Definition at line 45 of file alignedblob.cpp.
| const int tesseract::kRaggedGutterMultiple = 5 |
Definition at line 53 of file tabfind.cpp.
| const int tesseract::kRandomizingCenter = 128 |
Definition at line 35 of file trainingsample.cpp.
| const double tesseract::kRatingEpsilon = 1.0 / 32 |
Definition at line 34 of file errorcounter.cpp.
| const double tesseract::kRequiredColumns = 0.7 |
Definition at line 46 of file tablerecog.cpp.
| const double tesseract::kRequiredFullJustifiedSpacing = 4.0 |
Definition at line 121 of file tablefind.cpp.
| const char tesseract::kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL" |
| const int tesseract::kRGBRMSColors = 4 |
Definition at line 36 of file colpartition.h.
| const char * tesseract::kRLE = "\u202A" |
Definition at line 30 of file unicodes.cpp.
| const char * tesseract::kRLM = "\u200F" |
Definition at line 29 of file unicodes.cpp.
| const double tesseract::kRMSFitScaling = 8.0 |
Definition at line 53 of file imagefind.cpp.
| const float tesseract::kRotationRange = 0.02f |
Definition at line 31 of file degradeimage.cpp.
| const int tesseract::kRulingVerticalMargin = 3 |
Definition at line 97 of file tablefind.cpp.
| const int tesseract::kSaltnPepper = 5 |
Definition at line 35 of file degradeimage.cpp.
| const int tesseract::kSearchRadius = 2 |
Definition at line 88 of file strokewidth.cpp.
| const int tesseract::kSeedBlobsCountTh = 10 |
Definition at line 87 of file equationdetect.cpp.
| const double tesseract::kShapePerimeterRatio = 3.0 |
Definition at line 118 of file strokewidth.cpp.
| const int tesseract::kSideSpaceMargin = 10 |
Definition at line 106 of file tablefind.cpp.
| const int tesseract::kSimilarRaggedDist = 50 |
Definition at line 45 of file tabvector.cpp.
| const int tesseract::kSimilarVectorDist = 10 |
Definition at line 42 of file tabvector.cpp.
| const int tesseract::ksizeofUniversalAmbigsFile = sizeof(kUniversalAmbigsFile) |
Definition at line 24 of file universalambigs.h.
| const float tesseract::kSizeRatioToReject = 2.0 |
Definition at line 106 of file strokewidth.cpp.
| const double tesseract::kSmallTableProjectionThreshold = 0.35 |
Definition at line 110 of file tablefind.cpp.
| const int tesseract::kSmoothDecisionMargin = 4 |
Definition at line 65 of file colpartitiongrid.cpp.
| const double tesseract::kSmoothFactor = 0.25 |
Definition at line 58 of file tabfind.cpp.
| const double tesseract::kSplitPartitionSize = 2.0 |
Definition at line 48 of file tablefind.cpp.
| const int tesseract::kSquareLimit = 25 |
Definition at line 32 of file trainingsampleset.cpp.
| const int tesseract::kStateCnt = 4 |
Definition at line 35 of file tess_lang_model.h.
| const double tesseract::kStrokeWidthCJK = 2.0 |
Definition at line 56 of file strokewidth.cpp.
| const double tesseract::kStrokeWidthConstantTolerance = 2.0 |
Definition at line 150 of file tablefind.cpp.
| const double tesseract::kStrokeWidthFractionalTolerance = 0.25 |
Definition at line 149 of file tablefind.cpp.
| const double tesseract::kStrokeWidthFractionCJK = 0.25 |
Definition at line 55 of file strokewidth.cpp.
| const double tesseract::kStrokeWidthFractionTolerance = 0.125 |
Allowed proportional change in stroke width to be the same font.
Definition at line 48 of file strokewidth.cpp.
| const double tesseract::kStrokeWidthTolerance = 1.5 |
Allowed constant change in stroke width to be the same font. Really 1.5 pixels.
Definition at line 53 of file strokewidth.cpp.
| const double tesseract::kTableColumnThreshold = 3.0 |
Definition at line 93 of file tablefind.cpp.
| const int tesseract::kTabRadiusFactor = 5 |
Definition at line 35 of file tabfind.cpp.
| const int tesseract::kTestChar = -1 |
Definition at line 30 of file trainingsampleset.cpp.
| const char* tesseract::kTextordDebugPix = "psdebug_pix" |
Definition at line 68 of file alignedblob.cpp.
| const double tesseract::kThickLengthMultiple = 0.75 |
Definition at line 56 of file linefind.cpp.
| const int tesseract::kThinLineFraction = 20 |
Denominator applied to the resolution to get the maximum width, in pixels, allowed for thin lines.
Definition at line 41 of file linefind.cpp.
| const double tesseract::kTinyEnoughTextlineOverlapFraction = 0.25 |
Definition at line 49 of file colpartitiongrid.cpp.
| const float tesseract::kUnclearDensityTh = 0.25 |
Definition at line 86 of file equationdetect.cpp.
| const char tesseract::kUniversalAmbigsFile |
Definition at line 23 of file universalambigs.h.
| const char * tesseract::kUTF8LineSeparator = "\u2028" |
Definition at line 26 of file unicodes.cpp.
| const char * tesseract::kUTF8ParagraphSeparator = "\u2029" |
Definition at line 27 of file unicodes.cpp.
| const double tesseract::kVerticalSpacing = -0.2 |
Definition at line 36 of file tablerecog.cpp.
| const int tesseract::kVLineAlignment = 3 |
Definition at line 47 of file alignedblob.cpp.
| const int tesseract::kVLineGutter = 1 |
Definition at line 49 of file alignedblob.cpp.
| const int tesseract::kVLineMinLength = 500 |
Definition at line 57 of file alignedblob.cpp.
| const int tesseract::kVLineSearchSize = 150 |
Definition at line 51 of file alignedblob.cpp.
| const char* const tesseract::RTLReversePolicyNames[] |
| bool tesseract::textord_dump_table_images = false |
"Paint table detection output"
Definition at line 152 of file tablefind.cpp.
| bool tesseract::textord_show_tables = false |
"Show table regions"
Definition at line 153 of file tablefind.cpp.
| bool tesseract::textord_tabfind_find_tables = true |
"run table detection"
Definition at line 78 of file colfind.cpp.
| bool tesseract::textord_tabfind_only_strokewidths = false |
"Only run stroke widths"
Definition at line 45 of file strokewidth.cpp.
| bool tesseract::textord_tabfind_show_blocks = false |
"Show final block bounds"
Definition at line 77 of file colfind.cpp.
| bool tesseract::textord_tabfind_show_color_fit = false |
"Show stroke widths"
Definition at line 30 of file colpartitiongrid.cpp.
| bool tesseract::textord_tabfind_show_columns = false |
"Show column bounds"
Definition at line 76 of file colfind.cpp.
| bool tesseract::textord_tabfind_show_finaltabs = false |
"Show tab vectors"
Definition at line 84 of file tabfind.cpp.
| bool tesseract::textord_tabfind_show_initial_partitions = false |
"Show partition bounds"
Definition at line 71 of file colfind.cpp.
| bool tesseract::textord_tabfind_show_initialtabs = false |
"Show tab candidates"
Definition at line 83 of file tabfind.cpp.
"Show partition bounds, waiting if >1"
Definition at line 75 of file colfind.cpp.
| bool tesseract::textord_tabfind_show_reject_blobs = false |
"Show blobs rejected as noise"
Definition at line 73 of file colfind.cpp.
"Show stroke widths"
Definition at line 44 of file strokewidth.cpp.
| bool tesseract::textord_tablefind_recognize_tables = false |
"Enables the table recognizer for table layout and filtering."
Definition at line 159 of file tablefind.cpp.
| bool tesseract::textord_tablefind_show_mark = false |
"Debug table marking steps in detail"
Definition at line 155 of file tablefind.cpp.
| bool tesseract::textord_tablefind_show_stats = false |
"Show page stats used in table finding"
Definition at line 157 of file tablefind.cpp.
| double tesseract::textord_tabvector_vertical_box_ratio = 0.5 |
"Fraction of box matches required to declare a line vertical"
Definition at line 61 of file tabvector.cpp.
| double tesseract::textord_tabvector_vertical_gap_fraction = 0.5 |
"max fraction of mean blob width allowed for vertical gaps in vertical text"
"Max fraction of mean blob width allowed for vertical gaps in vertical text"
Definition at line 58 of file tabvector.cpp.
Definition at line 52 of file ccutil.cpp.