Class UkrainianWordTokenizer

java.lang.Object
org.languagetool.tokenizers.uk.UkrainianWordTokenizer
All Implemented Interfaces:
Tokenizer

public class UkrainianWordTokenizer extends Object implements Tokenizer
Tokenizes a sentence into words. Punctuation and whitespace each get their own token. Specific to Ukrainian: apostrophes (U+0027 and U+2019) are not in the list, as they are part of the word.
  • Field Details

    • SPLIT_CHARS

      private static final String SPLIT_CHARS
    • SPLIT_CHARS_REGEX

      private static final Pattern SPLIT_CHARS_REGEX
    • DECIMAL_COMMA_SUBST

      private static final char DECIMAL_COMMA_SUBST
      See Also:
    • NON_BREAKING_SPACE_SUBST

      private static final char NON_BREAKING_SPACE_SUBST
      See Also:
    • NON_BREAKING_DOT_SUBST

      private static final char NON_BREAKING_DOT_SUBST
      See Also:
    • NON_BREAKING_COLON_SUBST

      private static final char NON_BREAKING_COLON_SUBST
      See Also:
    • LEFT_BRACE_SUBST

      private static final char LEFT_BRACE_SUBST
      See Also:
    • RIGHT_BRACE_SUBST

      private static final char RIGHT_BRACE_SUBST
      See Also:
    • NON_BREAKING_SLASH_SUBST

      private static final char NON_BREAKING_SLASH_SUBST
      See Also:
    • LEFT_ANGLE_SUBST

      private static final char LEFT_ANGLE_SUBST
      See Also:
    • RIGHT_ANGLE_SUBST

      private static final char RIGHT_ANGLE_SUBST
      See Also:
    • SLASH_SUBST

      private static final char SLASH_SUBST
      See Also:
    • NON_BREAKING_PLACEHOLDER

      private static final String NON_BREAKING_PLACEHOLDER
      See Also:
    • BREAKING_PLACEHOLDER

      private static final String BREAKING_PLACEHOLDER
      See Also:
    • NON_BREAKING_PLACEHOLDER2

      private static final String NON_BREAKING_PLACEHOLDER2
      See Also:
    • WEIRD_APOSTROPH_PATTERN

      private static final Pattern WEIRD_APOSTROPH_PATTERN
    • WORDS_WITH_BRACKETS_PATTERN

      public static final Pattern WORDS_WITH_BRACKETS_PATTERN
    • DECIMAL_COMMA_PATTERN

      private static final Pattern DECIMAL_COMMA_PATTERN
    • DECIMAL_COMMA_REPL

      private static final String DECIMAL_COMMA_REPL
      See Also:
    • DECIMAL_SPACE_PATTERN

      private static final Pattern DECIMAL_SPACE_PATTERN
    • DASH_NUMBERS_PATTERN

      private static final Pattern DASH_NUMBERS_PATTERN
    • DASH_NUMBERS_REPL

      private static final String DASH_NUMBERS_REPL
      See Also:
    • N_DASH_SPACE_PATTERN

      private static final Pattern N_DASH_SPACE_PATTERN
    • N_DASH_SPACE_PATTERN2

      private static final Pattern N_DASH_SPACE_PATTERN2
    • N_DASH_SPACE_REPL

      private static final String N_DASH_SPACE_REPL
      See Also:
    • DOTTED_NUMBERS_PATTERN

      private static final Pattern DOTTED_NUMBERS_PATTERN
    • DOTTED_NUMBERS_PATTERN3

      private static final Pattern DOTTED_NUMBERS_PATTERN3
    • COLON_NUMBERS_PATTERN

      private static final Pattern COLON_NUMBERS_PATTERN
    • COLON_NUMBERS_REPL

      private static final String COLON_NUMBERS_REPL
      See Also:
    • BRACE_IN_WORD_PATTERN

      private static final Pattern BRACE_IN_WORD_PATTERN
    • XML_TAG_PATTERN

      private static final Pattern XML_TAG_PATTERN
    • INITIALS_DOT_PATTERN_SP_2

      private static final Pattern INITIALS_DOT_PATTERN_SP_2
    • INITIALS_DOT_PATTERN_SP_1

      private static final Pattern INITIALS_DOT_PATTERN_SP_1
    • INITIALS_DOT_PATTERN_RSP_2

      private static final Pattern INITIALS_DOT_PATTERN_RSP_2
    • INITIALS_DOT_PATTERN_RSP_1

      private static final Pattern INITIALS_DOT_PATTERN_RSP_1
    • INITIALS_DOT_REPL_SP_2

      private static final String INITIALS_DOT_REPL_SP_2
      See Also:
    • INITIALS_DOT_REPL_SP_1

      private static final String INITIALS_DOT_REPL_SP_1
      See Also:
    • INITIALS_DOT_REPL_RSP_2

      private static final String INITIALS_DOT_REPL_RSP_2
      See Also:
    • INITIALS_DOT_REPL_RSP_1

      private static final String INITIALS_DOT_REPL_RSP_1
      See Also:
    • ABBR_DOT_VO_PATTERN1

      private static final Pattern ABBR_DOT_VO_PATTERN1
    • ABBR_DOT_VO_PATTERN2

      private static final Pattern ABBR_DOT_VO_PATTERN2
    • ABBR_DOT_VO_PATTERN3

      private static final Pattern ABBR_DOT_VO_PATTERN3
    • ABBR_DOT_TYS_PATTERN1

      private static final Pattern ABBR_DOT_TYS_PATTERN1
    • ABBR_DOT_TYS_PATTERN2

      private static final Pattern ABBR_DOT_TYS_PATTERN2
    • ABBR_DOT_ART_PATTERN

      private static final Pattern ABBR_DOT_ART_PATTERN
    • ABBR_DOT_MAN_PATTERN

      private static final Pattern ABBR_DOT_MAN_PATTERN
    • ABBR_DOT_LAT_PATTERN

      private static final Pattern ABBR_DOT_LAT_PATTERN
    • ABBR_DOT_PROF_PATTERN

      private static final Pattern ABBR_DOT_PROF_PATTERN
    • ABBR_DOT_GUB_PATTERN

      private static final Pattern ABBR_DOT_GUB_PATTERN
    • ABBR_DOT_DASH_PATTERN

      private static final Pattern ABBR_DOT_DASH_PATTERN
    • ABBR_DOT_KUB_SM_PATTERN

      private static final Pattern ABBR_DOT_KUB_SM_PATTERN
    • ABBR_DOT_S_G_PATTERN

      private static final Pattern ABBR_DOT_S_G_PATTERN
    • ABBR_DOT_CHL_KOR_PATTERN

      private static final Pattern ABBR_DOT_CHL_KOR_PATTERN
    • ABBR_DOT_PN_ZAH_PATTERN

      private static final Pattern ABBR_DOT_PN_ZAH_PATTERN
    • INVALID_MLN_DOT_PATTERN

      private static final Pattern INVALID_MLN_DOT_PATTERN
    • ABBR_DOT_2_SMALL_LETTERS_PATTERN

      private static final Pattern ABBR_DOT_2_SMALL_LETTERS_PATTERN
    • ABBR_DOT_2_SMALL_LETTERS_REPL

      private static final String ABBR_DOT_2_SMALL_LETTERS_REPL
      See Also:
    • ONE_DOT_TWO_REPL

      private static final String ONE_DOT_TWO_REPL
      See Also:
    • ABBR_DOT_NON_ENDING_PATTERN

      private static final Pattern ABBR_DOT_NON_ENDING_PATTERN
    • ABBR_DOT_NON_ENDING_PATTERN_2

      private static final Pattern ABBR_DOT_NON_ENDING_PATTERN_2
    • ABBR_DOT_NAR_PATTERN_1

      private static final Pattern ABBR_DOT_NAR_PATTERN_1
    • ABBR_DOT_NAR_PATTERN_2

      private static final Pattern ABBR_DOT_NAR_PATTERN_2
    • ABBR_DOT_ENDING_PATTERN

      private static final Pattern ABBR_DOT_ENDING_PATTERN
    • ABBR_DOT_I_T_P_PATTERN

      private static final Pattern ABBR_DOT_I_T_P_PATTERN
    • ABBR_DOT_I_T_CH_PATTERN

      private static final Pattern ABBR_DOT_I_T_CH_PATTERN
    • ABBR_DOT_T_ZV_PATTERN

      private static final Pattern ABBR_DOT_T_ZV_PATTERN
    • ABBR_AT_THE_END

      private static final Pattern ABBR_AT_THE_END
    • APOSTROPHE_BEGIN_PATTERN

      private static final Pattern APOSTROPHE_BEGIN_PATTERN
    • APOSTROPHE_END_PATTER

      private static final Pattern APOSTROPHE_END_PATTER
    • YEAR_WITH_R

      private static final Pattern YEAR_WITH_R
    • COMPOUND_WITH_QUOTES1

      private static final Pattern COMPOUND_WITH_QUOTES1
    • COMPOUND_WITH_QUOTES2

      private static final Pattern COMPOUND_WITH_QUOTES2
    • ABBR_DOT_RED_AVT_PATTERN

      private static final Pattern ABBR_DOT_RED_AVT_PATTERN
    • SOFT_HYPHEN_WRAP

      private static final String SOFT_HYPHEN_WRAP
      See Also:
    • SOFT_HYPHEN_WRAP_SUBST

      private static final String SOFT_HYPHEN_WRAP_SUBST
      See Also:
    • URL_PATTERN

      private static final Pattern URL_PATTERN
    • URL_START_REPLACE_CHAR

      private static final int URL_START_REPLACE_CHAR
      See Also:
    • LEADING_DASH_PATTERN

      private static final Pattern LEADING_DASH_PATTERN
    • LEADING_DASH_PATTERN_2

      private static final Pattern LEADING_DASH_PATTERN_2
    • NUMBER_MISSING_SPACE

      private static final Pattern NUMBER_MISSING_SPACE
    • WEB_ENTITIES

      private static final Pattern WEB_ENTITIES
    • WEB_ENTITIES2

      private static final Pattern WEB_ENTITIES2
  • Constructor Details

    • UkrainianWordTokenizer

      public UkrainianWordTokenizer()
  • Method Details