4 #include <unordered_map> 9 #include "unicode/uchar.h" 10 #include "unicode/uscript.h" 42 std::vector<std::vector<char32>>*
dest) {
44 std::vector<std::vector<char32>> graphemes;
59 std::unique_ptr<Validator> validator(
61 for (
const auto& grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme,
dest)) {
75 return std::unique_ptr<Validator>(
78 return std::unique_ptr<Validator>(
81 return std::unique_ptr<Validator>(
84 return std::unique_ptr<Validator>(
87 return std::unique_ptr<Validator>(
99 std::vector<std::vector<char32>>*
dest) {
115 std::vector<std::vector<char32>>*
dest) {
126 dest->push_back(std::vector<char32>());
135 static bool CmpPairSecond(
const std::pair<int, int>& p1,
136 const std::pair<int, int>& p2) {
137 return p1.second < p2.second;
144 const std::vector<char32>& utf32) {
145 std::unordered_map<int, int> histogram;
152 UScriptCode script_code = uscript_getScript(ch, err);
154 script_code != USCRIPT_COMMON) ||
155 script_code == USCRIPT_MYANMAR) {
156 if (script_code == USCRIPT_MYANMAR)
161 if (!histogram.empty()) {
163 std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)
182 (unicode & 0x7f) == 0x4d) ||
192 return (0x1cd0 <= unicode && unicode < 0x1d00) ||
193 (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
194 (0x951 <= unicode && unicode <= 0x954);
207 codes_.reserve(text.size());
virtual bool ConsumeGraphemeIfValid()=0
static const char32 kJavaneseVirama
void ComputeClassCodes(const std::vector< char32 > &text)
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
static const char32 kMaxJavaneseUnicode
static const char32 kRightToLeftMark
static const char32 kMaxSinhalaUnicode
static const char32 kZeroWidthNonJoiner
std::vector< IndicPair > codes_
bool IsSubscriptScript() const
static const char32 kZeroWidthJoiner
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
static const char32 kMinIndicUnicode
static const char32 kInvalid
static const char32 kKhmerVirama
static bool IsVedicAccent(char32 unicode)
static const int kIndicCodePageSize
static const char32 kZeroWidthSpace
static const char32 kLeftToRightMark
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
std::vector< std::vector< char32 > > parts_
std::vector< char32 > output_
static const char32 kSinhalaVirama
static const char32 kMyanmarVirama
virtual CharClass UnicodeToCharClass(char32 ch) const =0
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
static bool IsVirama(char32 unicode)