46#include "blocxx/BLOCXX_config.h"
84UInt8 SequenceLengthTable[256] =
86 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
87 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
88 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
89 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
90 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
91 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
98 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
99 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
100 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
101 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
// Scan cursor, starting at the first byte of utf8str.
// NOTE(review): the parameter name matches charCount(const char* utf8str), so
// these two statements presumably belong to that function - confirm.
107 const char* p = utf8str;
// True for every byte value outside 0x80..0xBF, i.e. for every byte that is
// not a UTF-8 continuation byte. Testing this once per byte identifies the
// first byte of each encoded character.
// NOTE(review): the declaration of c is not visible; the comparison is only
// correct if c holds the byte as an unsigned value - confirm.
113 if (c < 0x80 || c > 0xBF)
// Narrow c to a single 16-bit UCS2 unit.
// NOTE(review): the cast keeps only the low 16 bits, so a value above 0xFFFF
// would be silently truncated unless it is rejected before this point; no
// such guard is visible here - confirm.
131 return static_cast<UInt16
>(c);
// Decode the UTF-8 sequence that starts at utf8char into one UCS4 value.
// The lead byte c0 selects the sequence length through SequenceLengthTable,
// and each branch combines the payload bits of the lead byte with the low six
// bits (0x3f) of every following byte.
145 const char* p = utf8char;
146 const UInt32 c0 =
static_cast<UInt8>(p[0]);
// Sentinel value for input that cannot be decoded.
// NOTE(review): the statements returning it are not visible here - confirm.
147 const UInt32 bad = 0xFFFFFFFF;
148 switch (SequenceLengthTable[c0])
// 2-byte form 110xxxxx 10xxxxxx: five payload bits in the lead byte (0x1f).
161 const UInt32 c1 =
static_cast<UInt8>(p[1]);
162 return ((c0 & 0x1fu) << 6) | (c1 & 0x3fu);
// 3-byte form: a NUL in either trailing position means the sequence is cut
// short by the end of the string.
167 if (p[1] ==
'\0' || p[2] ==
'\0')
171 const UInt32 c1 =
static_cast<UInt8>(p[1]);
172 const UInt32 c2 =
static_cast<UInt8>(p[2]);
// 1110xxxx 10xxxxxx 10xxxxxx: four payload bits in the lead byte (0x0f).
173 return ((c0 & 0x0fu) << 12) | ((c1 & 0x3fu) << 6) | (c2 & 0x3fu);
// 4-byte form: same truncation check over the three trailing bytes.
178 if (p[1] ==
'\0' || p[2] ==
'\0' || p[3] ==
'\0')
183 const UInt32 c1 =
static_cast<UInt8>(p[1]);
184 const UInt32 c2 =
static_cast<UInt8>(p[2]);
185 const UInt32 c3 =
static_cast<UInt8>(p[3]);
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: the lead byte carries THREE payload
// bits, so the mask must be 0x07. A two-bit mask (0x03) would drop bit 20 of
// the result and decode every sequence led by 0xF4 (U+100000..U+10FFFF) to the
// wrong code point.
187 return ((c0 & 0x07u) << 18) | ((c1 & 0x3fu) << 12) | ((c2 & 0x3fu) << 6) | (c3 & 0x3fu);
// Append the UTF-8 encoding of ucs4char to sb, one byte per `sb +=`.
// The size of the value selects the encoded length: below 0x80 one byte,
// below 0x800 two bytes, below 0x10000 three bytes, otherwise four bytes.
// Every trailing byte is 0x80 plus six payload bits (mask 0x3f).

// 1 byte: the value is stored unchanged.
209 if (ucs4char < 0x80u)
212 sb +=
static_cast<char>(
static_cast<UInt8>(ucs4char));
// 2 bytes: lead 0xc0 carries the bits above bit 5.
214 else if (ucs4char < 0x800u)
216 sb +=
static_cast<char>(
static_cast<UInt8>(0xc0u | (ucs4char >> 6)));
217 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
// 3 bytes: lead 0xe0 carries the bits above bit 11.
219 else if (ucs4char < 0x10000u)
221 sb +=
static_cast<char>(
static_cast<UInt8>(0xe0u | (ucs4char >> 12)));
222 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 6) & 0x3fu)));
223 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
// 4 bytes: lead 0xf0 carries the bits above bit 17.
// NOTE(review): (ucs4char >> 18) is not masked, so a value of 0x200000 or
// more would spill into the 0xf0 marker bits; whether such values are
// rejected before this point is not visible here - confirm.
227 sb +=
static_cast<char>(
static_cast<UInt8>(0xf0u | (ucs4char >> 18)));
228 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 12) & 0x3fu)));
229 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 6) & 0x3fu)));
230 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
// Same encoding steps as the StringBuffer variant, but the bytes are stored
// directly at p[0]..p[3] instead of being appended to a buffer.
// NOTE(review): the enclosing function's name and signature are not visible
// here; p must have room for up to four bytes - confirm against the caller.

// 1 byte: the value is stored unchanged.
239 if (ucs4char < 0x80u)
242 p[0] =
static_cast<char>(
static_cast<UInt8>(ucs4char));
// 2 bytes: 0xc0 lead followed by one 0x80 trailing byte.
244 else if (ucs4char < 0x800u)
246 p[0] =
static_cast<char>(
static_cast<UInt8>(0xc0u | (ucs4char >> 6)));
247 p[1] =
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
// 3 bytes: 0xe0 lead followed by two 0x80 trailing bytes.
249 else if (ucs4char < 0x10000u)
251 p[0] =
static_cast<char>(
static_cast<UInt8>(0xe0u | (ucs4char >> 12)));
252 p[1] =
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 6) & 0x3fu)));
253 p[2] =
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
// 4 bytes: 0xf0 lead followed by three 0x80 trailing bytes.
257 p[0] =
static_cast<char>(
static_cast<UInt8>(0xf0u | (ucs4char >> 18)));
258 p[1] =
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 12) & 0x3fu)));
259 p[2] =
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 6) & 0x3fu)));
260 p[3] =
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
// Shared implementation for converting a UTF-8 String into an array of
// 16-bit UCS2 units. For each lead byte the sequence length is looked up in
// SequenceLengthTable; decodable sequences are pushed onto rval, and every
// failure path shows two outcomes side by side: throwing
// InvalidUTF8Exception or pushing UCS2ReplacementChar.
// NOTE(review): throwException presumably selects between those two
// outcomes; the branching statements are not visible here - confirm.
265Array<UInt16> StringToUCS2Common(
const String& input,
bool throwException)
// U+FFFD, pushed in place of anything that cannot be converted.
270 const UInt16 UCS2ReplacementChar = 0xFFFD;
// [begin, end) spans the string's bytes as reported by length().
271 const char* begin = input.c_str();
272 const char* end = begin + input.length();
274 const char* p = begin;
277 const UInt32 c0 =
static_cast<UInt8>(p[0]);
278 switch (SequenceLengthTable[c0])
// Failure report: the table length for c0, the whole input, and the text
// remaining from p onward.
293 BLOCXX_THROW(InvalidUTF8Exception, Format(
"Length: %1, input = %2, p = %3",
294 static_cast<int>(SequenceLengthTable[c0]), input.c_str(), p).c_str());
298 rval.push_back(UCS2ReplacementChar);
// 2-byte sequence: five payload bits from c0, six from c1.
302 const UInt32 c1 =
static_cast<UInt8>(p[1]);
303 rval.push_back(((c0 & 0x1fu) << 6) | (c1 & 0x3fu));
// 3-byte sequence: a NUL in either trailing position means it is truncated.
310 if (p[1] ==
'\0' || p[2] ==
'\0')
314 BLOCXX_THROW(InvalidUTF8Exception, Format(
"Length: %1, input = %2, p = %3",
315 static_cast<int>(SequenceLengthTable[c0]), input.c_str(), p).c_str());
319 rval.push_back(UCS2ReplacementChar);
324 const UInt32 c1 =
static_cast<UInt8>(p[1]);
325 const UInt32 c2 =
static_cast<UInt8>(p[2]);
// Four payload bits from c0 plus six each from c1 and c2: at most 16 bits, so
// the result fits one UCS2 unit.
326 rval.push_back(((c0 & 0x0fu) << 12) | ((c1 & 0x3fu) << 6) | (c2 & 0x3fu));
// NOTE(review): the two remaining throw/replace pairs presumably handle
// 4-byte sequences (more than 16 bits, so not representable in one UCS2 unit)
// and invalid lead bytes; their case labels are not visible here - confirm.
335 BLOCXX_THROW(InvalidUTF8Exception, Format(
"Length: %1, input = %2, p = %3",
336 static_cast<int>(SequenceLengthTable[c0]), input.c_str(), p).c_str());
340 rval.push_back(UCS2ReplacementChar);
350 BLOCXX_THROW(InvalidUTF8Exception, Format(
"Length: %1, input = %2, p = %3",
351 static_cast<int>(SequenceLengthTable[c0]), input.c_str(), p).c_str());
355 rval.push_back(UCS2ReplacementChar);
// Two public entry points that forward to StringToUCS2Common and differ only
// in the throwException argument.
// NOTE(review): the function headers are not visible here. The call passing
// false is presumably StringToUCS2ReplaceInvalid and the call passing true
// presumably StringToUCS2 - confirm.
369 return StringToUCS2Common(input,
false);
375 return StringToUCS2Common(input,
true);
// inputLength is treated as a byte count: each UCS2 unit takes two bytes, and
// the integer division ignores a trailing odd byte.
384 size_t numchars = inputLength/2;
// Encode each 16-bit unit in turn and append its UTF-8 form to sb.
386 for (
size_t i = 0;
i < numchars; ++
i)
// NOTE(review): input is reinterpreted as an array of UInt16, which assumes
// the buffer is suitably aligned for 16-bit reads and holds units in host
// byte order - confirm against callers.
388 UCS4toUTF8(
reinterpret_cast<const UInt16*
>(input)[
i], sb);
// Classify ucs4char against the thresholds 0x80, 0x800 and 0x10000, the same
// boundaries the UTF-8 encoder uses to choose between 1, 2, 3 and 4 bytes.
// NOTE(review): the return statements are not visible here; presumably each
// branch returns the encoded length in bytes (1, 2, 3, otherwise 4) - confirm.
416int UTF8CharLen(UInt32 ucs4char)
418 if (ucs4char < 0x80u)
422 else if (ucs4char < 0x800u)
426 else if (ucs4char < 0x10000u)
// Apply transformer to the characters of a UTF-8 string, writing the result
// back into the same buffer (output starts at input). transformer is called
// with a UInt32 code point and its result is used as the replacement.
436template <
typename TransformT>
437bool transformInPlace(
char* input, TransformT transformer)
// Write cursor; begins at the start of the buffer being read.
440 char* output = input;
// 0xFFFFFFFF is tested as the "could not decode" marker.
// NOTE(review): ucs4char is presumably the result of UTF8toUCS4 on the
// current position; that statement is not visible here - confirm.
444 if (ucs4char == 0xFFFFFFFF)
450 UInt32 newUcs4Char = transformer(ucs4char);
// Encoded length of the original character, taken from its lead byte.
452 const UInt32 c0 =
static_cast<UInt8>(p[0]);
453 int prevCharLen = SequenceLengthTable[c0];
// Encoded length of the transformed character.
// NOTE(review): writing in place is only safe when the new encoding is not
// longer than the old one; how a length mismatch is handled is not visible
// here - confirm.
454 int newCharLen = UTF8CharLen(newUcs4Char);
// Advance the write cursor past the character just stored.
470 output += newCharLen;
// Copying counterpart of transformInPlace: reads the UTF-8 text at input,
// accumulates the result in a StringBuffer, and returns it as a String, so
// the input buffer is not written to.
476template <
typename TransformT>
477String transform(
const char* input, TransformT transformer)
// The buffer is constructed with the input's byte length.
// NOTE(review): presumably this is an initial capacity - confirm against
// the StringBuffer constructor.
479 StringBuffer rval(strlen(input));
480 const char* p = input;
// 0xFFFFFFFF is tested as the "could not decode" marker.
// NOTE(review): handling of that case is not visible here - confirm.
484 if (ucs4char == 0xFFFFFFFF)
// Encoded length of the current source character, taken from its lead byte.
493 const UInt32 c0 =
static_cast<UInt8>(p[0]);
494 int prevCharLen = SequenceLengthTable[c0];
// Hand the accumulated text over as the returned String.
497 return rval.releaseString();
508const CaseMapping lowerMappings[] =
1266const CaseMapping upperMappings[] =
2033const CaseMapping*
const lowerMappingsEnd = lowerMappings +
2034 (
sizeof(lowerMappings)/
sizeof(lowerMappings[0]));
2036const CaseMapping*
const upperMappingsEnd = upperMappings +
2037 (
sizeof(upperMappings)/
sizeof(upperMappings[0]));
2039struct MappingOrdering
2041 bool operator()(
const CaseMapping& x,
const CaseMapping& y)
2043 return x.codePoint < y.codePoint;
// Functor that looks a code point up in a CaseMapping table delimited by
// [begin, end).
2049 Transformer(
const CaseMapping*
const begin,
const CaseMapping*
const end)
// Look `in` up in the table given at construction.
2055 UInt32 operator()(UInt32 in)
const
// Probe entry: only its code point matters to MappingOrdering.
2057 CaseMapping val = { in, 0 };
// Binary search; this requires the table to be sorted by codePoint in
// ascending order.
2058 const CaseMapping*
i = std::lower_bound(m_begin, m_end, val, MappingOrdering());
// lower_bound returns the first entry not less than `in`, so an exact match
// still has to be verified.
// NOTE(review): the not-found branch presumably returns `in` unchanged and
// the found branch the entry's mapped value; neither is visible here - confirm.
2059 if (
i == m_end ||
i->codePoint != in)
// Bodies of the four case-conversion entry points: each pairs one traversal
// (transformInPlace or transform) with a Transformer over one mapping table.
// NOTE(review): the function headers are not visible here. In order these
// are presumably toUpperCaseInPlace, toUpperCase, toLowerCaseInPlace and
// toLowerCase - confirm.
2077 return transformInPlace(input, Transformer(upperMappings, upperMappingsEnd));
2083 return transform(input, Transformer(upperMappings, upperMappingsEnd));
2089 return transformInPlace(input, Transformer(lowerMappings, lowerMappingsEnd));
2095 return transform(input, Transformer(lowerMappings, lowerMappingsEnd));
#define BLOCXX_ASSERT(CON)
BLOCXX_ASSERT works similarly to the assert() macro, but instead of calling abort(),...
#define BLOCXX_DEFINE_EXCEPTION_WITH_ID(NAME)
Define a new exception class named <NAME>Exception that derives from Exception.
#define BLOCXX_THROW(exType, msg)
Throw an exception using FILE and LINE.
const CaseMapping *const m_end
const CaseMapping *const m_begin
Array<> wraps std::vector<> in COWReference<> adding ref counting and copy on write capability.
This String class is an abstract data type that represents a NULL-terminated string of characters.
String toUpperCase(const char *input)
Convert the UTF-8 string to upper case and return the result.
String UCS2toUTF8(UInt16 ucs2char)
Convert one UCS2 16-bit char into a UTF-8 char (possibly multiple bytes)
bool toUpperCaseInPlace(char *input)
Convert the UTF-8 string to upper case.
String toLowerCase(const char *input)
Convert the UTF-8 string to lower case and return the result.
String UCS2ToString(const void *input, size_t inputLength)
Convert a UCS2 string into a UTF-8 (or ASCII) string.
Array< UInt16 > StringToUCS2(const String &input)
Convert a UTF-8 (or ASCII) string into a UCS2 string.
String UCS4toUTF8(UInt32 ucs4char)
Convert one UCS4 32-bit char into a UTF-8 char (possibly multiple bytes)
size_t charCount(const char *utf8str)
Count the number of UTF-8 chars in the string.
UInt16 UTF8toUCS2(const char *utf8char)
Convert one UTF-8 char (possibly multiple bytes) into a UCS2 16-bit char.
bool toLowerCaseInPlace(char *input)
Convert the UTF-8 string to lower case.
UInt32 UTF8toUCS4(const char *utf8char)
Convert one UTF-8 char (possibly multiple bytes) into a UCS4 32-bit char.
Array< UInt16 > StringToUCS2ReplaceInvalid(const String &input)
Convert a UTF-8 (or ASCII) string into a UCS2 string.