|
tesseract 3.04.01
|
00001 00002 // File: unichar.cpp 00003 // Description: Unicode character/ligature class. 00004 // Author: Ray Smith 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include "unichar.h" 00021 #include "errcode.h" 00022 #include "genericvector.h" 00023 #include "tprintf.h" 00024 00025 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF 00026 00027 // Construct from a utf8 string. If len<0 then the string is null terminated. 00028 // If the string is too long to fit in the UNICHAR then it takes only what 00029 // will fit. Checks for illegal input and stops at an illegal sequence. 00030 // The resulting UNICHAR may be empty. 00031 UNICHAR::UNICHAR(const char* utf8_str, int len) { 00032 int total_len = 0; 00033 int step = 0; 00034 if (len < 0) { 00035 for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len); 00036 } 00037 for (total_len = 0; total_len < len; total_len += step) { 00038 step = utf8_step(utf8_str + total_len); 00039 if (total_len + step > UNICHAR_LEN) 00040 break; // Too long. 00041 if (step == 0) 00042 break; // Illegal first byte. 00043 int i; 00044 for (i = 1; i < step; ++i) 00045 if ((utf8_str[total_len + i] & 0xc0) != 0x80) 00046 break; 00047 if (i < step) 00048 break; // Illegal surrogate 00049 } 00050 memcpy(chars, utf8_str, total_len); 00051 if (total_len < UNICHAR_LEN) { 00052 chars[UNICHAR_LEN - 1] = total_len; 00053 while (total_len < UNICHAR_LEN - 1) 00054 chars[total_len++] = 0; 00055 } 00056 } 00057 00058 // Construct from a single UCS4 character. Illegal values are ignored, 00059 // resulting in an empty UNICHAR. 00060 UNICHAR::UNICHAR(int unicode) { 00061 const int bytemask = 0xBF; 00062 const int bytemark = 0x80; 00063 00064 if (unicode < 0x80) { 00065 chars[UNICHAR_LEN - 1] = 1; 00066 chars[2] = 0; 00067 chars[1] = 0; 00068 chars[0] = static_cast<char>(unicode); 00069 } else if (unicode < 0x800) { 00070 chars[UNICHAR_LEN - 1] = 2; 00071 chars[2] = 0; 00072 chars[1] = static_cast<char>((unicode | bytemark) & bytemask); 00073 unicode >>= 6; 00074 chars[0] = static_cast<char>(unicode | 0xc0); 00075 } else if (unicode < 0x10000) { 00076 chars[UNICHAR_LEN - 1] = 3; 00077 chars[2] = static_cast<char>((unicode | bytemark) & bytemask); 00078 unicode >>= 6; 00079 chars[1] = static_cast<char>((unicode | bytemark) & bytemask); 00080 unicode >>= 6; 00081 chars[0] = static_cast<char>(unicode | 0xe0); 00082 } else if (unicode <= UNI_MAX_LEGAL_UTF32) { 00083 chars[UNICHAR_LEN - 1] = 4; 00084 chars[3] = static_cast<char>((unicode | bytemark) & bytemask); 00085 unicode >>= 6; 00086 chars[2] = static_cast<char>((unicode | bytemark) & bytemask); 00087 unicode >>= 6; 00088 chars[1] = static_cast<char>((unicode | bytemark) & bytemask); 00089 unicode >>= 6; 00090 chars[0] = static_cast<char>(unicode | 0xf0); 00091 } else { 00092 memset(chars, 0, UNICHAR_LEN); 00093 } 00094 } 00095 00096 // Get the first character as UCS-4. 00097 int UNICHAR::first_uni() const { 00098 static const int utf8_offsets[5] = { 00099 0, 0, 0x3080, 0xE2080, 0x3C82080 00100 }; 00101 int uni = 0; 00102 int len = utf8_step(chars); 00103 const char* src = chars; 00104 00105 switch (len) { 00106 default: 00107 break; 00108 case 4: 00109 uni += static_cast<unsigned char>(*src++); 00110 uni <<= 6; 00111 case 3: 00112 uni += static_cast<unsigned char>(*src++); 00113 uni <<= 6; 00114 case 2: 00115 uni += static_cast<unsigned char>(*src++); 00116 uni <<= 6; 00117 case 1: 00118 uni += static_cast<unsigned char>(*src++); 00119 } 00120 uni -= utf8_offsets[len]; 00121 return uni; 00122 } 00123 00124 // Get a terminated UTF8 string: Must delete[] it after use. 00125 char* UNICHAR::utf8_str() const { 00126 int len = utf8_len(); 00127 char* str = new char[len + 1]; 00128 memcpy(str, chars, len); 00129 str[len] = 0; 00130 return str; 00131 } 00132 00133 // Get the number of bytes in the first character of the given utf8 string. 00134 int UNICHAR::utf8_step(const char* utf8_str) { 00135 static const char utf8_bytes[256] = { 00136 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00137 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00138 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00139 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00140 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00141 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00142 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 00143 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0 00144 }; 00145 00146 return utf8_bytes[static_cast<unsigned char>(*utf8_str)]; 00147 } 00148 00149 UNICHAR::const_iterator& UNICHAR::const_iterator::operator++() { 00150 ASSERT_HOST(it_ != NULL); 00151 int step = utf8_step(it_); 00152 if (step == 0) { 00153 tprintf("ERROR: Illegal UTF8 encountered.\n"); 00154 for (int i = 0; i < 5 && it_[i] != '\0'; ++i) { 00155 tprintf("Index %d char = 0x%x\n", i, it_[i]); 00156 } 00157 step = 1; 00158 } 00159 it_ += step; 00160 return *this; 00161 } 00162 00163 int UNICHAR::const_iterator::operator*() const { 00164 ASSERT_HOST(it_ != NULL); 00165 const int len = utf8_step(it_); 00166 if (len == 0) { 00167 tprintf("WARNING: Illegal UTF8 encountered\n"); 00168 return ' '; 00169 } 00170 UNICHAR uch(it_, len); 00171 return uch.first_uni(); 00172 } 00173 00174 int UNICHAR::const_iterator::get_utf8(char* utf8_output) const { 00175 ASSERT_HOST(it_ != NULL); 00176 const int len = utf8_step(it_); 00177 if (len == 0) { 00178 tprintf("WARNING: Illegal UTF8 encountered\n"); 00179 utf8_output[0] = ' '; 00180 return 1; 00181 } 00182 strncpy(utf8_output, it_, len); 00183 return len; 00184 } 00185 00186 int UNICHAR::const_iterator::utf8_len() const { 00187 ASSERT_HOST(it_ != NULL); 00188 const int len = utf8_step(it_); 00189 if (len == 0) { 00190 tprintf("WARNING: Illegal UTF8 encountered\n"); 00191 return 1; 00192 } 00193 return len; 00194 } 00195 00196 bool UNICHAR::const_iterator::is_legal() const { 00197 return utf8_step(it_) > 0; 00198 } 00199 00200 UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) { 00201 return UNICHAR::const_iterator(utf8_str); 00202 } 00203 00204 UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) { 00205 return UNICHAR::const_iterator(utf8_str + len); 00206 } 00207 00208 // Converts a utf-8 string to a vector of unicodes. 00209 // Returns false if the input contains invalid UTF-8, and replaces 00210 // the rest of the string with a single space. 00211 bool UNICHAR::UTF8ToUnicode(const char* utf8_str, 00212 GenericVector<int>* unicodes) { 00213 const int utf8_length = strlen(utf8_str); 00214 const_iterator end_it(end(utf8_str, utf8_length)); 00215 for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) { 00216 if (it.is_legal()) { 00217 unicodes->push_back(*it); 00218 } else { 00219 unicodes->push_back(' '); 00220 return false; 00221 } 00222 } 00223 return true; 00224 } 00225