// tesseract 3.04.01
00001 /********************************************************************** 00002 * File: cube_utils.cpp 00003 * Description: Implementation of the Cube Utilities Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include <math.h> 00021 #include <string> 00022 #include <vector> 00023 #include "cube_utils.h" 00024 #include "char_set.h" 00025 #include "unichar.h" 00026 00027 namespace tesseract { 00028 CubeUtils::CubeUtils() { 00029 } 00030 00031 CubeUtils::~CubeUtils() { 00032 } 00033 00037 int CubeUtils::Prob2Cost(double prob_val) { 00038 if (prob_val < MIN_PROB) { 00039 return MIN_PROB_COST; 00040 } 00041 return static_cast<int>(-log(prob_val) * PROB2COST_SCALE); 00042 } 00043 00047 double CubeUtils::Cost2Prob(int cost) { 00048 return exp(-cost / PROB2COST_SCALE); 00049 } 00050 00054 int CubeUtils::StrLen(const char_32 *char_32_ptr) { 00055 if (char_32_ptr == NULL) { 00056 return 0; 00057 } 00058 int len = -1; 00059 while (char_32_ptr[++len]); 00060 return len; 00061 } 00062 00066 int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) { 00067 const char_32 *pch1 = str1; 00068 const char_32 *pch2 = str2; 00069 00070 for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) { 00071 if ((*pch1) != (*pch2)) { 00072 
return (*pch1) - (*pch2); 00073 } 00074 } 00075 00076 if ((*pch1) == 0) { 00077 if ((*pch2) == 0) { 00078 return 0; 00079 } else { 00080 return -1; 00081 } 00082 } else { 00083 return 1; 00084 } 00085 } 00086 00090 char_32 *CubeUtils::StrDup(const char_32 *str32) { 00091 int len = StrLen(str32); 00092 char_32 *new_str = new char_32[len + 1]; 00093 if (new_str == NULL) { 00094 return NULL; 00095 } 00096 memcpy(new_str, str32, len * sizeof(*str32)); 00097 new_str[len] = 0; 00098 return new_str; 00099 } 00100 00104 CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top, 00105 int wid, int hgt) { 00106 // get the raw img data from the image 00107 unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt); 00108 if (temp_buff == NULL) { 00109 return NULL; 00110 } 00111 00112 // create a char samp from temp buffer 00113 CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff); 00114 00115 // clean up temp buffer 00116 delete []temp_buff; 00117 return char_samp; 00118 } 00119 00123 Pix *CubeUtils::PixFromCharSample(CharSamp *char_samp) { 00124 // parameter check 00125 if (char_samp == NULL) { 00126 return NULL; 00127 } 00128 00129 // get the raw data 00130 int stride = char_samp->Stride(); 00131 int wid = char_samp->Width(); 00132 int hgt = char_samp->Height(); 00133 00134 Pix *pix = pixCreate(wid, hgt, 1); 00135 if (pix == NULL) { 00136 return NULL; 00137 } 00138 00139 // copy the contents 00140 unsigned char *line = char_samp->RawData(); 00141 for (int y = 0; y < hgt ; y++, line += stride) { 00142 for (int x = 0; x < wid; x++) { 00143 if (line[x] != 0) { 00144 pixSetPixel(pix, x, y, 0); 00145 } else { 00146 pixSetPixel(pix, x, y, 255); 00147 } 00148 } 00149 } 00150 00151 return pix; 00152 } 00153 00157 unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top, 00158 int wid, int hgt) { 00159 // skip invalid dimensions 00160 if (left < 0 || top < 0 || wid < 0 || hgt < 0 || 00161 (left + wid) > pix->w || (top + hgt) > pix->h 
|| 00162 pix->d != 1) { 00163 return NULL; 00164 } 00165 00166 // copy the char img to a temp buffer 00167 unsigned char *temp_buff = new unsigned char[wid * hgt]; 00168 if (temp_buff == NULL) { 00169 return NULL; 00170 } 00171 l_int32 w; 00172 l_int32 h; 00173 l_int32 d; 00174 l_int32 wpl; 00175 l_uint32 *line; 00176 l_uint32 *data; 00177 00178 pixGetDimensions(pix, &w, &h, &d); 00179 wpl = pixGetWpl(pix); 00180 data = pixGetData(pix); 00181 line = data + (top * wpl); 00182 00183 for (int y = 0, off = 0; y < hgt ; y++) { 00184 for (int x = 0; x < wid; x++, off++) { 00185 temp_buff[off] = GET_DATA_BIT(line, x + left) ? 0 : 255; 00186 } 00187 line += wpl; 00188 } 00189 return temp_buff; 00190 } 00191 00195 bool CubeUtils::ReadFileToString(const string &file_name, string *str) { 00196 str->clear(); 00197 FILE *fp = fopen(file_name.c_str(), "rb"); 00198 if (fp == NULL) { 00199 return false; 00200 } 00201 00202 // get the size of the size 00203 fseek(fp, 0, SEEK_END); 00204 int file_size = ftell(fp); 00205 if (file_size < 1) { 00206 fclose(fp); 00207 return false; 00208 } 00209 // adjust string size 00210 str->reserve(file_size); 00211 // read the contents 00212 rewind(fp); 00213 char *buff = new char[file_size]; 00214 if (buff == NULL) { 00215 fclose(fp); 00216 return false; 00217 } 00218 int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp); 00219 if (read_bytes == file_size) { 00220 str->append(buff, file_size); 00221 } 00222 delete []buff; 00223 fclose(fp); 00224 return (read_bytes == file_size); 00225 } 00226 00230 void CubeUtils::SplitStringUsing(const string &str, 00231 const string &delims, 00232 vector<string> *str_vec) { 00233 // Optimize the common case where delims is a single character. 
00234 if (delims[0] != '\0' && delims[1] == '\0') { 00235 char c = delims[0]; 00236 const char* p = str.data(); 00237 const char* end = p + str.size(); 00238 while (p != end) { 00239 if (*p == c) { 00240 ++p; 00241 } else { 00242 const char* start = p; 00243 while (++p != end && *p != c); 00244 str_vec->push_back(string(start, p - start)); 00245 } 00246 } 00247 return; 00248 } 00249 00250 string::size_type begin_index, end_index; 00251 begin_index = str.find_first_not_of(delims); 00252 while (begin_index != string::npos) { 00253 end_index = str.find_first_of(delims, begin_index); 00254 if (end_index == string::npos) { 00255 str_vec->push_back(str.substr(begin_index)); 00256 return; 00257 } 00258 str_vec->push_back(str.substr(begin_index, (end_index - begin_index))); 00259 begin_index = str.find_first_not_of(delims, end_index); 00260 } 00261 } 00262 00266 void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) { 00267 str32->clear(); 00268 int len = strlen(utf8_str); 00269 int step = 0; 00270 for (int ch = 0; ch < len; ch += step) { 00271 step = UNICHAR::utf8_step(utf8_str + ch); 00272 if (step > 0) { 00273 UNICHAR uni_ch(utf8_str + ch, step); 00274 (*str32) += uni_ch.first_uni(); 00275 } 00276 } 00277 } 00278 00282 void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) { 00283 str->clear(); 00284 for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) { 00285 UNICHAR uni_ch((*ch_32)); 00286 char *utf8 = uni_ch.utf8_str(); 00287 if (utf8 != NULL) { 00288 (*str) += utf8; 00289 delete []utf8; 00290 } 00291 } 00292 } 00293 00294 bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) { 00295 bool all_one_case = true; 00296 bool capitalized; 00297 bool prev_upper; 00298 bool prev_lower; 00299 bool first_upper; 00300 bool first_lower; 00301 bool cur_upper; 00302 bool cur_lower; 00303 00304 string str8; 00305 if (!char_set) { 00306 // If cube char_set is missing, use C-locale-dependent functions 00307 // on UTF8 characters 
to determine case properties. 00308 first_upper = isupper(str32[0]); 00309 first_lower = islower(str32[0]); 00310 if (first_upper) 00311 capitalized = true; 00312 prev_upper = first_upper; 00313 prev_lower = first_lower; 00314 for (int c = 1; str32[c] != 0; ++c) { 00315 cur_upper = isupper(str32[c]); 00316 cur_lower = islower(str32[c]); 00317 if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) 00318 all_one_case = false; 00319 if (cur_upper) 00320 capitalized = false; 00321 prev_upper = cur_upper; 00322 prev_lower = cur_lower; 00323 } 00324 } else { 00325 UNICHARSET *unicharset = char_set->InternalUnicharset(); 00326 // Use UNICHARSET functions to determine case properties 00327 first_upper = unicharset->get_isupper(char_set->ClassID(str32[0])); 00328 first_lower = unicharset->get_islower(char_set->ClassID(str32[0])); 00329 if (first_upper) 00330 capitalized = true; 00331 prev_upper = first_upper; 00332 prev_lower = first_lower; 00333 00334 for (int c = 1; c < StrLen(str32); ++c) { 00335 cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c])); 00336 cur_lower = unicharset->get_islower(char_set->ClassID(str32[c])); 00337 if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) 00338 all_one_case = false; 00339 if (cur_upper) 00340 capitalized = false; 00341 prev_upper = cur_upper; 00342 prev_lower = cur_lower; 00343 } 00344 } 00345 return all_one_case || capitalized; 00346 } 00347 00348 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) { 00349 if (!char_set) { 00350 return NULL; 00351 } 00352 UNICHARSET *unicharset = char_set->InternalUnicharset(); 00353 int len = StrLen(str32); 00354 char_32 *lower = new char_32[len + 1]; 00355 if (!lower) 00356 return NULL; 00357 for (int i = 0; i < len; ++i) { 00358 char_32 ch = str32[i]; 00359 if (ch == INVALID_UNICHAR_ID) { 00360 delete [] lower; 00361 return NULL; 00362 } 00363 // convert upper-case characters to lower-case 00364 if (unicharset->get_isupper(char_set->ClassID(ch))) { 00365 
UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch)); 00366 const char_32 *str32_lower = char_set->ClassString(uid_lower); 00367 // expect lower-case version of character to be a single character 00368 if (!str32_lower || StrLen(str32_lower) != 1) { 00369 delete [] lower; 00370 return NULL; 00371 } 00372 lower[i] = str32_lower[0]; 00373 } else { 00374 lower[i] = ch; 00375 } 00376 } 00377 lower[len] = 0; 00378 return lower; 00379 } 00380 00381 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) { 00382 if (!char_set) { 00383 return NULL; 00384 } 00385 UNICHARSET *unicharset = char_set->InternalUnicharset(); 00386 int len = StrLen(str32); 00387 char_32 *upper = new char_32[len + 1]; 00388 if (!upper) 00389 return NULL; 00390 for (int i = 0; i < len; ++i) { 00391 char_32 ch = str32[i]; 00392 if (ch == INVALID_UNICHAR_ID) { 00393 delete [] upper; 00394 return NULL; 00395 } 00396 // convert lower-case characters to upper-case 00397 if (unicharset->get_islower(char_set->ClassID(ch))) { 00398 UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch)); 00399 const char_32 *str32_upper = char_set->ClassString(uid_upper); 00400 // expect upper-case version of character to be a single character 00401 if (!str32_upper || StrLen(str32_upper) != 1) { 00402 delete [] upper; 00403 return NULL; 00404 } 00405 upper[i] = str32_upper[0]; 00406 } else { 00407 upper[i] = ch; 00408 } 00409 } 00410 upper[len] = 0; 00411 return upper; 00412 } 00413 } // namespace tesseract