tesseract 3.04.01

cube/cube_utils.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        cube_utils.cpp
00003  * Description: Implementation of the Cube Utilities Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include <math.h>
00021 #include <string>
00022 #include <vector>
00023 #include "cube_utils.h"
00024 #include "char_set.h"
00025 #include "unichar.h"
00026 
00027 namespace tesseract {
00028 CubeUtils::CubeUtils() {
00029 }
00030 
00031 CubeUtils::~CubeUtils() {
00032 }
00033 
00037 int CubeUtils::Prob2Cost(double prob_val) {
00038   if (prob_val < MIN_PROB)   {
00039     return MIN_PROB_COST;
00040   }
00041   return static_cast<int>(-log(prob_val) * PROB2COST_SCALE);
00042 }
00043 
00047 double CubeUtils::Cost2Prob(int cost) {
00048   return exp(-cost / PROB2COST_SCALE);
00049 }
00050 
00054 int CubeUtils::StrLen(const char_32 *char_32_ptr) {
00055   if (char_32_ptr == NULL) {
00056     return 0;
00057   }
00058   int len = -1;
00059   while (char_32_ptr[++len]);
00060   return len;
00061 }
00062 
00066 int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) {
00067   const char_32 *pch1 = str1;
00068   const char_32 *pch2 = str2;
00069 
00070   for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) {
00071     if ((*pch1) != (*pch2)) {
00072       return (*pch1) - (*pch2);
00073     }
00074   }
00075 
00076   if ((*pch1) == 0) {
00077     if ((*pch2) == 0) {
00078       return 0;
00079     } else {
00080       return -1;
00081     }
00082   } else {
00083     return 1;
00084   }
00085 }
00086 
00090 char_32 *CubeUtils::StrDup(const char_32 *str32) {
00091   int len = StrLen(str32);
00092   char_32 *new_str = new char_32[len + 1];
00093   if (new_str == NULL) {
00094     return NULL;
00095   }
00096   memcpy(new_str, str32, len * sizeof(*str32));
00097   new_str[len] = 0;
00098   return new_str;
00099 }
00100 
00104 CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top,
00105                                        int wid, int hgt) {
00106   // get the raw img data from the image
00107   unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt);
00108   if (temp_buff == NULL) {
00109     return NULL;
00110   }
00111 
00112   // create a char samp from temp buffer
00113   CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff);
00114 
00115   // clean up temp buffer
00116   delete []temp_buff;
00117   return char_samp;
00118 }
00119 
00123 Pix *CubeUtils::PixFromCharSample(CharSamp *char_samp) {
00124   // parameter check
00125   if (char_samp == NULL) {
00126     return NULL;
00127   }
00128 
00129   // get the raw data
00130   int stride = char_samp->Stride();
00131   int wid = char_samp->Width();
00132   int hgt = char_samp->Height();
00133 
00134   Pix *pix = pixCreate(wid, hgt, 1);
00135   if (pix == NULL) {
00136     return NULL;
00137   }
00138 
00139   // copy the contents
00140   unsigned char *line = char_samp->RawData();
00141   for (int y = 0; y < hgt ; y++, line += stride) {
00142     for (int x = 0; x < wid; x++) {
00143       if (line[x] != 0) {
00144         pixSetPixel(pix, x, y, 0);
00145       } else {
00146         pixSetPixel(pix, x, y, 255);
00147       }
00148     }
00149   }
00150 
00151   return pix;
00152 }
00153 
00157 unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top,
00158                                        int wid, int hgt) {
00159   // skip invalid dimensions
00160   if (left < 0 || top < 0 || wid < 0 || hgt < 0 ||
00161       (left + wid) > pix->w || (top + hgt) > pix->h ||
00162       pix->d != 1) {
00163     return NULL;
00164   }
00165 
00166   // copy the char img to a temp buffer
00167   unsigned char *temp_buff = new unsigned char[wid * hgt];
00168   if (temp_buff == NULL) {
00169     return NULL;
00170   }
00171   l_int32 w;
00172   l_int32 h;
00173   l_int32 d;
00174   l_int32 wpl;
00175   l_uint32 *line;
00176   l_uint32 *data;
00177 
00178   pixGetDimensions(pix, &w, &h, &d);
00179   wpl = pixGetWpl(pix);
00180   data = pixGetData(pix);
00181   line = data + (top * wpl);
00182 
00183   for (int y = 0, off = 0; y < hgt ; y++) {
00184     for (int x = 0; x < wid; x++, off++) {
00185       temp_buff[off] = GET_DATA_BIT(line, x + left) ? 0 : 255;
00186     }
00187     line += wpl;
00188   }
00189   return temp_buff;
00190 }
00191 
00195 bool CubeUtils::ReadFileToString(const string &file_name, string *str) {
00196   str->clear();
00197   FILE *fp = fopen(file_name.c_str(), "rb");
00198   if (fp == NULL) {
00199     return false;
00200   }
00201 
00202   // get the size of the size
00203   fseek(fp, 0, SEEK_END);
00204   int file_size = ftell(fp);
00205   if (file_size < 1) {
00206     fclose(fp);
00207     return false;
00208   }
00209   // adjust string size
00210   str->reserve(file_size);
00211   // read the contents
00212   rewind(fp);
00213   char *buff = new char[file_size];
00214   if (buff == NULL) {
00215     fclose(fp);
00216     return false;
00217   }
00218   int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp);
00219   if (read_bytes == file_size) {
00220     str->append(buff, file_size);
00221   }
00222   delete []buff;
00223   fclose(fp);
00224   return (read_bytes == file_size);
00225 }
00226 
00230 void CubeUtils::SplitStringUsing(const string &str,
00231                                  const string &delims,
00232                                  vector<string> *str_vec) {
00233   // Optimize the common case where delims is a single character.
00234   if (delims[0] != '\0' && delims[1] == '\0') {
00235     char c = delims[0];
00236     const char* p = str.data();
00237     const char* end = p + str.size();
00238     while (p != end) {
00239       if (*p == c) {
00240         ++p;
00241       } else {
00242         const char* start = p;
00243         while (++p != end && *p != c);
00244         str_vec->push_back(string(start, p - start));
00245       }
00246     }
00247     return;
00248   }
00249 
00250   string::size_type begin_index, end_index;
00251   begin_index = str.find_first_not_of(delims);
00252   while (begin_index != string::npos) {
00253     end_index = str.find_first_of(delims, begin_index);
00254     if (end_index == string::npos) {
00255       str_vec->push_back(str.substr(begin_index));
00256       return;
00257     }
00258     str_vec->push_back(str.substr(begin_index, (end_index - begin_index)));
00259     begin_index = str.find_first_not_of(delims, end_index);
00260   }
00261 }
00262 
00266 void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) {
00267   str32->clear();
00268   int len = strlen(utf8_str);
00269   int step = 0;
00270   for (int ch = 0; ch < len; ch += step) {
00271     step = UNICHAR::utf8_step(utf8_str + ch);
00272     if (step > 0) {
00273       UNICHAR uni_ch(utf8_str + ch, step);
00274       (*str32) += uni_ch.first_uni();
00275     }
00276   }
00277 }
00278 
00282 void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
00283   str->clear();
00284   for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++)  {
00285     UNICHAR uni_ch((*ch_32));
00286     char *utf8 = uni_ch.utf8_str();
00287     if (utf8 != NULL) {
00288       (*str) += utf8;
00289       delete []utf8;
00290     }
00291   }
00292 }
00293 
00294 bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
00295   bool all_one_case = true;
00296   bool capitalized;
00297   bool prev_upper;
00298   bool prev_lower;
00299   bool first_upper;
00300   bool first_lower;
00301   bool cur_upper;
00302   bool cur_lower;
00303 
00304   string str8;
00305   if (!char_set) {
00306     // If cube char_set is missing, use C-locale-dependent functions
00307     // on UTF8 characters to determine case properties.
00308     first_upper = isupper(str32[0]);
00309     first_lower = islower(str32[0]);
00310     if (first_upper)
00311       capitalized = true;
00312     prev_upper = first_upper;
00313     prev_lower = first_lower;
00314     for (int c = 1; str32[c] != 0; ++c) {
00315       cur_upper = isupper(str32[c]);
00316       cur_lower = islower(str32[c]);
00317       if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
00318         all_one_case = false;
00319       if (cur_upper)
00320         capitalized = false;
00321       prev_upper = cur_upper;
00322       prev_lower = cur_lower;
00323     }
00324   } else {
00325     UNICHARSET *unicharset = char_set->InternalUnicharset();
00326     // Use UNICHARSET functions to determine case properties
00327     first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
00328     first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
00329     if (first_upper)
00330       capitalized = true;
00331     prev_upper = first_upper;
00332     prev_lower = first_lower;
00333 
00334     for (int c = 1; c < StrLen(str32); ++c) {
00335       cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
00336       cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
00337       if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
00338         all_one_case = false;
00339       if (cur_upper)
00340         capitalized = false;
00341       prev_upper = cur_upper;
00342       prev_lower = cur_lower;
00343     }
00344   }
00345   return all_one_case || capitalized;
00346 }
00347 
00348 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
00349   if (!char_set) {
00350     return NULL;
00351   }
00352   UNICHARSET *unicharset = char_set->InternalUnicharset();
00353   int len = StrLen(str32);
00354   char_32 *lower = new char_32[len + 1];
00355   if (!lower)
00356     return NULL;
00357   for (int i = 0; i < len; ++i) {
00358     char_32 ch = str32[i];
00359     if (ch == INVALID_UNICHAR_ID) {
00360       delete [] lower;
00361       return NULL;
00362     }
00363     // convert upper-case characters to lower-case
00364     if (unicharset->get_isupper(char_set->ClassID(ch))) {
00365       UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
00366       const char_32 *str32_lower = char_set->ClassString(uid_lower);
00367       // expect lower-case version of character to be a single character
00368       if (!str32_lower || StrLen(str32_lower) != 1) {
00369         delete [] lower;
00370         return NULL;
00371       }
00372       lower[i] = str32_lower[0];
00373     } else {
00374       lower[i] = ch;
00375     }
00376   }
00377   lower[len] = 0;
00378   return lower;
00379 }
00380 
00381 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
00382   if (!char_set) {
00383     return NULL;
00384   }
00385   UNICHARSET *unicharset = char_set->InternalUnicharset();
00386   int len = StrLen(str32);
00387   char_32 *upper = new char_32[len + 1];
00388   if (!upper)
00389     return NULL;
00390   for (int i = 0; i < len; ++i) {
00391     char_32 ch = str32[i];
00392     if (ch == INVALID_UNICHAR_ID) {
00393       delete [] upper;
00394       return NULL;
00395     }
00396     // convert lower-case characters to upper-case
00397     if (unicharset->get_islower(char_set->ClassID(ch))) {
00398       UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
00399       const char_32 *str32_upper = char_set->ClassString(uid_upper);
00400       // expect upper-case version of character to be a single character
00401       if (!str32_upper || StrLen(str32_upper) != 1) {
00402         delete [] upper;
00403         return NULL;
00404       }
00405       upper[i] = str32_upper[0];
00406     } else {
00407       upper[i] = ch;
00408     }
00409   }
00410   upper[len] = 0;
00411   return upper;
00412 }
00413 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines