tesseract 3.04.01

ccutil/strngs.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        strngs.c  (Formerly strings.c)
00003  * Description: STRING class functions.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Fri Feb 15 09:13:30 GMT 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "strngs.h"
00021 
00022 #include <assert.h>
00023 
00024 #include "genericvector.h"
00025 #include "helpers.h"
00026 #include "serialis.h"
00027 #include "tprintf.h"
00028 
00029 using tesseract::TFile;
00030 
// Size of buffer needed to host the decimal representation of the maximum
// possible length of an int (in 64 bits), being -<20 digits><nul> = 22.
const int kMaxIntSize = 22;
// Size of buffer needed to host the longest possible %.8g rendering of a
// double, e.g. -1.2345678e+308<nul>: sign + 1 digit + '.' + 7 digits + 'e' +
// exponent sign + 3 exponent digits + terminator = 16.
const int kMaxDoubleSize = 16;
00037 
00038 /**********************************************************************
00039  * STRING_HEADER provides metadata about the allocated buffer,
00040  * including total capacity and how much used (strlen with '\0').
00041  *
00042  * The implementation hides this header at the start of the data
00043  * buffer and appends the string on the end to keep sizeof(STRING)
00044  * unchanged from earlier versions so serialization is not affected.
00045  *
00046  * The collection of MACROS provide different implementations depending
00047  * on whether the string keeps track of its strlen or not so that this
00048  * feature can be added in later when consumers don't modify the string
00049  **********************************************************************/
00050 
// Capacity, in bytes of character storage, given to STRINGs that are created
// empty (default construction or construction/assignment from NULL).
const int kMinCapacity = 16;
00053 
00054 char* STRING::AllocData(int used, int capacity) {
00055   data_ = (STRING_HEADER *)alloc_string(capacity + sizeof(STRING_HEADER));
00056 
00057   // header is the metadata for this memory block
00058   STRING_HEADER* header = GetHeader();
00059   header->capacity_ = capacity;
00060   header->used_ = used;
00061   return GetCStr();
00062 }
00063 
00064 void STRING::DiscardData() {
00065   free_string((char *)data_);
00066 }
00067 
00068 // This is a private method; ensure FixHeader is called (or used_ is well defined)
00069 // beforehand
00070 char* STRING::ensure_cstr(inT32 min_capacity) {
00071   STRING_HEADER* orig_header = GetHeader();
00072   if (min_capacity <= orig_header->capacity_)
00073     return ((char *)this->data_) + sizeof(STRING_HEADER);
00074 
00075   // if we are going to grow bigger, than double our existing
00076   // size, but if that still is not big enough then keep the
00077   // requested capacity
00078   if (min_capacity < 2 * orig_header->capacity_)
00079     min_capacity = 2 * orig_header->capacity_;
00080 
00081   int alloc = sizeof(STRING_HEADER) + min_capacity;
00082   STRING_HEADER* new_header = (STRING_HEADER*)(alloc_string(alloc));
00083 
00084   memcpy(&new_header[1], GetCStr(), orig_header->used_);
00085   new_header->capacity_ = min_capacity;
00086   new_header->used_ = orig_header->used_;
00087 
00088   // free old memory, then rebind to new memory
00089   DiscardData();
00090   data_ = new_header;
00091 
00092   assert(InvariantOk());
00093   return ((char *)data_) + sizeof(STRING_HEADER);
00094 }
00095 
00096 // This is const, but is modifying a mutable field
00097 // this way it can be used on const or non-const instances.
00098 void STRING::FixHeader() const {
00099   const STRING_HEADER* header = GetHeader();
00100   if (header->used_ < 0)
00101     header->used_ = strlen(GetCStr()) + 1;
00102 }
00103 
00104 
00105 STRING::STRING() {
00106   // Empty STRINGs contain just the "\0".
00107   memcpy(AllocData(1, kMinCapacity), "", 1);
00108 }
00109 
00110 STRING::STRING(const STRING& str) {
00111   str.FixHeader();
00112   const STRING_HEADER* str_header  = str.GetHeader();
00113   int   str_used  = str_header->used_;
00114   char *this_cstr = AllocData(str_used, str_used);
00115   memcpy(this_cstr, str.GetCStr(), str_used);
00116   assert(InvariantOk());
00117 }
00118 
00119 STRING::STRING(const char* cstr) {
00120   if (cstr == NULL) {
00121     // Empty STRINGs contain just the "\0".
00122     memcpy(AllocData(1, kMinCapacity), "", 1);
00123   } else {
00124     int len = strlen(cstr) + 1;
00125     char* this_cstr = AllocData(len, len);
00126     memcpy(this_cstr, cstr, len);
00127   }
00128   assert(InvariantOk());
00129 }
00130 
00131 STRING::STRING(const char *data, int length) {
00132   if (data == NULL) {
00133     // Empty STRINGs contain just the "\0".
00134     memcpy(AllocData(1, kMinCapacity), "", 1);
00135   } else {
00136     char* this_cstr = AllocData(length + 1, length + 1);
00137     memcpy(this_cstr, data, length);
00138     this_cstr[length] = '\0';
00139   }
00140 }
00141 
00142 STRING::~STRING() {
00143   DiscardData();
00144 }
00145 
00146 // TODO(rays) Change all callers to use TFile and remove the old functions.
00147 // Writes to the given file. Returns false in case of error.
00148 bool STRING::Serialize(FILE* fp) const {
00149   inT32 len = length();
00150   if (fwrite(&len, sizeof(len), 1, fp) != 1) return false;
00151   if (static_cast<int>(fwrite(GetCStr(), 1, len, fp)) != len) return false;
00152   return true;
00153 }
00154 // Writes to the given file. Returns false in case of error.
00155 bool STRING::Serialize(TFile* fp) const {
00156   inT32 len = length();
00157   if (fp->FWrite(&len, sizeof(len), 1) != 1) return false;
00158   if (fp->FWrite(GetCStr(), 1, len) != len) return false;
00159   return true;
00160 }
00161 // Reads from the given file. Returns false in case of error.
00162 // If swap is true, assumes a big/little-endian swap is needed.
00163 bool STRING::DeSerialize(bool swap, FILE* fp) {
00164   inT32 len;
00165   if (fread(&len, sizeof(len), 1, fp) != 1) return false;
00166   if (swap)
00167     ReverseN(&len, sizeof(len));
00168   truncate_at(len);
00169   if (static_cast<int>(fread(GetCStr(), 1, len, fp)) != len) return false;
00170   return true;
00171 }
00172 // Reads from the given file. Returns false in case of error.
00173 // If swap is true, assumes a big/little-endian swap is needed.
00174 bool STRING::DeSerialize(bool swap, TFile* fp) {
00175   inT32 len;
00176   if (fp->FRead(&len, sizeof(len), 1) != 1) return false;
00177   if (swap)
00178     ReverseN(&len, sizeof(len));
00179   truncate_at(len);
00180   if (fp->FRead(GetCStr(), 1, len) != len) return false;
00181   return true;
00182 }
00183 
00184 BOOL8 STRING::contains(const char c) const {
00185   return (c != '\0') && (strchr (GetCStr(), c) != NULL);
00186 }
00187 
00188 inT32 STRING::length() const {
00189   FixHeader();
00190   return GetHeader()->used_ - 1;
00191 }
00192 
00193 const char* STRING::string() const {
00194   const STRING_HEADER* header = GetHeader();
00195   if (header->used_ == 0)
00196     return NULL;
00197 
00198   // mark header length unreliable because tesseract might
00199   // cast away the const and mutate the string directly.
00200   header->used_ = -1;
00201   return GetCStr();
00202 }
00203 
00204 const char* STRING::c_str() const {
00205   return string();
00206 }
00207 
00208 /******
00209  * The STRING_IS_PROTECTED interface adds additional support to migrate
00210  * code that needs to modify the STRING in ways not otherwise supported
00211  * without violating encapsulation.
00212  *
00213  * Also makes the [] operator return a const so it is immutable
00214  */
00215 #if STRING_IS_PROTECTED
00216 const char& STRING::operator[](inT32 index) const {
00217   return GetCStr()[index];
00218 }
00219 
00220 void STRING::insert_range(inT32 index, const char* str, int len) {
00221   // if index is outside current range, then also grow size of string
00222   // to accmodate the requested range.
00223   STRING_HEADER* this_header = GetHeader();
00224   int used = this_header->used_;
00225   if (index > used)
00226     used = index;
00227 
00228   char* this_cstr = ensure_cstr(used + len + 1);
00229   if (index < used) {
00230     // move existing string from index to '\0' inclusive.
00231     memmove(this_cstr + index + len,
00232            this_cstr + index,
00233            this_header->used_ - index);
00234   } else if (len > 0) {
00235     // We are going to overwrite previous null terminator, so write the new one.
00236     this_cstr[this_header->used_ + len - 1] = '\0';
00237 
00238     // If the old header did not have the terminator,
00239     // then we need to account for it now that we've added it.
00240     // Otherwise it was already accounted for; we just moved it.
00241     if (this_header->used_ == 0)
00242       ++this_header->used_;
00243   }
00244 
00245   // Write new string to index.
00246   // The string is already terminated from the conditions above.
00247   memcpy(this_cstr + index, str, len);
00248   this_header->used_ += len;
00249 
00250   assert(InvariantOk());
00251 }
00252 
00253 void STRING::erase_range(inT32 index, int len) {
00254   char* this_cstr = GetCStr();
00255   STRING_HEADER* this_header = GetHeader();
00256 
00257   memcpy(this_cstr+index, this_cstr+index+len,
00258          this_header->used_ - index - len);
00259   this_header->used_ -= len;
00260   assert(InvariantOk());
00261 }
00262 
00263 #else
00264 void STRING::truncate_at(inT32 index) {
00265   ASSERT_HOST(index >= 0);
00266   FixHeader();
00267   char* this_cstr = ensure_cstr(index + 1);
00268   this_cstr[index] = '\0';
00269   GetHeader()->used_ = index + 1;
00270   assert(InvariantOk());
00271 }
00272 
00273 char& STRING::operator[](inT32 index) const {
00274   // Code is casting away this const and mutating the string,
00275   // so mark used_ as -1 to flag it unreliable.
00276   GetHeader()->used_ = -1;
00277   return ((char *)GetCStr())[index];
00278 }
00279 #endif
00280 
00281 void STRING::split(const char c, GenericVector<STRING> *splited) {
00282   int start_index = 0;
00283   int len = length();
00284   for (int i = 0; i < len; i++) {
00285     if ((*this)[i] == c) {
00286       if (i != start_index) {
00287         (*this)[i] = '\0';
00288         splited->push_back(STRING(GetCStr() + start_index, i - start_index));
00289         (*this)[i] = c;
00290       }
00291       start_index = i + 1;
00292     }
00293   }
00294 
00295   if (len != start_index) {
00296     splited->push_back(STRING(GetCStr() + start_index, len - start_index));
00297   }
00298 }
00299 
00300 BOOL8 STRING::operator==(const STRING& str) const {
00301   FixHeader();
00302   str.FixHeader();
00303   const STRING_HEADER* str_header = str.GetHeader();
00304   const STRING_HEADER* this_header = GetHeader();
00305   int this_used = this_header->used_;
00306   int str_used  = str_header->used_;
00307 
00308   return (this_used == str_used)
00309           && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0);
00310 }
00311 
00312 BOOL8 STRING::operator!=(const STRING& str) const {
00313   FixHeader();
00314   str.FixHeader();
00315   const STRING_HEADER* str_header = str.GetHeader();
00316   const STRING_HEADER* this_header = GetHeader();
00317   int this_used = this_header->used_;
00318   int str_used  = str_header->used_;
00319 
00320   return (this_used != str_used)
00321          || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0);
00322 }
00323 
00324 BOOL8 STRING::operator!=(const char* cstr) const {
00325   FixHeader();
00326   const STRING_HEADER* this_header = GetHeader();
00327 
00328   if (cstr == NULL)
00329     return this_header->used_ > 1;  // either '\0' or NULL
00330   else {
00331     inT32 length = strlen(cstr) + 1;
00332     return (this_header->used_ != length)
00333             || (memcmp(GetCStr(), cstr, length) != 0);
00334   }
00335 }
00336 
00337 STRING& STRING::operator=(const STRING& str) {
00338   str.FixHeader();
00339   const STRING_HEADER* str_header = str.GetHeader();
00340   int   str_used = str_header->used_;
00341 
00342   GetHeader()->used_ = 0;  // clear since ensure doesn't need to copy data
00343   char* this_cstr = ensure_cstr(str_used);
00344   STRING_HEADER* this_header = GetHeader();
00345 
00346   memcpy(this_cstr, str.GetCStr(), str_used);
00347   this_header->used_ = str_used;
00348 
00349   assert(InvariantOk());
00350   return *this;
00351 }
00352 
00353 STRING & STRING::operator+=(const STRING& str) {
00354   FixHeader();
00355   str.FixHeader();
00356   const STRING_HEADER* str_header = str.GetHeader();
00357   const char* str_cstr = str.GetCStr();
00358   int  str_used  = str_header->used_;
00359   int  this_used = GetHeader()->used_;
00360   char* this_cstr = ensure_cstr(this_used + str_used);
00361 
00362   STRING_HEADER* this_header = GetHeader();  // after ensure for realloc
00363 
00364   if (this_used > 1) {
00365     memcpy(this_cstr + this_used - 1, str_cstr, str_used);
00366     this_header->used_ += str_used - 1;  // overwrite '\0'
00367   } else {
00368     memcpy(this_cstr, str_cstr, str_used);
00369     this_header->used_ = str_used;
00370   }
00371 
00372   assert(InvariantOk());
00373   return *this;
00374 }
00375 
00376 void STRING::add_str_int(const char* str, int number) {
00377   if (str != NULL)
00378     *this += str;
00379   // Allow space for the maximum possible length of inT64.
00380   char num_buffer[kMaxIntSize];
00381   snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
00382   num_buffer[kMaxIntSize - 1] = '\0';
00383   *this += num_buffer;
00384 }
00385 // Appends the given string and double (as a %.8g) to this.
00386 void STRING::add_str_double(const char* str, double number) {
00387   if (str != NULL)
00388     *this += str;
00389   // Allow space for the maximum possible length of %8g.
00390   char num_buffer[kMaxDoubleSize];
00391   snprintf(num_buffer, kMaxDoubleSize - 1, "%.8g", number);
00392   num_buffer[kMaxDoubleSize - 1] = '\0';
00393   *this += num_buffer;
00394 }
00395 
00396 STRING & STRING::operator=(const char* cstr) {
00397   STRING_HEADER* this_header = GetHeader();
00398   if (cstr) {
00399     int len = strlen(cstr) + 1;
00400 
00401     this_header->used_ = 0;  // don't bother copying data if need to realloc
00402     char* this_cstr = ensure_cstr(len);
00403     this_header = GetHeader();  // for realloc
00404     memcpy(this_cstr, cstr, len);
00405     this_header->used_ = len;
00406   } else {
00407     // Reallocate to same state as default constructor.
00408     DiscardData();
00409     // Empty STRINGs contain just the "\0".
00410     memcpy(AllocData(1, kMinCapacity), "", 1);
00411   }
00412 
00413   assert(InvariantOk());
00414   return *this;
00415 }
00416 
00417 void STRING::assign(const char *cstr, int len) {
00418   STRING_HEADER* this_header = GetHeader();
00419   this_header->used_ = 0;  // don't bother copying data if need to realloc
00420   char* this_cstr = ensure_cstr(len + 1);  // +1 for '\0'
00421 
00422   this_header = GetHeader();  // for realloc
00423   memcpy(this_cstr, cstr, len);
00424   this_cstr[len] = '\0';
00425   this_header->used_ = len + 1;
00426 
00427   assert(InvariantOk());
00428 }
00429 
00430 STRING STRING::operator+(const STRING& str) const {
00431   STRING result(*this);
00432   result += str;
00433 
00434   assert(InvariantOk());
00435   return result;
00436 }
00437 
00438 
00439 STRING STRING::operator+(const char ch) const {
00440   STRING result;
00441   FixHeader();
00442   const STRING_HEADER* this_header = GetHeader();
00443   int this_used = this_header->used_;
00444   char* result_cstr = result.ensure_cstr(this_used + 1);
00445   STRING_HEADER* result_header = result.GetHeader();
00446   int result_used = result_header->used_;
00447 
00448   // copies '\0' but we'll overwrite that
00449   memcpy(result_cstr, GetCStr(), this_used);
00450   result_cstr[result_used] = ch;      // overwrite old '\0'
00451   result_cstr[result_used + 1] = '\0';  // append on '\0'
00452   ++result_header->used_;
00453 
00454   assert(InvariantOk());
00455   return result;
00456 }
00457 
00458 
00459 STRING&  STRING::operator+=(const char *str) {
00460   if (!str || !*str)  // empty string has no effect
00461     return *this;
00462 
00463   FixHeader();
00464   int len = strlen(str) + 1;
00465   int this_used = GetHeader()->used_;
00466   char* this_cstr = ensure_cstr(this_used + len);
00467   STRING_HEADER* this_header = GetHeader();  // after ensure for realloc
00468 
00469   // if we had non-empty string then append overwriting old '\0'
00470   // otherwise replace
00471   if (this_used > 0) {
00472     memcpy(this_cstr + this_used - 1, str, len);
00473     this_header->used_ += len - 1;
00474   } else {
00475     memcpy(this_cstr, str, len);
00476     this_header->used_ = len;
00477   }
00478 
00479   assert(InvariantOk());
00480   return *this;
00481 }
00482 
00483 
00484 STRING& STRING::operator+=(const char ch) {
00485   if (ch == '\0')
00486     return *this;
00487 
00488   FixHeader();
00489   int   this_used = GetHeader()->used_;
00490   char* this_cstr = ensure_cstr(this_used + 1);
00491   STRING_HEADER* this_header = GetHeader();
00492 
00493   if (this_used > 0)
00494     --this_used; // undo old empty null if there was one
00495 
00496   this_cstr[this_used++] = ch;   // append ch to end
00497   this_cstr[this_used++] = '\0'; // append '\0' after ch
00498   this_header->used_ = this_used;
00499 
00500   assert(InvariantOk());
00501   return *this;
00502 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines