|
tesseract 3.04.01
|
00001 /********************************************************************** 00002 * File: strngs.c (Formerly strings.c) 00003 * Description: STRING class functions. 00004 * Author: Ray Smith 00005 * Created: Fri Feb 15 09:13:30 GMT 1991 00006 * 00007 * (C) Copyright 1991, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "strngs.h" 00021 00022 #include <assert.h> 00023 00024 #include "genericvector.h" 00025 #include "helpers.h" 00026 #include "serialis.h" 00027 #include "tprintf.h" 00028 00029 using tesseract::TFile; 00030 00031 // Size of buffer needed to host the decimal representation of the maximum 00032 // possible length of an int (in 64 bits), being -<20 digits>. 00033 const int kMaxIntSize = 22; 00034 // Size of buffer needed to host the decimal representation of the maximum 00035 // possible length of a %.8g being -0.12345678e+999<nul> = 15. 00036 const int kMaxDoubleSize = 15; 00037 00038 /********************************************************************** 00039 * STRING_HEADER provides metadata about the allocated buffer, 00040 * including total capacity and how much used (strlen with '\0'). 00041 * 00042 * The implementation hides this header at the start of the data 00043 * buffer and appends the string on the end to keep sizeof(STRING) 00044 * unchanged from earlier versions so serialization is not affected. 00045 * 00046 * The collection of MACROS provide different implementations depending 00047 * on whether the string keeps track of its strlen or not so that this 00048 * feature can be added in later when consumers don't modify the string 00049 **********************************************************************/ 00050 00051 // Smallest string to allocate by default 00052 const int kMinCapacity = 16; 00053 00054 char* STRING::AllocData(int used, int capacity) { 00055 data_ = (STRING_HEADER *)alloc_string(capacity + sizeof(STRING_HEADER)); 00056 00057 // header is the metadata for this memory block 00058 STRING_HEADER* header = GetHeader(); 00059 header->capacity_ = capacity; 00060 header->used_ = used; 00061 return GetCStr(); 00062 } 00063 00064 void STRING::DiscardData() { 00065 free_string((char *)data_); 00066 } 00067 00068 // This is a private method; ensure FixHeader is called (or used_ is well defined) 00069 // beforehand 00070 char* STRING::ensure_cstr(inT32 min_capacity) { 00071 STRING_HEADER* orig_header = GetHeader(); 00072 if (min_capacity <= orig_header->capacity_) 00073 return ((char *)this->data_) + sizeof(STRING_HEADER); 00074 00075 // if we are going to grow bigger, than double our existing 00076 // size, but if that still is not big enough then keep the 00077 // requested capacity 00078 if (min_capacity < 2 * orig_header->capacity_) 00079 min_capacity = 2 * orig_header->capacity_; 00080 00081 int alloc = sizeof(STRING_HEADER) + min_capacity; 00082 STRING_HEADER* new_header = (STRING_HEADER*)(alloc_string(alloc)); 00083 00084 memcpy(&new_header[1], GetCStr(), orig_header->used_); 00085 new_header->capacity_ = min_capacity; 00086 new_header->used_ = orig_header->used_; 00087 00088 // free old memory, then rebind to new memory 00089 DiscardData(); 00090 data_ = new_header; 00091 00092 assert(InvariantOk()); 00093 return ((char *)data_) + sizeof(STRING_HEADER); 00094 } 00095 00096 // This is const, but is modifying a mutable field 00097 // this way it can be used on const or non-const instances. 00098 void STRING::FixHeader() const { 00099 const STRING_HEADER* header = GetHeader(); 00100 if (header->used_ < 0) 00101 header->used_ = strlen(GetCStr()) + 1; 00102 } 00103 00104 00105 STRING::STRING() { 00106 // Empty STRINGs contain just the "\0". 00107 memcpy(AllocData(1, kMinCapacity), "", 1); 00108 } 00109 00110 STRING::STRING(const STRING& str) { 00111 str.FixHeader(); 00112 const STRING_HEADER* str_header = str.GetHeader(); 00113 int str_used = str_header->used_; 00114 char *this_cstr = AllocData(str_used, str_used); 00115 memcpy(this_cstr, str.GetCStr(), str_used); 00116 assert(InvariantOk()); 00117 } 00118 00119 STRING::STRING(const char* cstr) { 00120 if (cstr == NULL) { 00121 // Empty STRINGs contain just the "\0". 00122 memcpy(AllocData(1, kMinCapacity), "", 1); 00123 } else { 00124 int len = strlen(cstr) + 1; 00125 char* this_cstr = AllocData(len, len); 00126 memcpy(this_cstr, cstr, len); 00127 } 00128 assert(InvariantOk()); 00129 } 00130 00131 STRING::STRING(const char *data, int length) { 00132 if (data == NULL) { 00133 // Empty STRINGs contain just the "\0". 00134 memcpy(AllocData(1, kMinCapacity), "", 1); 00135 } else { 00136 char* this_cstr = AllocData(length + 1, length + 1); 00137 memcpy(this_cstr, data, length); 00138 this_cstr[length] = '\0'; 00139 } 00140 } 00141 00142 STRING::~STRING() { 00143 DiscardData(); 00144 } 00145 00146 // TODO(rays) Change all callers to use TFile and remove the old functions. 00147 // Writes to the given file. Returns false in case of error. 00148 bool STRING::Serialize(FILE* fp) const { 00149 inT32 len = length(); 00150 if (fwrite(&len, sizeof(len), 1, fp) != 1) return false; 00151 if (static_cast<int>(fwrite(GetCStr(), 1, len, fp)) != len) return false; 00152 return true; 00153 } 00154 // Writes to the given file. Returns false in case of error. 00155 bool STRING::Serialize(TFile* fp) const { 00156 inT32 len = length(); 00157 if (fp->FWrite(&len, sizeof(len), 1) != 1) return false; 00158 if (fp->FWrite(GetCStr(), 1, len) != len) return false; 00159 return true; 00160 } 00161 // Reads from the given file. Returns false in case of error. 00162 // If swap is true, assumes a big/little-endian swap is needed. 00163 bool STRING::DeSerialize(bool swap, FILE* fp) { 00164 inT32 len; 00165 if (fread(&len, sizeof(len), 1, fp) != 1) return false; 00166 if (swap) 00167 ReverseN(&len, sizeof(len)); 00168 truncate_at(len); 00169 if (static_cast<int>(fread(GetCStr(), 1, len, fp)) != len) return false; 00170 return true; 00171 } 00172 // Reads from the given file. Returns false in case of error. 00173 // If swap is true, assumes a big/little-endian swap is needed. 00174 bool STRING::DeSerialize(bool swap, TFile* fp) { 00175 inT32 len; 00176 if (fp->FRead(&len, sizeof(len), 1) != 1) return false; 00177 if (swap) 00178 ReverseN(&len, sizeof(len)); 00179 truncate_at(len); 00180 if (fp->FRead(GetCStr(), 1, len) != len) return false; 00181 return true; 00182 } 00183 00184 BOOL8 STRING::contains(const char c) const { 00185 return (c != '\0') && (strchr (GetCStr(), c) != NULL); 00186 } 00187 00188 inT32 STRING::length() const { 00189 FixHeader(); 00190 return GetHeader()->used_ - 1; 00191 } 00192 00193 const char* STRING::string() const { 00194 const STRING_HEADER* header = GetHeader(); 00195 if (header->used_ == 0) 00196 return NULL; 00197 00198 // mark header length unreliable because tesseract might 00199 // cast away the const and mutate the string directly. 00200 header->used_ = -1; 00201 return GetCStr(); 00202 } 00203 00204 const char* STRING::c_str() const { 00205 return string(); 00206 } 00207 00208 /****** 00209 * The STRING_IS_PROTECTED interface adds additional support to migrate 00210 * code that needs to modify the STRING in ways not otherwise supported 00211 * without violating encapsulation. 00212 * 00213 * Also makes the [] operator return a const so it is immutable 00214 */ 00215 #if STRING_IS_PROTECTED 00216 const char& STRING::operator[](inT32 index) const { 00217 return GetCStr()[index]; 00218 } 00219 00220 void STRING::insert_range(inT32 index, const char* str, int len) { 00221 // if index is outside current range, then also grow size of string 00222 // to accmodate the requested range. 00223 STRING_HEADER* this_header = GetHeader(); 00224 int used = this_header->used_; 00225 if (index > used) 00226 used = index; 00227 00228 char* this_cstr = ensure_cstr(used + len + 1); 00229 if (index < used) { 00230 // move existing string from index to '\0' inclusive. 00231 memmove(this_cstr + index + len, 00232 this_cstr + index, 00233 this_header->used_ - index); 00234 } else if (len > 0) { 00235 // We are going to overwrite previous null terminator, so write the new one. 00236 this_cstr[this_header->used_ + len - 1] = '\0'; 00237 00238 // If the old header did not have the terminator, 00239 // then we need to account for it now that we've added it. 00240 // Otherwise it was already accounted for; we just moved it. 00241 if (this_header->used_ == 0) 00242 ++this_header->used_; 00243 } 00244 00245 // Write new string to index. 00246 // The string is already terminated from the conditions above. 00247 memcpy(this_cstr + index, str, len); 00248 this_header->used_ += len; 00249 00250 assert(InvariantOk()); 00251 } 00252 00253 void STRING::erase_range(inT32 index, int len) { 00254 char* this_cstr = GetCStr(); 00255 STRING_HEADER* this_header = GetHeader(); 00256 00257 memcpy(this_cstr+index, this_cstr+index+len, 00258 this_header->used_ - index - len); 00259 this_header->used_ -= len; 00260 assert(InvariantOk()); 00261 } 00262 00263 #else 00264 void STRING::truncate_at(inT32 index) { 00265 ASSERT_HOST(index >= 0); 00266 FixHeader(); 00267 char* this_cstr = ensure_cstr(index + 1); 00268 this_cstr[index] = '\0'; 00269 GetHeader()->used_ = index + 1; 00270 assert(InvariantOk()); 00271 } 00272 00273 char& STRING::operator[](inT32 index) const { 00274 // Code is casting away this const and mutating the string, 00275 // so mark used_ as -1 to flag it unreliable. 00276 GetHeader()->used_ = -1; 00277 return ((char *)GetCStr())[index]; 00278 } 00279 #endif 00280 00281 void STRING::split(const char c, GenericVector<STRING> *splited) { 00282 int start_index = 0; 00283 int len = length(); 00284 for (int i = 0; i < len; i++) { 00285 if ((*this)[i] == c) { 00286 if (i != start_index) { 00287 (*this)[i] = '\0'; 00288 splited->push_back(STRING(GetCStr() + start_index, i - start_index)); 00289 (*this)[i] = c; 00290 } 00291 start_index = i + 1; 00292 } 00293 } 00294 00295 if (len != start_index) { 00296 splited->push_back(STRING(GetCStr() + start_index, len - start_index)); 00297 } 00298 } 00299 00300 BOOL8 STRING::operator==(const STRING& str) const { 00301 FixHeader(); 00302 str.FixHeader(); 00303 const STRING_HEADER* str_header = str.GetHeader(); 00304 const STRING_HEADER* this_header = GetHeader(); 00305 int this_used = this_header->used_; 00306 int str_used = str_header->used_; 00307 00308 return (this_used == str_used) 00309 && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0); 00310 } 00311 00312 BOOL8 STRING::operator!=(const STRING& str) const { 00313 FixHeader(); 00314 str.FixHeader(); 00315 const STRING_HEADER* str_header = str.GetHeader(); 00316 const STRING_HEADER* this_header = GetHeader(); 00317 int this_used = this_header->used_; 00318 int str_used = str_header->used_; 00319 00320 return (this_used != str_used) 00321 || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0); 00322 } 00323 00324 BOOL8 STRING::operator!=(const char* cstr) const { 00325 FixHeader(); 00326 const STRING_HEADER* this_header = GetHeader(); 00327 00328 if (cstr == NULL) 00329 return this_header->used_ > 1; // either '\0' or NULL 00330 else { 00331 inT32 length = strlen(cstr) + 1; 00332 return (this_header->used_ != length) 00333 || (memcmp(GetCStr(), cstr, length) != 0); 00334 } 00335 } 00336 00337 STRING& STRING::operator=(const STRING& str) { 00338 str.FixHeader(); 00339 const STRING_HEADER* str_header = str.GetHeader(); 00340 int str_used = str_header->used_; 00341 00342 GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data 00343 char* this_cstr = ensure_cstr(str_used); 00344 STRING_HEADER* this_header = GetHeader(); 00345 00346 memcpy(this_cstr, str.GetCStr(), str_used); 00347 this_header->used_ = str_used; 00348 00349 assert(InvariantOk()); 00350 return *this; 00351 } 00352 00353 STRING & STRING::operator+=(const STRING& str) { 00354 FixHeader(); 00355 str.FixHeader(); 00356 const STRING_HEADER* str_header = str.GetHeader(); 00357 const char* str_cstr = str.GetCStr(); 00358 int str_used = str_header->used_; 00359 int this_used = GetHeader()->used_; 00360 char* this_cstr = ensure_cstr(this_used + str_used); 00361 00362 STRING_HEADER* this_header = GetHeader(); // after ensure for realloc 00363 00364 if (this_used > 1) { 00365 memcpy(this_cstr + this_used - 1, str_cstr, str_used); 00366 this_header->used_ += str_used - 1; // overwrite '\0' 00367 } else { 00368 memcpy(this_cstr, str_cstr, str_used); 00369 this_header->used_ = str_used; 00370 } 00371 00372 assert(InvariantOk()); 00373 return *this; 00374 } 00375 00376 void STRING::add_str_int(const char* str, int number) { 00377 if (str != NULL) 00378 *this += str; 00379 // Allow space for the maximum possible length of inT64. 00380 char num_buffer[kMaxIntSize]; 00381 snprintf(num_buffer, kMaxIntSize - 1, "%d", number); 00382 num_buffer[kMaxIntSize - 1] = '\0'; 00383 *this += num_buffer; 00384 } 00385 // Appends the given string and double (as a %.8g) to this. 00386 void STRING::add_str_double(const char* str, double number) { 00387 if (str != NULL) 00388 *this += str; 00389 // Allow space for the maximum possible length of %8g. 00390 char num_buffer[kMaxDoubleSize]; 00391 snprintf(num_buffer, kMaxDoubleSize - 1, "%.8g", number); 00392 num_buffer[kMaxDoubleSize - 1] = '\0'; 00393 *this += num_buffer; 00394 } 00395 00396 STRING & STRING::operator=(const char* cstr) { 00397 STRING_HEADER* this_header = GetHeader(); 00398 if (cstr) { 00399 int len = strlen(cstr) + 1; 00400 00401 this_header->used_ = 0; // don't bother copying data if need to realloc 00402 char* this_cstr = ensure_cstr(len); 00403 this_header = GetHeader(); // for realloc 00404 memcpy(this_cstr, cstr, len); 00405 this_header->used_ = len; 00406 } else { 00407 // Reallocate to same state as default constructor. 00408 DiscardData(); 00409 // Empty STRINGs contain just the "\0". 00410 memcpy(AllocData(1, kMinCapacity), "", 1); 00411 } 00412 00413 assert(InvariantOk()); 00414 return *this; 00415 } 00416 00417 void STRING::assign(const char *cstr, int len) { 00418 STRING_HEADER* this_header = GetHeader(); 00419 this_header->used_ = 0; // don't bother copying data if need to realloc 00420 char* this_cstr = ensure_cstr(len + 1); // +1 for '\0' 00421 00422 this_header = GetHeader(); // for realloc 00423 memcpy(this_cstr, cstr, len); 00424 this_cstr[len] = '\0'; 00425 this_header->used_ = len + 1; 00426 00427 assert(InvariantOk()); 00428 } 00429 00430 STRING STRING::operator+(const STRING& str) const { 00431 STRING result(*this); 00432 result += str; 00433 00434 assert(InvariantOk()); 00435 return result; 00436 } 00437 00438 00439 STRING STRING::operator+(const char ch) const { 00440 STRING result; 00441 FixHeader(); 00442 const STRING_HEADER* this_header = GetHeader(); 00443 int this_used = this_header->used_; 00444 char* result_cstr = result.ensure_cstr(this_used + 1); 00445 STRING_HEADER* result_header = result.GetHeader(); 00446 int result_used = result_header->used_; 00447 00448 // copies '\0' but we'll overwrite that 00449 memcpy(result_cstr, GetCStr(), this_used); 00450 result_cstr[result_used] = ch; // overwrite old '\0' 00451 result_cstr[result_used + 1] = '\0'; // append on '\0' 00452 ++result_header->used_; 00453 00454 assert(InvariantOk()); 00455 return result; 00456 } 00457 00458 00459 STRING& STRING::operator+=(const char *str) { 00460 if (!str || !*str) // empty string has no effect 00461 return *this; 00462 00463 FixHeader(); 00464 int len = strlen(str) + 1; 00465 int this_used = GetHeader()->used_; 00466 char* this_cstr = ensure_cstr(this_used + len); 00467 STRING_HEADER* this_header = GetHeader(); // after ensure for realloc 00468 00469 // if we had non-empty string then append overwriting old '\0' 00470 // otherwise replace 00471 if (this_used > 0) { 00472 memcpy(this_cstr + this_used - 1, str, len); 00473 this_header->used_ += len - 1; 00474 } else { 00475 memcpy(this_cstr, str, len); 00476 this_header->used_ = len; 00477 } 00478 00479 assert(InvariantOk()); 00480 return *this; 00481 } 00482 00483 00484 STRING& STRING::operator+=(const char ch) { 00485 if (ch == '\0') 00486 return *this; 00487 00488 FixHeader(); 00489 int this_used = GetHeader()->used_; 00490 char* this_cstr = ensure_cstr(this_used + 1); 00491 STRING_HEADER* this_header = GetHeader(); 00492 00493 if (this_used > 0) 00494 --this_used; // undo old empty null if there was one 00495 00496 this_cstr[this_used++] = ch; // append ch to end 00497 this_cstr[this_used++] = '\0'; // append '\0' after ch 00498 this_header->used_ = this_used; 00499 00500 assert(InvariantOk()); 00501 return *this; 00502 }