|
tesseract 3.04.01
|
// Copyright 2006 Google Inc.
// All Rights Reserved.
// Author: renn
//
// The fscanf, vfscanf and creat functions are implemented so that their
// functionality is mostly like their stdio counterparts. However, currently
// these functions do not use any buffering, making them rather slow.
// File streams are thus processed one character at a time.
// Although the implementations of the scanf functions do lack a few minor
// features, they should be sufficient for their use in tesseract.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
00021 00022 #ifdef HAVE_CONFIG_H 00023 #include "config_auto.h" 00024 #endif 00025 00026 #include <ctype.h> 00027 #include <math.h> 00028 #include <stdarg.h> 00029 #include <stddef.h> 00030 #include <string.h> 00031 #include <limits.h> 00032 #include <stdio.h> 00033 #include <sys/types.h> 00034 #include <sys/stat.h> 00035 #include <fcntl.h> 00036 00037 #include "scanutils.h" 00038 #include "tprintf.h" 00039 00040 // workaround for "'off_t' was not declared in this scope" with -std=c++11 00041 #if !defined(HAVE_OFF_T) 00042 typedef long off_t; 00043 #endif // off_t 00044 00045 enum Flags { 00046 FL_SPLAT = 0x01, // Drop the value, do not assign 00047 FL_INV = 0x02, // Character-set with inverse 00048 FL_WIDTH = 0x04, // Field width specified 00049 FL_MINUS = 0x08, // Negative number 00050 }; 00051 00052 enum Ranks { 00053 RANK_CHAR = -2, 00054 RANK_SHORT = -1, 00055 RANK_INT = 0, 00056 RANK_LONG = 1, 00057 RANK_LONGLONG = 2, 00058 RANK_PTR = INT_MAX // Special value used for pointers 00059 }; 00060 00061 const enum Ranks kMinRank = RANK_CHAR; 00062 const enum Ranks kMaxRank = RANK_LONGLONG; 00063 00064 const enum Ranks kIntMaxRank = RANK_LONGLONG; 00065 const enum Ranks kSizeTRank = RANK_LONG; 00066 const enum Ranks kPtrDiffRank = RANK_LONG; 00067 00068 enum Bail { 00069 BAIL_NONE = 0, // No error condition 00070 BAIL_EOF, // Hit EOF 00071 BAIL_ERR // Conversion mismatch 00072 }; 00073 00074 // Helper functions ------------------------------------------------------------ 00075 inline size_t LongBit() { 00076 return CHAR_BIT * sizeof(long); 00077 } 00078 00079 static inline int 00080 SkipSpace(FILE *s) { 00081 int p; 00082 while (isspace(p = fgetc(s))); 00083 ungetc(p, s); // Make sure next char is available for reading 00084 return p; 00085 } 00086 00087 static inline void 00088 SetBit(unsigned long *bitmap, unsigned int bit) { 00089 bitmap[bit/LongBit()] |= 1UL << (bit%LongBit()); 00090 } 00091 00092 static inline int 00093 TestBit(unsigned long *bitmap, unsigned 
int bit) { 00094 return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1; 00095 } 00096 00097 static inline int DigitValue(int ch, int base) { 00098 if (ch >= '0' && ch <= '9') { 00099 if (base >= 10 || ch <= '7') 00100 return ch-'0'; 00101 } else if (ch >= 'A' && ch <= 'Z' && base == 16) { 00102 return ch-'A'+10; 00103 } else if (ch >= 'a' && ch <= 'z' && base == 16) { 00104 return ch-'a'+10; 00105 } 00106 return -1; 00107 } 00108 00109 // IO (re-)implementations ----------------------------------------------------- 00110 uintmax_t streamtoumax(FILE* s, int base) { 00111 int minus = 0; 00112 uintmax_t v = 0; 00113 int d, c = 0; 00114 00115 for (c = fgetc(s); 00116 isspace(static_cast<unsigned char>(c)) && (c != EOF); 00117 c = fgetc(s)) {} 00118 00119 // Single optional + or - 00120 if (c == '-' || c == '+') { 00121 minus = (c == '-'); 00122 c = fgetc(s); 00123 } 00124 00125 // Assign correct base 00126 if (base == 0) { 00127 if (c == '0') { 00128 c = fgetc(s); 00129 if (c == 'x' || c == 'X') { 00130 base = 16; 00131 c = fgetc(s); 00132 } else { 00133 base = 8; 00134 } 00135 } 00136 } else if (base == 16) { 00137 if (c == '0') { 00138 c = fgetc(s); 00139 if (c == 'x' || c == 'X') c = fgetc(s); 00140 } 00141 } 00142 00143 // Actual number parsing 00144 for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s)) 00145 v = v*base + d; 00146 00147 ungetc(c, s); 00148 return minus ? 
-v : v; 00149 } 00150 00151 double streamtofloat(FILE* s) { 00152 int minus = 0; 00153 int v = 0; 00154 int d, c = 0; 00155 int k = 1; 00156 int w = 0; 00157 00158 for (c = fgetc(s); 00159 isspace(static_cast<unsigned char>(c)) && (c != EOF); 00160 c = fgetc(s)); 00161 00162 // Single optional + or - 00163 if (c == '-' || c == '+') { 00164 minus = (c == '-'); 00165 c = fgetc(s); 00166 } 00167 00168 // Actual number parsing 00169 for (; c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) 00170 v = v*10 + d; 00171 if (c == '.') { 00172 for (c = fgetc(s); c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) { 00173 w = w*10 + d; 00174 k *= 10; 00175 } 00176 } 00177 double f = static_cast<double>(v) 00178 + static_cast<double>(w) / static_cast<double>(k); 00179 if (c == 'e' || c == 'E') { 00180 c = fgetc(s); 00181 int expsign = 1; 00182 if (c == '-' || c == '+') { 00183 expsign = (c == '-') ? -1 : 1; 00184 c = fgetc(s); 00185 } 00186 int exponent = 0; 00187 for (; (c != EOF) && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) { 00188 exponent = exponent * 10 + d; 00189 } 00190 exponent *= expsign; 00191 f *= pow(10.0, static_cast<double>(exponent)); 00192 } 00193 ungetc(c, s); 00194 00195 return minus ? 
-f : f; 00196 } 00197 00198 double strtofloat(const char* s) { 00199 int minus = 0; 00200 int v = 0; 00201 int d; 00202 int k = 1; 00203 int w = 0; 00204 00205 while(*s && isspace(static_cast<unsigned char>(*s))) s++; 00206 00207 // Single optional + or - 00208 if (*s == '-' || *s == '+') { 00209 minus = (*s == '-'); 00210 s++; 00211 } 00212 00213 // Actual number parsing 00214 for (; *s && (d = DigitValue(*s, 10)) >= 0; s++) 00215 v = v*10 + d; 00216 if (*s == '.') { 00217 for (++s; *s && (d = DigitValue(*s, 10)) >= 0; s++) { 00218 w = w*10 + d; 00219 k *= 10; 00220 } 00221 } 00222 if (*s == 'e' || *s == 'E') 00223 tprintf("WARNING: Scientific Notation not supported!"); 00224 00225 double f = static_cast<double>(v) 00226 + static_cast<double>(w) / static_cast<double>(k); 00227 00228 return minus ? -f : f; 00229 } 00230 00231 static int tvfscanf(FILE* stream, const char *format, va_list ap); 00232 00233 int tfscanf(FILE* stream, const char *format, ...) { 00234 va_list ap; 00235 int rv; 00236 00237 va_start(ap, format); 00238 rv = tvfscanf(stream, format, ap); 00239 va_end(ap); 00240 00241 return rv; 00242 } 00243 00244 #ifdef EMBEDDED 00245 00246 int fscanf(FILE* stream, const char *format, ...) { 00247 va_list ap; 00248 int rv; 00249 00250 va_start(ap, format); 00251 rv = tvfscanf(stream, format, ap); 00252 va_end(ap); 00253 00254 return rv; 00255 } 00256 00257 int vfscanf(FILE* stream, const char *format, ...) 
{ 00258 va_list ap; 00259 int rv; 00260 00261 va_start(ap, format); 00262 rv = tvfscanf(stream, format, ap); 00263 va_end(ap); 00264 00265 return rv; 00266 } 00267 #endif 00268 00269 static int tvfscanf(FILE* stream, const char *format, va_list ap) { 00270 const char *p = format; 00271 char ch; 00272 int q = 0; 00273 uintmax_t val = 0; 00274 int rank = RANK_INT; // Default rank 00275 unsigned int width = UINT_MAX; 00276 int base; 00277 int flags = 0; 00278 enum { 00279 ST_NORMAL, // Ground state 00280 ST_FLAGS, // Special flags 00281 ST_WIDTH, // Field width 00282 ST_MODIFIERS, // Length or conversion modifiers 00283 ST_MATCH_INIT, // Initial state of %[ sequence 00284 ST_MATCH, // Main state of %[ sequence 00285 ST_MATCH_RANGE, // After - in a %[ sequence 00286 } state = ST_NORMAL; 00287 char *sarg = NULL; // %s %c or %[ string argument 00288 enum Bail bail = BAIL_NONE; 00289 int sign; 00290 int converted = 0; // Successful conversions 00291 unsigned long matchmap[((1 << CHAR_BIT)+(CHAR_BIT * sizeof(long) - 1)) / 00292 (CHAR_BIT * sizeof(long))]; 00293 int matchinv = 0; // Is match map inverted? 
00294 unsigned char range_start = 0; 00295 off_t start_off = ftell(stream); 00296 00297 // Skip leading spaces 00298 SkipSpace(stream); 00299 00300 while ((ch = *p++) && !bail) { 00301 switch (state) { 00302 case ST_NORMAL: 00303 if (ch == '%') { 00304 state = ST_FLAGS; 00305 flags = 0; rank = RANK_INT; width = UINT_MAX; 00306 } else if (isspace(static_cast<unsigned char>(ch))) { 00307 SkipSpace(stream); 00308 } else { 00309 if (fgetc(stream) != ch) 00310 bail = BAIL_ERR; // Match failure 00311 } 00312 break; 00313 00314 case ST_FLAGS: 00315 if (ch == '*') { 00316 flags |= FL_SPLAT; 00317 } else if ('0' <= ch && ch <= '9') { 00318 width = (ch-'0'); 00319 state = ST_WIDTH; 00320 flags |= FL_WIDTH; 00321 } else { 00322 state = ST_MODIFIERS; 00323 p--; // Process this character again 00324 } 00325 break; 00326 00327 case ST_WIDTH: 00328 if (ch >= '0' && ch <= '9') { 00329 width = width*10+(ch-'0'); 00330 } else { 00331 state = ST_MODIFIERS; 00332 p--; // Process this character again 00333 } 00334 break; 00335 00336 case ST_MODIFIERS: 00337 switch (ch) { 00338 // Length modifiers - nonterminal sequences 00339 case 'h': 00340 rank--; // Shorter rank 00341 break; 00342 case 'l': 00343 rank++; // Longer rank 00344 break; 00345 case 'j': 00346 rank = kIntMaxRank; 00347 break; 00348 case 'z': 00349 rank = kSizeTRank; 00350 break; 00351 case 't': 00352 rank = kPtrDiffRank; 00353 break; 00354 case 'L': 00355 case 'q': 00356 rank = RANK_LONGLONG; // long double/long long 00357 break; 00358 00359 default: 00360 // Output modifiers - terminal sequences 00361 state = ST_NORMAL; // Next state will be normal 00362 if (rank < kMinRank) // Canonicalize rank 00363 rank = kMinRank; 00364 else if (rank > kMaxRank) 00365 rank = kMaxRank; 00366 00367 switch (ch) { 00368 case 'P': // Upper case pointer 00369 case 'p': // Pointer 00370 rank = RANK_PTR; 00371 base = 0; sign = 0; 00372 goto scan_int; 00373 00374 case 'i': // Base-independent integer 00375 base = 0; sign = 1; 00376 goto 
scan_int; 00377 00378 case 'd': // Decimal integer 00379 base = 10; sign = 1; 00380 goto scan_int; 00381 00382 case 'o': // Octal integer 00383 base = 8; sign = 0; 00384 goto scan_int; 00385 00386 case 'u': // Unsigned decimal integer 00387 base = 10; sign = 0; 00388 goto scan_int; 00389 00390 case 'x': // Hexadecimal integer 00391 case 'X': 00392 base = 16; sign = 0; 00393 goto scan_int; 00394 00395 case 'n': // Number of characters consumed 00396 val = ftell(stream) - start_off; 00397 goto set_integer; 00398 00399 scan_int: 00400 q = SkipSpace(stream); 00401 if ( q <= 0 ) { 00402 bail = BAIL_EOF; 00403 break; 00404 } 00405 val = streamtoumax(stream, base); 00406 // fall through 00407 00408 set_integer: 00409 if (!(flags & FL_SPLAT)) { 00410 converted++; 00411 switch(rank) { 00412 case RANK_CHAR: 00413 *va_arg(ap, unsigned char *) 00414 = static_cast<unsigned char>(val); 00415 break; 00416 case RANK_SHORT: 00417 *va_arg(ap, unsigned short *) 00418 = static_cast<unsigned short>(val); 00419 break; 00420 case RANK_INT: 00421 *va_arg(ap, unsigned int *) 00422 = static_cast<unsigned int>(val); 00423 break; 00424 case RANK_LONG: 00425 *va_arg(ap, unsigned long *) 00426 = static_cast<unsigned long>(val); 00427 break; 00428 case RANK_LONGLONG: 00429 *va_arg(ap, unsigned long long *) 00430 = static_cast<unsigned long long>(val); 00431 break; 00432 case RANK_PTR: 00433 *va_arg(ap, void **) 00434 = reinterpret_cast<void *>(static_cast<uintptr_t>(val)); 00435 break; 00436 } 00437 } 00438 break; 00439 00440 case 'f': // Preliminary float value parsing 00441 case 'g': 00442 case 'G': 00443 case 'e': 00444 case 'E': 00445 q = SkipSpace(stream); 00446 if (q <= 0) { 00447 bail = BAIL_EOF; 00448 break; 00449 } 00450 00451 { 00452 double fval = streamtofloat(stream); 00453 if (!(flags & FL_SPLAT)) { 00454 if (rank == RANK_INT) 00455 *va_arg(ap, float *) = static_cast<float>(fval); 00456 else if (rank == RANK_LONG) 00457 *va_arg(ap, double *) = static_cast<double>(fval); 00458 
converted++; 00459 } 00460 } 00461 break; 00462 00463 case 'c': // Character 00464 width = (flags & FL_WIDTH) ? width : 1; // Default width == 1 00465 sarg = va_arg(ap, char *); 00466 while (width--) { 00467 if ((q = fgetc(stream)) <= 0) { 00468 bail = BAIL_EOF; 00469 break; 00470 } 00471 if (!(flags & FL_SPLAT)) { 00472 *sarg++ = q; 00473 converted++; 00474 } 00475 } 00476 break; 00477 00478 case 's': // String 00479 { 00480 char *sp; 00481 sp = sarg = va_arg(ap, char *); 00482 while (width--) { 00483 q = fgetc(stream); 00484 if (isspace(static_cast<unsigned char>(q)) || q <= 0) { 00485 ungetc(q, stream); 00486 break; 00487 } 00488 if (!(flags & FL_SPLAT)) *sp = q; 00489 sp++; 00490 } 00491 if (sarg == sp) { 00492 bail = BAIL_EOF; 00493 } else if (!(flags & FL_SPLAT)) { 00494 *sp = '\0'; // Terminate output 00495 converted++; 00496 } else { 00497 } 00498 } 00499 break; 00500 00501 case '[': // Character range 00502 sarg = va_arg(ap, char *); 00503 state = ST_MATCH_INIT; 00504 matchinv = 0; 00505 memset(matchmap, 0, sizeof matchmap); 00506 break; 00507 00508 case '%': // %% sequence 00509 if (fgetc(stream) != '%' ) 00510 bail = BAIL_ERR; 00511 break; 00512 00513 default: // Anything else 00514 bail = BAIL_ERR; // Unknown sequence 00515 break; 00516 } 00517 } 00518 break; 00519 00520 case ST_MATCH_INIT: // Initial state for %[ match 00521 if (ch == '^' && !(flags & FL_INV)) { 00522 matchinv = 1; 00523 } else { 00524 SetBit(matchmap, static_cast<unsigned char>(ch)); 00525 state = ST_MATCH; 00526 } 00527 break; 00528 00529 case ST_MATCH: // Main state for %[ match 00530 if (ch == ']') { 00531 goto match_run; 00532 } else if (ch == '-') { 00533 range_start = static_cast<unsigned char>(ch); 00534 state = ST_MATCH_RANGE; 00535 } else { 00536 SetBit(matchmap, static_cast<unsigned char>(ch)); 00537 } 00538 break; 00539 00540 case ST_MATCH_RANGE: // %[ match after - 00541 if (ch == ']') { 00542 SetBit(matchmap, static_cast<unsigned char>('-')); 00543 goto match_run; 00544 } 
else { 00545 int i; 00546 for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++) 00547 SetBit(matchmap, i); 00548 state = ST_MATCH; 00549 } 00550 break; 00551 00552 match_run: // Match expression finished 00553 char* oarg = sarg; 00554 while (width) { 00555 q = fgetc(stream); 00556 unsigned char qc = static_cast<unsigned char>(q); 00557 if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) { 00558 ungetc(q, stream); 00559 break; 00560 } 00561 if (!(flags & FL_SPLAT)) *sarg = q; 00562 sarg++; 00563 } 00564 if (oarg == sarg) { 00565 bail = (q <= 0) ? BAIL_EOF : BAIL_ERR; 00566 } else if (!(flags & FL_SPLAT)) { 00567 *sarg = '\0'; 00568 converted++; 00569 } 00570 break; 00571 } 00572 } 00573 00574 if (bail == BAIL_EOF && !converted) 00575 converted = -1; // Return EOF (-1) 00576 00577 return converted; 00578 } 00579 00580 #ifdef EMBEDDED 00581 int creat(const char *pathname, mode_t mode) { 00582 return open(pathname, O_CREAT | O_TRUNC | O_WRONLY, mode); 00583 } 00584 00585 #endif // EMBEDDED