tesseract 3.04.01

ccutil/scanutils.cpp

Go to the documentation of this file.
00001 // Copyright 2006 Google Inc.
00002 // All Rights Reserved.
00003 // Author: renn
00004 //
00005 // The fscanf, vfscanf and creat functions are implemented so that their
00006 // functionality is mostly like their stdio counterparts. However, currently
00007 // these functions do not use any buffering, making them rather slow.
00008 // File streams are thus processed one character at a time.
00009 // Although the implementations of the scanf functions do lack a few minor
00010 // features, they should be sufficient for their use in tesseract.
00011 //
00012 // Licensed under the Apache License, Version 2.0 (the "License");
00013 // you may not use this file except in compliance with the License.
00014 // You may obtain a copy of the License at
00015 // http://www.apache.org/licenses/LICENSE-2.0
00016 // Unless required by applicable law or agreed to in writing, software
00017 // distributed under the License is distributed on an "AS IS" BASIS,
00018 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00019 // See the License for the specific language governing permissions and
00020 // limitations under the License.
00021 
00022 #ifdef HAVE_CONFIG_H
00023 #include "config_auto.h"
00024 #endif
00025 
00026 #include <ctype.h>
00027 #include <math.h>
00028 #include <stdarg.h>
00029 #include <stddef.h>
00030 #include <string.h>
00031 #include <limits.h>
00032 #include <stdio.h>
00033 #include <sys/types.h>
00034 #include <sys/stat.h>
00035 #include <fcntl.h>
00036 
00037 #include "scanutils.h"
00038 #include "tprintf.h"
00039 
// Workaround for "'off_t' was not declared in this scope" with -std=c++11:
// when the build does not define HAVE_OFF_T, declare off_t as a plain long.
// In this file off_t only holds the result of ftell(), which returns long.
#if !defined(HAVE_OFF_T)
typedef long off_t;
#endif  // !defined(HAVE_OFF_T)
00044 
// Bit flags collected while a single conversion specification is parsed.
// Each flag occupies its own bit so they can be OR-ed together.
enum Flags {
  FL_SPLAT = 1 << 0,  // '*' given: parse the value but do not assign it
  FL_INV   = 1 << 1,  // Character set (%[) is inverted
  FL_WIDTH = 1 << 2,  // An explicit field width was given
  FL_MINUS = 1 << 3   // Negative number
};
00051 
// Size "rank" of the argument a conversion stores into. The scanner starts
// at RANK_INT, moves one step down for every 'h' and one step up for every
// 'l' in the format, and clamps the result to [kMinRank, kMaxRank].
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT,         // -1
  RANK_INT,           //  0
  RANK_LONG,          //  1
  RANK_LONGLONG,      //  2
  RANK_PTR = INT_MAX  // Special value used for pointers
};

// Clamping bounds applied to the rank once all length modifiers are read.
const Ranks kMinRank = RANK_CHAR;
const Ranks kMaxRank = RANK_LONGLONG;

// Ranks selected by the 'j', 'z' and 't' length modifiers respectively.
const Ranks kIntMaxRank = RANK_LONGLONG;
const Ranks kSizeTRank = RANK_LONG;
const Ranks kPtrDiffRank = RANK_LONG;
00067 
// Reason the scanner stopped early. BAIL_NONE must stay 0 because the main
// loop tests the value as a boolean.
enum Bail {
  BAIL_NONE = 0,  // No error condition
  BAIL_EOF  = 1,  // Hit EOF
  BAIL_ERR  = 2   // Conversion mismatch
};
00073 
00074 // Helper functions ------------------------------------------------------------
// Returns the number of bits in a long, i.e. the number of bits held by one
// word of the character-set bitmap.
inline size_t LongBit() {
  return sizeof(long) * CHAR_BIT;
}
00078 
// Consumes whitespace from s and returns the first non-whitespace character
// (or EOF). That character is pushed back so the next read sees it again.
static inline int SkipSpace(FILE *s) {
  int next = fgetc(s);
  while (isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s);  // Make sure next char is available for reading
  return next;
}
00086 
00087 static inline void
00088 SetBit(unsigned long *bitmap, unsigned int bit) {
00089   bitmap[bit/LongBit()] |= 1UL << (bit%LongBit());
00090 }
00091 
00092 static inline int
00093 TestBit(unsigned long *bitmap, unsigned int bit) {
00094   return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1;
00095 }
00096 
// Returns the numeric value of the character ch when read as a digit in the
// given base, or -1 if ch is not a valid digit for that base.
//   - '0'..'9' are accepted for any base >= 10; for smaller bases only
//     '0'..'7' are accepted (i.e. smaller bases are treated as octal).
//   - 'A'..'F' and 'a'..'f' are accepted only for base 16.
// Letters beyond 'F'/'f' are rejected: they would map to values >= 16, which
// are not digits in base 16.
static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7')
      return ch - '0';
    return -1;
  }
  if (base == 16) {
    if (ch >= 'A' && ch <= 'F')
      return ch - 'A' + 10;
    if (ch >= 'a' && ch <= 'f')
      return ch - 'a' + 10;
  }
  return -1;
}
00108 
00109 // IO (re-)implementations -----------------------------------------------------
00110 uintmax_t streamtoumax(FILE* s, int base) {
00111   int minus = 0;
00112   uintmax_t v = 0;
00113   int d, c = 0;
00114 
00115   for (c = fgetc(s);
00116     isspace(static_cast<unsigned char>(c)) && (c != EOF);
00117     c = fgetc(s)) {}
00118 
00119   // Single optional + or -
00120   if (c == '-' || c == '+') {
00121     minus = (c == '-');
00122     c = fgetc(s);
00123   }
00124 
00125   // Assign correct base
00126   if (base == 0) {
00127     if (c == '0') {
00128       c = fgetc(s);
00129       if (c == 'x' || c == 'X') {
00130         base = 16;
00131         c = fgetc(s);
00132       } else {
00133         base = 8;
00134       }
00135     }
00136   } else if (base == 16) {
00137     if (c == '0') {
00138       c = fgetc(s);
00139       if (c == 'x' || c == 'X') c = fgetc(s);
00140     }
00141   }
00142 
00143   // Actual number parsing
00144   for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s))
00145     v = v*base + d;
00146 
00147   ungetc(c, s);
00148   return minus ? -v : v;
00149 }
00150 
// Reads a decimal floating point number from the stream s, one character at
// a time, and returns it as a double.
//
// Accepted syntax: optional leading whitespace, an optional '+' or '-',
// integer digits, an optional '.' followed by fraction digits, and an
// optional exponent ('e' or 'E', optional sign, digits). The first character
// that is not part of the number is pushed back onto the stream. Returns 0
// if no digits were found.
//
// All parts are accumulated in doubles so that long digit strings cannot
// overflow an int.
double streamtofloat(FILE* s) {
  // Skip leading whitespace.
  int c = fgetc(s);
  while (c != EOF && isspace(c))
    c = fgetc(s);

  // Single optional + or -
  bool minus = false;
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Integer part. EOF is negative, so the digit test also stops at EOF.
  double f = 0.0;
  for (; c >= '0' && c <= '9'; c = fgetc(s))
    f = f * 10.0 + (c - '0');

  // Fractional part: collect the digits as an integer and divide once.
  if (c == '.') {
    double fraction = 0.0;
    double scale = 1.0;
    for (c = fgetc(s); c >= '0' && c <= '9'; c = fgetc(s)) {
      // Digits past this point cannot change the result; they are still
      // consumed but ignored so that scale never overflows to infinity.
      if (scale < 1e300) {
        fraction = fraction * 10.0 + (c - '0');
        scale *= 10.0;
      }
    }
    f += fraction / scale;
  }

  // Optional exponent.
  if (c == 'e' || c == 'E') {
    c = fgetc(s);
    int expsign = 1;
    if (c == '-' || c == '+') {
      expsign = (c == '-') ? -1 : 1;
      c = fgetc(s);
    }
    int exponent = 0;
    for (; c >= '0' && c <= '9'; c = fgetc(s)) {
      // Saturate: anything this large already yields 0 or infinity.
      if (exponent < 10000)
        exponent = exponent * 10 + (c - '0');
    }
    f *= pow(10.0, static_cast<double>(expsign * exponent));
  }
  ungetc(c, s);  // No-op when c is EOF.

  return minus ? -f : f;
}
00197 
// Parses a decimal floating point number from the start of the string s and
// returns it as a double.
//
// Accepted syntax matches streamtofloat(): optional leading whitespace, an
// optional '+' or '-', integer digits, an optional '.' followed by fraction
// digits, and an optional exponent ('e' or 'E', optional sign, digits).
// Parsing stops at the first character that does not fit; returns 0 if no
// digits were found.
//
// All parts are accumulated in doubles so that long digit strings cannot
// overflow an int.
double strtofloat(const char* s) {
  while (*s && isspace(static_cast<unsigned char>(*s))) s++;

  // Single optional + or -
  bool minus = false;
  if (*s == '-' || *s == '+') {
    minus = (*s == '-');
    s++;
  }

  // Integer part.
  double f = 0.0;
  for (; *s >= '0' && *s <= '9'; s++)
    f = f * 10.0 + (*s - '0');

  // Fractional part: collect the digits as an integer and divide once.
  if (*s == '.') {
    double fraction = 0.0;
    double scale = 1.0;
    for (++s; *s >= '0' && *s <= '9'; s++) {
      // Digits past this point cannot change the result; skip them so that
      // scale never overflows to infinity.
      if (scale < 1e300) {
        fraction = fraction * 10.0 + (*s - '0');
        scale *= 10.0;
      }
    }
    f += fraction / scale;
  }

  // Optional exponent, handled the same way as in streamtofloat().
  if (*s == 'e' || *s == 'E') {
    s++;
    int expsign = 1;
    if (*s == '-' || *s == '+') {
      expsign = (*s == '-') ? -1 : 1;
      s++;
    }
    int exponent = 0;
    for (; *s >= '0' && *s <= '9'; s++) {
      // Saturate: anything this large already yields 0 or infinity.
      if (exponent < 10000)
        exponent = exponent * 10 + (*s - '0');
    }
    f *= pow(10.0, static_cast<double>(expsign * exponent));
  }

  return minus ? -f : f;
}
00230 
// Core scanner, defined further down in this file; the variadic wrappers
// only package their arguments into a va_list and forward to it.
static int tvfscanf(FILE* stream, const char *format, va_list ap);

// fscanf-style entry point: reads from stream according to format and
// stores the results through the pointer arguments that follow. Returns
// whatever tvfscanf() returns for the same stream and format.
int tfscanf(FILE* stream, const char *format, ...) {
  va_list args;
  va_start(args, format);
  const int result = tvfscanf(stream, format, args);
  va_end(args);
  return result;
}
00243 
00244 #ifdef EMBEDDED
00245 
00246 int fscanf(FILE* stream, const char *format, ...) {
00247   va_list ap;
00248   int rv;
00249 
00250   va_start(ap, format);
00251   rv = tvfscanf(stream, format, ap);
00252   va_end(ap);
00253 
00254   return rv;
00255 }
00256 
00257 int vfscanf(FILE* stream, const char *format, ...) {
00258   va_list ap;
00259   int rv;
00260 
00261   va_start(ap, format);
00262   rv = tvfscanf(stream, format, ap);
00263   va_end(ap);
00264 
00265   return rv;
00266 }
00267 #endif
00268 
00269 static int tvfscanf(FILE* stream, const char *format, va_list ap) {
00270   const char *p = format;
00271   char ch;
00272   int q = 0;
00273   uintmax_t val = 0;
00274   int rank = RANK_INT;    // Default rank
00275   unsigned int width = UINT_MAX;
00276   int base;
00277   int flags = 0;
00278   enum {
00279     ST_NORMAL,        // Ground state
00280     ST_FLAGS,         // Special flags
00281     ST_WIDTH,         // Field width
00282     ST_MODIFIERS,     // Length or conversion modifiers
00283     ST_MATCH_INIT,    // Initial state of %[ sequence
00284     ST_MATCH,         // Main state of %[ sequence
00285     ST_MATCH_RANGE,   // After - in a %[ sequence
00286   } state = ST_NORMAL;
00287   char *sarg = NULL;    // %s %c or %[ string argument
00288   enum Bail bail = BAIL_NONE;
00289   int sign;
00290   int converted = 0;    // Successful conversions
00291   unsigned long matchmap[((1 << CHAR_BIT)+(CHAR_BIT * sizeof(long) - 1)) /
00292       (CHAR_BIT * sizeof(long))];
00293   int matchinv = 0;   // Is match map inverted?
00294   unsigned char range_start = 0;
00295   off_t start_off = ftell(stream);
00296 
00297   // Skip leading spaces
00298   SkipSpace(stream);
00299 
00300   while ((ch = *p++) && !bail) {
00301     switch (state) {
00302       case ST_NORMAL:
00303         if (ch == '%') {
00304           state = ST_FLAGS;
00305           flags = 0; rank = RANK_INT; width = UINT_MAX;
00306         } else if (isspace(static_cast<unsigned char>(ch))) {
00307           SkipSpace(stream);
00308         } else {
00309           if (fgetc(stream) != ch)
00310             bail = BAIL_ERR;  // Match failure
00311         }
00312         break;
00313 
00314       case ST_FLAGS:
00315         if (ch == '*') {
00316           flags |= FL_SPLAT;
00317         } else if ('0' <= ch && ch <= '9') {
00318           width = (ch-'0');
00319           state = ST_WIDTH;
00320           flags |= FL_WIDTH;
00321         } else {
00322           state = ST_MODIFIERS;
00323           p--;      // Process this character again
00324         }
00325       break;
00326 
00327       case ST_WIDTH:
00328         if (ch >= '0' && ch <= '9') {
00329           width = width*10+(ch-'0');
00330         } else {
00331           state = ST_MODIFIERS;
00332           p--;      // Process this character again
00333         }
00334       break;
00335 
00336       case ST_MODIFIERS:
00337         switch (ch) {
00338           // Length modifiers - nonterminal sequences
00339           case 'h':
00340             rank--;     // Shorter rank
00341           break;
00342           case 'l':
00343             rank++;     // Longer rank
00344           break;
00345           case 'j':
00346             rank = kIntMaxRank;
00347           break;
00348           case 'z':
00349             rank = kSizeTRank;
00350           break;
00351           case 't':
00352             rank = kPtrDiffRank;
00353           break;
00354           case 'L':
00355           case 'q':
00356             rank = RANK_LONGLONG; // long double/long long
00357           break;
00358 
00359           default:
00360             // Output modifiers - terminal sequences
00361             state = ST_NORMAL;  // Next state will be normal
00362             if (rank < kMinRank)  // Canonicalize rank
00363               rank = kMinRank;
00364             else if (rank > kMaxRank)
00365               rank = kMaxRank;
00366 
00367           switch (ch) {
00368             case 'P':   // Upper case pointer
00369             case 'p':   // Pointer
00370               rank = RANK_PTR;
00371               base = 0; sign = 0;
00372             goto scan_int;
00373 
00374             case 'i':   // Base-independent integer
00375               base = 0; sign = 1;
00376             goto scan_int;
00377 
00378             case 'd':   // Decimal integer
00379               base = 10; sign = 1;
00380             goto scan_int;
00381 
00382             case 'o':   // Octal integer
00383               base = 8; sign = 0;
00384             goto scan_int;
00385 
00386             case 'u':   // Unsigned decimal integer
00387               base = 10; sign = 0;
00388             goto scan_int;
00389 
00390             case 'x':   // Hexadecimal integer
00391             case 'X':
00392               base = 16; sign = 0;
00393             goto scan_int;
00394 
00395             case 'n':   // Number of characters consumed
00396               val = ftell(stream) - start_off;
00397             goto set_integer;
00398 
00399             scan_int:
00400               q = SkipSpace(stream);
00401               if ( q <= 0 ) {
00402                 bail = BAIL_EOF;
00403                 break;
00404               }
00405               val = streamtoumax(stream, base);
00406               // fall through
00407 
00408             set_integer:
00409               if (!(flags & FL_SPLAT)) {
00410                 converted++;
00411                 switch(rank) {
00412                   case RANK_CHAR:
00413                     *va_arg(ap, unsigned char *)
00414                       = static_cast<unsigned char>(val);
00415                   break;
00416                   case RANK_SHORT:
00417                     *va_arg(ap, unsigned short *)
00418                       = static_cast<unsigned short>(val);
00419                   break;
00420                   case RANK_INT:
00421                     *va_arg(ap, unsigned int *)
00422                       = static_cast<unsigned int>(val);
00423                   break;
00424                   case RANK_LONG:
00425                     *va_arg(ap, unsigned long *)
00426                       = static_cast<unsigned long>(val);
00427                   break;
00428                   case RANK_LONGLONG:
00429                     *va_arg(ap, unsigned long long *)
00430                       = static_cast<unsigned long long>(val);
00431                   break;
00432                   case RANK_PTR:
00433                     *va_arg(ap, void **)
00434                       = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
00435                   break;
00436                 }
00437               }
00438             break;
00439 
00440             case 'f':   // Preliminary float value parsing
00441             case 'g':
00442             case 'G':
00443             case 'e':
00444             case 'E':
00445               q = SkipSpace(stream);
00446               if (q <= 0) {
00447                 bail = BAIL_EOF;
00448                 break;
00449               }
00450 
00451               {
00452               double fval = streamtofloat(stream);
00453               if (!(flags & FL_SPLAT)) {
00454                 if (rank == RANK_INT)
00455                   *va_arg(ap, float *) = static_cast<float>(fval);
00456                 else if (rank == RANK_LONG)
00457                   *va_arg(ap, double *) = static_cast<double>(fval);
00458                 converted++;
00459               }
00460               }
00461             break;
00462 
00463             case 'c':               // Character
00464               width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
00465               sarg = va_arg(ap, char *);
00466               while (width--) {
00467                 if ((q = fgetc(stream)) <= 0) {
00468                   bail = BAIL_EOF;
00469                   break;
00470                 }
00471                 if (!(flags & FL_SPLAT)) {
00472                   *sarg++ = q;
00473                   converted++;
00474                 }
00475               }
00476             break;
00477 
00478             case 's':               // String
00479             {
00480               char *sp;
00481               sp = sarg = va_arg(ap, char *);
00482               while (width--) {
00483                 q = fgetc(stream);
00484                 if (isspace(static_cast<unsigned char>(q)) || q <= 0) {
00485                   ungetc(q, stream);
00486                   break;
00487                 }
00488                 if (!(flags & FL_SPLAT)) *sp = q;
00489                 sp++;
00490               }
00491               if (sarg == sp) {
00492                 bail = BAIL_EOF;
00493               } else if (!(flags & FL_SPLAT)) {
00494                 *sp = '\0'; // Terminate output
00495                 converted++;
00496               } else {
00497               }
00498             }
00499             break;
00500 
00501             case '[':   // Character range
00502               sarg = va_arg(ap, char *);
00503               state = ST_MATCH_INIT;
00504               matchinv = 0;
00505               memset(matchmap, 0, sizeof matchmap);
00506             break;
00507 
00508             case '%':   // %% sequence
00509               if (fgetc(stream) != '%' )
00510                 bail = BAIL_ERR;
00511             break;
00512 
00513             default:    // Anything else
00514               bail = BAIL_ERR;  // Unknown sequence
00515             break;
00516           }
00517         }
00518       break;
00519 
00520       case ST_MATCH_INIT:   // Initial state for %[ match
00521         if (ch == '^' && !(flags & FL_INV)) {
00522           matchinv = 1;
00523         } else {
00524           SetBit(matchmap, static_cast<unsigned char>(ch));
00525           state = ST_MATCH;
00526         }
00527       break;
00528 
00529       case ST_MATCH:    // Main state for %[ match
00530         if (ch == ']') {
00531           goto match_run;
00532         } else if (ch == '-') {
00533           range_start = static_cast<unsigned char>(ch);
00534           state = ST_MATCH_RANGE;
00535         } else {
00536           SetBit(matchmap, static_cast<unsigned char>(ch));
00537         }
00538       break;
00539 
00540       case ST_MATCH_RANGE:    // %[ match after -
00541         if (ch == ']') {
00542           SetBit(matchmap, static_cast<unsigned char>('-'));
00543           goto match_run;
00544         } else {
00545           int i;
00546           for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
00547           SetBit(matchmap, i);
00548           state = ST_MATCH;
00549         }
00550       break;
00551 
00552       match_run:      // Match expression finished
00553         char* oarg = sarg;
00554         while (width) {
00555           q = fgetc(stream);
00556           unsigned char qc = static_cast<unsigned char>(q);
00557           if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
00558             ungetc(q, stream);
00559             break;
00560           }
00561           if (!(flags & FL_SPLAT)) *sarg = q;
00562           sarg++;
00563         }
00564         if (oarg == sarg) {
00565           bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
00566         } else if (!(flags & FL_SPLAT)) {
00567           *sarg = '\0';
00568           converted++;
00569         }
00570       break;
00571     }
00572   }
00573 
00574   if (bail == BAIL_EOF && !converted)
00575     converted = -1;   // Return EOF (-1)
00576 
00577   return converted;
00578 }
00579 
00580 #ifdef EMBEDDED
// Replacement for creat() in EMBEDDED builds, expressed through open():
// opens pathname write-only, creating it with the given mode if needed and
// truncating it otherwise. Returns the result of open().
int creat(const char *pathname, mode_t mode) {
  const int open_flags = O_CREAT | O_TRUNC | O_WRONLY;
  return open(pathname, open_flags, mode);
}
00584 
00585 #endif  // EMBEDDED
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines