tesseract  3.04.01
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
scanutils.cpp
Go to the documentation of this file.
1 // Copyright 2006 Google Inc.
2 // All Rights Reserved.
3 // Author: renn
4 //
5 // The fscanf, vfscanf and creat functions are implemented so that their
6 // functionality is mostly like their stdio counterparts. However, currently
7 // these functions do not use any buffering, making them rather slow.
8 // File streams are thus processed one character at a time.
9 // Although the implementations of the scanf functions do lack a few minor
10 // features, they should be sufficient for their use in tesseract.
11 //
12 // Licensed under the Apache License, Version 2.0 (the "License");
13 // you may not use this file except in compliance with the License.
14 // You may obtain a copy of the License at
15 // http://www.apache.org/licenses/LICENSE-2.0
16 // Unless required by applicable law or agreed to in writing, software
17 // distributed under the License is distributed on an "AS IS" BASIS,
18 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 // See the License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 
26 #include <ctype.h>
27 #include <math.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <fcntl.h>
36 
37 #include "scanutils.h"
38 #include "tprintf.h"
39 
// Fallback declaration of off_t for builds that do not provide one; works
// around "'off_t' was not declared in this scope" with -std=c++11.
#ifndef HAVE_OFF_T
typedef long off_t;
#endif  // HAVE_OFF_T
44 
// Bit flags describing the conversion specification currently being parsed.
// They are OR-ed together into the `flags` variable of tvfscanf.
enum Flags {
  FL_SPLAT = 1 << 0,  // Drop the value, do not assign
  FL_INV   = 1 << 1,  // Character-set with inverse
  FL_WIDTH = 1 << 2,  // Field width specified
  FL_MINUS = 1 << 3   // Negative number
};
51 
// Size "rank" of an integer conversion. The 'h' and 'l' length modifiers move
// the rank down/up by one step from RANK_INT; tvfscanf uses the final rank to
// pick the pointer type that receives the converted value.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT = -1,
  RANK_INT = 0,
  RANK_LONG = 1,
  RANK_LONGLONG = 2,
  RANK_PTR = INT_MAX  // Special value used for pointers
};

// Smallest and largest rank a chain of length modifiers may produce;
// tvfscanf clamps the rank into [kMinRank, kMaxRank].
const enum Ranks kMinRank = RANK_CHAR;
const enum Ranks kMaxRank = RANK_LONGLONG;

// Ranks selected by the 'j' (intmax_t), 'z' (size_t) and 't' (ptrdiff_t)
// length modifiers respectively.
const enum Ranks kIntMaxRank = RANK_LONGLONG;
const enum Ranks kSizeTRank = RANK_LONG;
const enum Ranks kPtrDiffRank = RANK_LONG;
67 
// Reason why tvfscanf stopped processing the format string early.
enum Bail {
  BAIL_NONE = 0,  // No error condition
  BAIL_EOF = 1,   // Hit EOF
  BAIL_ERR = 2    // Conversion mismatch
};
73 
74 // Helper functions ------------------------------------------------------------
// Returns the width of a long in bits.
inline size_t LongBit() {
  return sizeof(long) * CHAR_BIT;
}
78 
// Consumes whitespace from s and returns the first non-whitespace character
// (or EOF). That character is pushed back so the next read sees it again.
static inline int
SkipSpace(FILE *s) {
  int next = fgetc(s);
  while (isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s);  // Make sure next char is available for reading
  return next;
}
86 
// Sets bit number `bit` in a bitmap stored as an array of unsigned longs.
static inline void
SetBit(unsigned long *bitmap, unsigned int bit) {
  const size_t bits_per_word = CHAR_BIT * sizeof(long);
  const size_t word_index = bit / bits_per_word;
  const size_t bit_offset = bit % bits_per_word;
  bitmap[word_index] |= 1UL << bit_offset;
}
91 
// Returns 1 if bit number `bit` is set in the bitmap (an array of unsigned
// longs), 0 otherwise.
static inline int
TestBit(unsigned long *bitmap, unsigned int bit) {
  const size_t bits_per_word = CHAR_BIT * sizeof(long);
  const unsigned long word = bitmap[bit / bits_per_word];
  const size_t bit_offset = bit % bits_per_word;
  return static_cast<int>(word >> bit_offset) & 1;
}
96 
// Returns the numeric value of character ch when read as a digit in the given
// base, or -1 if ch is not a valid digit.
//  - '0'..'9' are accepted for base >= 10; for any smaller base only '0'..'7'
//    are accepted.
//  - 'A'..'F' and 'a'..'f' are accepted only for base 16. (Letters beyond 'F'
//    used to be accepted as well and yielded values >= 16.)
static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7')
      return ch - '0';
  } else if (ch >= 'A' && ch <= 'F' && base == 16) {
    return ch - 'A' + 10;
  } else if (ch >= 'a' && ch <= 'f' && base == 16) {
    return ch - 'a' + 10;
  }
  return -1;
}
108 
109 // IO (re-)implementations -----------------------------------------------------
110 uintmax_t streamtoumax(FILE* s, int base) {
111  int minus = 0;
112  uintmax_t v = 0;
113  int d, c = 0;
114 
115  for (c = fgetc(s);
116  isspace(static_cast<unsigned char>(c)) && (c != EOF);
117  c = fgetc(s)) {}
118 
119  // Single optional + or -
120  if (c == '-' || c == '+') {
121  minus = (c == '-');
122  c = fgetc(s);
123  }
124 
125  // Assign correct base
126  if (base == 0) {
127  if (c == '0') {
128  c = fgetc(s);
129  if (c == 'x' || c == 'X') {
130  base = 16;
131  c = fgetc(s);
132  } else {
133  base = 8;
134  }
135  }
136  } else if (base == 16) {
137  if (c == '0') {
138  c = fgetc(s);
139  if (c == 'x' || c == 'X') c = fgetc(s);
140  }
141  }
142 
143  // Actual number parsing
144  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s))
145  v = v*base + d;
146 
147  ungetc(c, s);
148  return minus ? -v : v;
149 }
150 
// Reads a decimal floating point number from stream s, one character at a
// time: optional leading whitespace, optional sign, integer digits, optional
// '.' with fraction digits, and an optional exponent introduced by 'e' or 'E'
// (with optional sign). The first character that is not part of the number is
// pushed back onto the stream.
//
// All accumulation is done in double so that long digit sequences cannot
// overflow an int. Fraction digits beyond the 18th and exponent digits beyond
// a magnitude of 10000 are consumed but ignored.
//
// Note: once 'e'/'E' (and a following sign) has been read it cannot be pushed
// back, because only a single ungetc is used; an 'e' without digits is
// treated as an exponent of 0.
double streamtofloat(FILE* s) {
  int c = fgetc(s);
  while (c != EOF && isspace(c))
    c = fgetc(s);

  // Single optional + or -
  bool negative = false;
  if (c == '-' || c == '+') {
    negative = (c == '-');
    c = fgetc(s);
  }

  // Integer part. isdigit(EOF) is well defined and false.
  double int_part = 0.0;
  for (; isdigit(c); c = fgetc(s))
    int_part = int_part * 10.0 + (c - '0');

  // Fraction part, kept as numerator / scale.
  double frac = 0.0;
  double scale = 1.0;
  if (c == '.') {
    for (c = fgetc(s); isdigit(c); c = fgetc(s)) {
      if (scale < 1e18) {  // Further digits cannot change the result
        frac = frac * 10.0 + (c - '0');
        scale *= 10.0;
      }
    }
  }
  double f = int_part + frac / scale;

  // Optional exponent
  if (c == 'e' || c == 'E') {
    c = fgetc(s);
    int expsign = 1;
    if (c == '-' || c == '+') {
      expsign = (c == '-') ? -1 : 1;
      c = fgetc(s);
    }
    int exponent = 0;
    for (; isdigit(c); c = fgetc(s)) {
      if (exponent < 10000)  // Already far outside the range of double
        exponent = exponent * 10 + (c - '0');
    }
    exponent *= expsign;
    if (f != 0.0)  // Avoid 0 * inf for huge exponents
      f *= pow(10.0, static_cast<double>(exponent));
  }
  ungetc(c, s);  // Leave the terminating character for the caller

  return negative ? -f : f;
}
197 
// Parses a decimal floating point number from the start of the C string s:
// optional leading whitespace, optional sign, integer digits, optional '.'
// with fraction digits, and an optional exponent introduced by 'e' or 'E'
// (with optional sign), mirroring streamtofloat. Parsing stops at the first
// character that does not fit; trailing text is ignored. Returns 0 when no
// digits are present.
//
// All accumulation is done in double so that long digit sequences cannot
// overflow an int. Fraction digits beyond the 18th and exponent digits beyond
// a magnitude of 10000 are skipped.
double strtofloat(const char* s) {
  while (*s && isspace(static_cast<unsigned char>(*s))) s++;

  // Single optional + or -
  bool negative = false;
  if (*s == '-' || *s == '+') {
    negative = (*s == '-');
    s++;
  }

  // Integer part
  double int_part = 0.0;
  for (; isdigit(static_cast<unsigned char>(*s)); s++)
    int_part = int_part * 10.0 + (*s - '0');

  // Fraction part, kept as numerator / scale.
  double frac = 0.0;
  double scale = 1.0;
  if (*s == '.') {
    for (++s; isdigit(static_cast<unsigned char>(*s)); s++) {
      if (scale < 1e18) {  // Further digits cannot change the result
        frac = frac * 10.0 + (*s - '0');
        scale *= 10.0;
      }
    }
  }
  double f = int_part + frac / scale;

  // Optional exponent
  if (*s == 'e' || *s == 'E') {
    s++;
    int expsign = 1;
    if (*s == '-' || *s == '+') {
      expsign = (*s == '-') ? -1 : 1;
      s++;
    }
    int exponent = 0;
    for (; isdigit(static_cast<unsigned char>(*s)); s++) {
      if (exponent < 10000)  // Already far outside the range of double
        exponent = exponent * 10 + (*s - '0');
    }
    exponent *= expsign;
    if (f != 0.0)  // Avoid 0 * inf for huge exponents
      f *= pow(10.0, static_cast<double>(exponent));
  }

  return negative ? -f : f;
}
230 
// Defined further down; declared here so the variadic wrappers can call it.
static int tvfscanf(FILE* stream, const char *format, va_list ap);

// Variadic front end for tvfscanf: packs the trailing arguments into a
// va_list, forwards them together with stream and format, and returns
// tvfscanf's result unchanged.
int tfscanf(FILE* stream, const char *format, ...) {
  va_list args;
  va_start(args, format);
  const int result = tvfscanf(stream, format, args);
  va_end(args);
  return result;
}
243 
244 #ifdef EMBEDDED
245 
246 int fscanf(FILE* stream, const char *format, ...) {
247  va_list ap;
248  int rv;
249 
250  va_start(ap, format);
251  rv = tvfscanf(stream, format, ap);
252  va_end(ap);
253 
254  return rv;
255 }
256 
257 int vfscanf(FILE* stream, const char *format, ...) {
258  va_list ap;
259  int rv;
260 
261  va_start(ap, format);
262  rv = tvfscanf(stream, format, ap);
263  va_end(ap);
264 
265  return rv;
266 }
267 #endif
268 
269 static int tvfscanf(FILE* stream, const char *format, va_list ap) {
270  const char *p = format;
271  char ch;
272  int q = 0;
273  uintmax_t val = 0;
274  int rank = RANK_INT; // Default rank
275  unsigned int width = UINT_MAX;
276  int base;
277  int flags = 0;
278  enum {
279  ST_NORMAL, // Ground state
280  ST_FLAGS, // Special flags
281  ST_WIDTH, // Field width
282  ST_MODIFIERS, // Length or conversion modifiers
283  ST_MATCH_INIT, // Initial state of %[ sequence
284  ST_MATCH, // Main state of %[ sequence
285  ST_MATCH_RANGE, // After - in a %[ sequence
286  } state = ST_NORMAL;
287  char *sarg = NULL; // %s %c or %[ string argument
288  enum Bail bail = BAIL_NONE;
289  int sign;
290  int converted = 0; // Successful conversions
291  unsigned long matchmap[((1 << CHAR_BIT)+(CHAR_BIT * sizeof(long) - 1)) /
292  (CHAR_BIT * sizeof(long))];
293  int matchinv = 0; // Is match map inverted?
294  unsigned char range_start = 0;
295  off_t start_off = ftell(stream);
296 
297  // Skip leading spaces
298  SkipSpace(stream);
299 
300  while ((ch = *p++) && !bail) {
301  switch (state) {
302  case ST_NORMAL:
303  if (ch == '%') {
304  state = ST_FLAGS;
305  flags = 0; rank = RANK_INT; width = UINT_MAX;
306  } else if (isspace(static_cast<unsigned char>(ch))) {
307  SkipSpace(stream);
308  } else {
309  if (fgetc(stream) != ch)
310  bail = BAIL_ERR; // Match failure
311  }
312  break;
313 
314  case ST_FLAGS:
315  if (ch == '*') {
316  flags |= FL_SPLAT;
317  } else if ('0' <= ch && ch <= '9') {
318  width = (ch-'0');
319  state = ST_WIDTH;
320  flags |= FL_WIDTH;
321  } else {
322  state = ST_MODIFIERS;
323  p--; // Process this character again
324  }
325  break;
326 
327  case ST_WIDTH:
328  if (ch >= '0' && ch <= '9') {
329  width = width*10+(ch-'0');
330  } else {
331  state = ST_MODIFIERS;
332  p--; // Process this character again
333  }
334  break;
335 
336  case ST_MODIFIERS:
337  switch (ch) {
338  // Length modifiers - nonterminal sequences
339  case 'h':
340  rank--; // Shorter rank
341  break;
342  case 'l':
343  rank++; // Longer rank
344  break;
345  case 'j':
346  rank = kIntMaxRank;
347  break;
348  case 'z':
349  rank = kSizeTRank;
350  break;
351  case 't':
352  rank = kPtrDiffRank;
353  break;
354  case 'L':
355  case 'q':
356  rank = RANK_LONGLONG; // long double/long long
357  break;
358 
359  default:
360  // Output modifiers - terminal sequences
361  state = ST_NORMAL; // Next state will be normal
362  if (rank < kMinRank) // Canonicalize rank
363  rank = kMinRank;
364  else if (rank > kMaxRank)
365  rank = kMaxRank;
366 
367  switch (ch) {
368  case 'P': // Upper case pointer
369  case 'p': // Pointer
370  rank = RANK_PTR;
371  base = 0; sign = 0;
372  goto scan_int;
373 
374  case 'i': // Base-independent integer
375  base = 0; sign = 1;
376  goto scan_int;
377 
378  case 'd': // Decimal integer
379  base = 10; sign = 1;
380  goto scan_int;
381 
382  case 'o': // Octal integer
383  base = 8; sign = 0;
384  goto scan_int;
385 
386  case 'u': // Unsigned decimal integer
387  base = 10; sign = 0;
388  goto scan_int;
389 
390  case 'x': // Hexadecimal integer
391  case 'X':
392  base = 16; sign = 0;
393  goto scan_int;
394 
395  case 'n': // Number of characters consumed
396  val = ftell(stream) - start_off;
397  goto set_integer;
398 
399  scan_int:
400  q = SkipSpace(stream);
401  if ( q <= 0 ) {
402  bail = BAIL_EOF;
403  break;
404  }
405  val = streamtoumax(stream, base);
406  // fall through
407 
408  set_integer:
409  if (!(flags & FL_SPLAT)) {
410  converted++;
411  switch(rank) {
412  case RANK_CHAR:
413  *va_arg(ap, unsigned char *)
414  = static_cast<unsigned char>(val);
415  break;
416  case RANK_SHORT:
417  *va_arg(ap, unsigned short *)
418  = static_cast<unsigned short>(val);
419  break;
420  case RANK_INT:
421  *va_arg(ap, unsigned int *)
422  = static_cast<unsigned int>(val);
423  break;
424  case RANK_LONG:
425  *va_arg(ap, unsigned long *)
426  = static_cast<unsigned long>(val);
427  break;
428  case RANK_LONGLONG:
429  *va_arg(ap, unsigned long long *)
430  = static_cast<unsigned long long>(val);
431  break;
432  case RANK_PTR:
433  *va_arg(ap, void **)
434  = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
435  break;
436  }
437  }
438  break;
439 
440  case 'f': // Preliminary float value parsing
441  case 'g':
442  case 'G':
443  case 'e':
444  case 'E':
445  q = SkipSpace(stream);
446  if (q <= 0) {
447  bail = BAIL_EOF;
448  break;
449  }
450 
451  {
452  double fval = streamtofloat(stream);
453  if (!(flags & FL_SPLAT)) {
454  if (rank == RANK_INT)
455  *va_arg(ap, float *) = static_cast<float>(fval);
456  else if (rank == RANK_LONG)
457  *va_arg(ap, double *) = static_cast<double>(fval);
458  converted++;
459  }
460  }
461  break;
462 
463  case 'c': // Character
464  width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
465  sarg = va_arg(ap, char *);
466  while (width--) {
467  if ((q = fgetc(stream)) <= 0) {
468  bail = BAIL_EOF;
469  break;
470  }
471  if (!(flags & FL_SPLAT)) {
472  *sarg++ = q;
473  converted++;
474  }
475  }
476  break;
477 
478  case 's': // String
479  {
480  char *sp;
481  sp = sarg = va_arg(ap, char *);
482  while (width--) {
483  q = fgetc(stream);
484  if (isspace(static_cast<unsigned char>(q)) || q <= 0) {
485  ungetc(q, stream);
486  break;
487  }
488  if (!(flags & FL_SPLAT)) *sp = q;
489  sp++;
490  }
491  if (sarg == sp) {
492  bail = BAIL_EOF;
493  } else if (!(flags & FL_SPLAT)) {
494  *sp = '\0'; // Terminate output
495  converted++;
496  } else {
497  }
498  }
499  break;
500 
501  case '[': // Character range
502  sarg = va_arg(ap, char *);
503  state = ST_MATCH_INIT;
504  matchinv = 0;
505  memset(matchmap, 0, sizeof matchmap);
506  break;
507 
508  case '%': // %% sequence
509  if (fgetc(stream) != '%' )
510  bail = BAIL_ERR;
511  break;
512 
513  default: // Anything else
514  bail = BAIL_ERR; // Unknown sequence
515  break;
516  }
517  }
518  break;
519 
520  case ST_MATCH_INIT: // Initial state for %[ match
521  if (ch == '^' && !(flags & FL_INV)) {
522  matchinv = 1;
523  } else {
524  SetBit(matchmap, static_cast<unsigned char>(ch));
525  state = ST_MATCH;
526  }
527  break;
528 
529  case ST_MATCH: // Main state for %[ match
530  if (ch == ']') {
531  goto match_run;
532  } else if (ch == '-') {
533  range_start = static_cast<unsigned char>(ch);
534  state = ST_MATCH_RANGE;
535  } else {
536  SetBit(matchmap, static_cast<unsigned char>(ch));
537  }
538  break;
539 
540  case ST_MATCH_RANGE: // %[ match after -
541  if (ch == ']') {
542  SetBit(matchmap, static_cast<unsigned char>('-'));
543  goto match_run;
544  } else {
545  int i;
546  for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
547  SetBit(matchmap, i);
548  state = ST_MATCH;
549  }
550  break;
551 
552  match_run: // Match expression finished
553  char* oarg = sarg;
554  while (width) {
555  q = fgetc(stream);
556  unsigned char qc = static_cast<unsigned char>(q);
557  if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
558  ungetc(q, stream);
559  break;
560  }
561  if (!(flags & FL_SPLAT)) *sarg = q;
562  sarg++;
563  }
564  if (oarg == sarg) {
565  bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
566  } else if (!(flags & FL_SPLAT)) {
567  *sarg = '\0';
568  converted++;
569  }
570  break;
571  }
572  }
573 
574  if (bail == BAIL_EOF && !converted)
575  converted = -1; // Return EOF (-1)
576 
577  return converted;
578 }
579 
580 #ifdef EMBEDDED
// creat for EMBEDDED builds, expressed through open(): the file is created
// if needed, truncated, and opened write-only. Returns open()'s result.
int creat(const char *pathname, mode_t mode) {
  const int open_flags = O_CREAT | O_TRUNC | O_WRONLY;
  return open(pathname, open_flags, mode);
}
584 
585 #endif // EMBEDDED
size_t LongBit()
Definition: scanutils.cpp:75
double streamtofloat(FILE *s)
Definition: scanutils.cpp:151
Flags
Definition: scanutils.cpp:45
#define tprintf(...)
Definition: tprintf.h:31
double strtofloat(const char *s)
Definition: scanutils.cpp:198
enum Ranks kMinRank
Definition: scanutils.cpp:61
uintmax_t streamtoumax(FILE *s, int base)
Definition: scanutils.cpp:110
enum Ranks kMaxRank
Definition: scanutils.cpp:62
long off_t
Definition: scanutils.cpp:42
enum Ranks kPtrDiffRank
Definition: scanutils.cpp:66
CMD_EVENTS mode
Definition: pgedit.cpp:116
Ranks
Definition: scanutils.cpp:52
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:233
enum Ranks kIntMaxRank
Definition: scanutils.cpp:64
enum Ranks kSizeTRank
Definition: scanutils.cpp:65
Bail
Definition: scanutils.cpp:68