1 // Copyright 2006 Google Inc.
2 // All Rights Reserved.
3 // Author: renn
4 //
5 // The fscanf, vfscanf and creat functions are implemented so that their
6 // functionality is mostly like their stdio counterparts. However, currently
7 // these functions do not use any buffering, making them rather slow.
8 // File streams are thus processed one character at a time.
9 // Although the implementations of the scanf functions do lack a few minor
10 // features, they should be sufficient for their use in tesseract.
11 //
12 // Licensed under the Apache License, Version 2.0 (the "License");
13 // you may not use this file except in compliance with the License.
14 // You may obtain a copy of the License at
15 // http://www.apache.org/licenses/LICENSE-2.0
16 // Unless required by applicable law or agreed to in writing, software
17 // distributed under the License is distributed on an "AS IS" BASIS,
18 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 // See the License for the specific language governing permissions and
20 // limitations under the License.
21
22 #include <ctype.h>
23 #include <stdarg.h>
24 #include <stddef.h>
25 #include <inttypes.h>
26 #include <string.h>
27 #include <limits.h>
28 #include <stdio.h>
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 #include <fcntl.h>
32
33 #include "scanutils.h"
34 #include "tprintf.h"
35
// Bit flags recorded while a single conversion specification is parsed.
// Each enumerator occupies its own bit so they can be OR-ed into one int.
enum Flags {
  FL_SPLAT = 1 << 0,  // '*' seen: drop the value, do not assign it
  FL_INV   = 1 << 1,  // Character set is inverted
  FL_WIDTH = 1 << 2,  // An explicit field width was given
  FL_MINUS = 1 << 3   // Number is negative
};
42
// Size "rank" of an integer conversion. RANK_INT is the default; every 'h'
// length modifier moves one step down and every 'l' one step up.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT,          // -1
  RANK_INT,            //  0
  RANK_LONG,           //  1
  RANK_LONGLONG,       //  2
  RANK_PTR = INT_MAX   // Special value used for pointers
};

// Bounds used to clamp a rank produced by repeated 'h'/'l' modifiers.
const Ranks kMinRank = RANK_CHAR;
const Ranks kMaxRank = RANK_LONGLONG;

// Ranks selected by the 'j' (intmax_t), 'z' (size_t) and 't' (ptrdiff_t)
// length modifiers.
const Ranks kIntMaxRank = RANK_LONGLONG;
const Ranks kSizeTRank = RANK_LONG;
const Ranks kPtrDiffRank = RANK_LONG;
58
// Reason why format processing stopped early.
enum Bail {
  BAIL_NONE = 0,  // No error condition
  BAIL_EOF  = 1,  // Hit EOF
  BAIL_ERR  = 2   // Conversion mismatch
};
64
65 // Helper functions ------------------------------------------------------------
// Returns the number of bits in a long (the word size of the match bitmap).
inline size_t LongBit() {
  const size_t bytes_per_long = sizeof(long);
  return bytes_per_long * CHAR_BIT;
}
69
// Consumes whitespace from s and returns the first non-whitespace character
// (or EOF). That character is pushed back with ungetc() so the next read
// sees it again.
static inline int SkipSpace(FILE *s) {
  int next = fgetc(s);
  while (isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s);  // Make sure next char is available for reading
  return next;
}
78
79 static inline void
SetBit(unsigned long * bitmap,unsigned int bit)80 SetBit(unsigned long *bitmap, unsigned int bit)
81 {
82 bitmap[bit/LongBit()] |= 1UL << (bit%LongBit());
83 }
84
85 static inline int
TestBit(unsigned long * bitmap,unsigned int bit)86 TestBit(unsigned long *bitmap, unsigned int bit)
87 {
88 return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1;
89 }
90
// Maps a character to its numeric digit value: '0'-'9' give 0-9, letters of
// either case give 10-35. Returns -1 for any other character.
static inline int DigitValue(int ch) {
  if ('0' <= ch && ch <= '9')
    return ch - '0';
  if ('A' <= ch && ch <= 'Z')
    return ch - 'A' + 10;
  if ('a' <= ch && ch <= 'z')
    return ch - 'a' + 10;
  return -1;
}
103
104 // IO (re-)implementations -----------------------------------------------------
streamtoumax(FILE * s,int base)105 uintmax_t streamtoumax(FILE* s, int base)
106 {
107 int minus = 0;
108 uintmax_t v = 0;
109 int d, c = 0;
110
111 for (c = fgetc(s);
112 isspace(static_cast<unsigned char>(c)) && (c != EOF);
113 c = fgetc(s))
114
115 // Single optional + or -
116 if (c == '-' || c == '+') {
117 minus = (c == '-');
118 c = fgetc(s);
119 }
120
121 // Assign correct base
122 if (base == 0) {
123 if (c == '0') {
124 c = fgetc(s);
125 if (c == 'x' || c == 'X') {
126 base = 16;
127 c = fgetc(s);
128 } else {
129 base = 8;
130 }
131 }
132 } else if (base == 16) {
133 if (c == '0') {
134 c = fgetc(s);
135 if (c == 'x' && c == 'X') c = fgetc(s);
136 }
137 }
138
139 // Actual number parsing
140 for (; (c != EOF) && (d = DigitValue(c)) >= 0 && d < base; c = fgetc(s))
141 v = v*base + d;
142
143 ungetc(c, s);
144 return minus ? -v : v;
145 }
146
streamtofloat(FILE * s)147 double streamtofloat(FILE* s)
148 {
149 int minus = 0;
150 int v = 0;
151 int d, c = 0;
152 int k = 1;
153 int w = 0;
154
155 for (c = fgetc(s);
156 isspace(static_cast<unsigned char>(c)) && (c != EOF);
157 c = fgetc(s));
158
159 // Single optional + or -
160 if (c == '-' || c == '+') {
161 minus = (c == '-');
162 c = fgetc(s);
163 }
164
165 // Actual number parsing
166 for (; (c != EOF) && (d = DigitValue(c)) >= 0; c = fgetc(s))
167 v = v*10 + d;
168 if (c == '.') {
169 for (c = fgetc(s); (c != EOF) && (d = DigitValue(c)) >= 0; c = fgetc(s)) {
170 w = w*10 + d;
171 k *= 10;
172 }
173 } else if (c == 'e' || c == 'E')
174 tprintf("WARNING: Scientific Notation not supported!");
175
176 ungetc(c, s);
177 double f = static_cast<double>(v)
178 + static_cast<double>(w) / static_cast<double>(k);
179
180 return minus ? -f : f;
181 }
182
strtofloat(const char * s)183 double strtofloat(const char* s)
184 {
185 int minus = 0;
186 int v = 0;
187 int d;
188 int k = 1;
189 int w = 0;
190
191 while(*s && isspace(static_cast<unsigned char>(*s))) s++;
192
193 // Single optional + or -
194 if (*s == '-' || *s == '+') {
195 minus = (*s == '-');
196 s++;
197 }
198
199 // Actual number parsing
200 for (; *s && (d = DigitValue(*s)) >= 0; s++)
201 v = v*10 + d;
202 if (*s == '.') {
203 for (++s; *s && (d = DigitValue(*s)) >= 0; s++) {
204 w = w*10 + d;
205 k *= 10;
206 }
207 } else if (*s == 'e' || *s == 'E')
208 tprintf("WARNING: Scientific Notation not supported!");
209
210 double f = static_cast<double>(v)
211 + static_cast<double>(w) / static_cast<double>(k);
212
213 return minus ? -f : f;
214 }
215
// Variadic front end for vfscanf(): packs the trailing arguments into a
// va_list, forwards everything, and returns vfscanf()'s result unchanged.
int fscanf(FILE* stream, const char *format, ...)
{
  va_list args;
  va_start(args, format);
  const int items = vfscanf(stream, format, args);
  va_end(args);
  return items;
}
227
vfscanf(FILE * stream,const char * format,va_list ap)228 int vfscanf(FILE* stream, const char *format, va_list ap)
229 {
230 const char *p = format;
231 char ch;
232 int q = 0;
233 uintmax_t val = 0;
234 int rank = RANK_INT; // Default rank
235 unsigned int width = ~0;
236 int base;
237 int flags = 0;
238 enum {
239 ST_NORMAL, // Ground state
240 ST_FLAGS, // Special flags
241 ST_WIDTH, // Field width
242 ST_MODIFIERS, // Length or conversion modifiers
243 ST_MATCH_INIT, // Initial state of %[ sequence
244 ST_MATCH, // Main state of %[ sequence
245 ST_MATCH_RANGE, // After - in a %[ sequence
246 } state = ST_NORMAL;
247 char *sarg = NULL; // %s %c or %[ string argument
248 enum Bail bail = BAIL_NONE;
249 int sign;
250 int converted = 0; // Successful conversions
251 unsigned long matchmap[((1 << CHAR_BIT)+(LongBit()-1))/LongBit()];
252 int matchinv = 0; // Is match map inverted?
253 unsigned char range_start = 0;
254 off_t start_off = ftell(stream);
255
256 // Skip leading spaces
257 SkipSpace(stream);
258
259 while ((ch = *p++) && !bail) {
260 switch (state) {
261 case ST_NORMAL:
262 if (ch == '%') {
263 state = ST_FLAGS;
264 flags = 0; rank = RANK_INT; width = ~0;
265 } else if (isspace(static_cast<unsigned char>(ch))) {
266 SkipSpace(stream);
267 } else {
268 if (fgetc(stream) != ch)
269 bail = BAIL_ERR; // Match failure
270 }
271 break;
272
273 case ST_FLAGS:
274 switch (ch) {
275 case '*':
276 flags |= FL_SPLAT;
277 break;
278
279 case '0' ... '9':
280 width = (ch-'0');
281 state = ST_WIDTH;
282 flags |= FL_WIDTH;
283 break;
284
285 default:
286 state = ST_MODIFIERS;
287 p--; // Process this character again
288 break;
289 }
290 break;
291
292 case ST_WIDTH:
293 if (ch >= '0' && ch <= '9') {
294 width = width*10+(ch-'0');
295 } else {
296 state = ST_MODIFIERS;
297 p--; // Process this character again
298 }
299 break;
300
301 case ST_MODIFIERS:
302 switch (ch) {
303 // Length modifiers - nonterminal sequences
304 case 'h':
305 rank--; // Shorter rank
306 break;
307 case 'l':
308 rank++; // Longer rank
309 break;
310 case 'j':
311 rank = kIntMaxRank;
312 break;
313 case 'z':
314 rank = kSizeTRank;
315 break;
316 case 't':
317 rank = kPtrDiffRank;
318 break;
319 case 'L':
320 case 'q':
321 rank = RANK_LONGLONG; // long double/long long
322 break;
323
324 default:
325 // Output modifiers - terminal sequences
326 state = ST_NORMAL; // Next state will be normal
327 if (rank < kMinRank) // Canonicalize rank
328 rank = kMinRank;
329 else if (rank > kMaxRank)
330 rank = kMaxRank;
331
332 switch (ch) {
333 case 'P': // Upper case pointer
334 case 'p': // Pointer
335 rank = RANK_PTR;
336 base = 0; sign = 0;
337 goto scan_int;
338
339 case 'i': // Base-independent integer
340 base = 0; sign = 1;
341 goto scan_int;
342
343 case 'd': // Decimal integer
344 base = 10; sign = 1;
345 goto scan_int;
346
347 case 'o': // Octal integer
348 base = 8; sign = 0;
349 goto scan_int;
350
351 case 'u': // Unsigned decimal integer
352 base = 10; sign = 0;
353 goto scan_int;
354
355 case 'x': // Hexadecimal integer
356 case 'X':
357 base = 16; sign = 0;
358 goto scan_int;
359
360 case 'n': // Number of characters consumed
361 val = ftell(stream) - start_off;
362 goto set_integer;
363
364 scan_int:
365 q = SkipSpace(stream);
366 if ( q <= 0 ) {
367 bail = BAIL_EOF;
368 break;
369 }
370 val = streamtoumax(stream, base);
371 converted++;
372 // fall through
373
374 set_integer:
375 if (!(flags & FL_SPLAT)) {
376 switch(rank) {
377 case RANK_CHAR:
378 *va_arg(ap, unsigned char *)
379 = static_cast<unsigned char>(val);
380 break;
381 case RANK_SHORT:
382 *va_arg(ap, unsigned short *)
383 = static_cast<unsigned short>(val);
384 break;
385 case RANK_INT:
386 *va_arg(ap, unsigned int *)
387 = static_cast<unsigned int>(val);
388 break;
389 case RANK_LONG:
390 *va_arg(ap, unsigned long *)
391 = static_cast<unsigned long>(val);
392 break;
393 case RANK_LONGLONG:
394 *va_arg(ap, unsigned long long *)
395 = static_cast<unsigned long long>(val);
396 break;
397 case RANK_PTR:
398 *va_arg(ap, void **)
399 = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
400 break;
401 }
402 }
403 break;
404
405 case 'f': // Preliminary float value parsing
406 case 'g':
407 case 'G':
408 case 'e':
409 case 'E':
410 q = SkipSpace(stream);
411 if (q <= 0) {
412 bail = BAIL_EOF;
413 break;
414 }
415
416 {
417 double fval = streamtofloat(stream);
418 switch(rank) {
419 case RANK_INT:
420 *va_arg(ap, float *) = static_cast<float>(fval);
421 break;
422 case RANK_LONG:
423 *va_arg(ap, double *) = static_cast<double>(fval);
424 break;
425 }
426 converted++;
427 }
428 break;
429
430 case 'c': // Character
431 width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
432 sarg = va_arg(ap, char *);
433 while (width--) {
434 if ((q = fgetc(stream)) <= 0) {
435 bail = BAIL_EOF;
436 break;
437 }
438 *sarg++ = q;
439 }
440 if (!bail)
441 converted++;
442 break;
443
444 case 's': // String
445 {
446 char *sp;
447 sp = sarg = va_arg(ap, char *);
448 while (width--) {
449 q = fgetc(stream);
450 if (isspace(static_cast<unsigned char>(q)) || q <= 0) {
451 ungetc(q, stream);
452 break;
453 }
454 *sp++ = q;
455 }
456 if (sarg != sp) {
457 *sp = '\0'; // Terminate output
458 converted++;
459 } else {
460 bail = BAIL_EOF;
461 }
462 }
463 break;
464
465 case '[': // Character range
466 sarg = va_arg(ap, char *);
467 state = ST_MATCH_INIT;
468 matchinv = 0;
469 memset(matchmap, 0, sizeof matchmap);
470 break;
471
472 case '%': // %% sequence
473 if (fgetc(stream) != '%' )
474 bail = BAIL_ERR;
475 break;
476
477 default: // Anything else
478 bail = BAIL_ERR; // Unknown sequence
479 break;
480 }
481 }
482 break;
483
484 case ST_MATCH_INIT: // Initial state for %[ match
485 if (ch == '^' && !(flags & FL_INV)) {
486 matchinv = 1;
487 } else {
488 SetBit(matchmap, static_cast<unsigned char>(ch));
489 state = ST_MATCH;
490 }
491 break;
492
493 case ST_MATCH: // Main state for %[ match
494 if (ch == ']') {
495 goto match_run;
496 } else if (ch == '-') {
497 range_start = static_cast<unsigned char>(ch);
498 state = ST_MATCH_RANGE;
499 } else {
500 SetBit(matchmap, static_cast<unsigned char>(ch));
501 }
502 break;
503
504 case ST_MATCH_RANGE: // %[ match after -
505 if (ch == ']') {
506 SetBit(matchmap, static_cast<unsigned char>('-'));
507 goto match_run;
508 } else {
509 int i;
510 for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
511 SetBit(matchmap, i);
512 state = ST_MATCH;
513 }
514 break;
515
516 match_run: // Match expression finished
517 char* oarg = sarg;
518 while (width) {
519 q = fgetc(stream);
520 unsigned char qc = static_cast<unsigned char>(q);
521 if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
522 ungetc(q, stream);
523 break;
524 }
525 *sarg++ = q;
526 }
527 if (oarg != sarg) {
528 *sarg = '\0';
529 converted++;
530 } else {
531 bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
532 }
533 break;
534 }
535 }
536
537 if (bail == BAIL_EOF && !converted)
538 converted = -1; // Return EOF (-1)
539
540 return converted;
541 }
542
// creat() expressed through open(): create the file if it does not exist,
// truncate it if it does, and open it for writing only.
int creat(const char *pathname, mode_t mode)
{
  const int open_flags = O_WRONLY | O_CREAT | O_TRUNC;
  return open(pathname, open_flags, mode);
}
547