• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1998-2003, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *
9 * File read.c
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   05/26/99    stephen     Creation.
15 *   5/10/01     Ram         removed ustdio dependency
16 *******************************************************************************
17 */
18 
19 #include "read.h"
20 #include "errmsg.h"
21 #include "unicode/ustring.h"
22 
/* Code points that the tokenizer in this file treats specially. */

/* Structural punctuation, each returned as its own token type. */
#define OPENBRACE    0x007B   /* '{' */
#define CLOSEBRACE   0x007D   /* '}' */
#define COMMA        0x002C   /* ',' */
#define COLON        0x003A   /* ':' */

/* String syntax. */
#define QUOTE        0x0022   /* '"'  delimits a quoted string segment */
#define ESCAPE       0x005C   /* '\\' introduces an escape sequence */
#define SPACE        0x0020   /* ' '  inserted between merged string segments */

/* Comment syntax. */
#define SLASH        0x002F   /* '/' */
#define ASTERISK     0x002A   /* '*' */

/* Line terminators tracked inside quoted strings. */
#define CR           0x000D
#define LF           0x000A

/* U+FFFE: getNextToken() returns TOK_ERROR when a token starts with it. */
#define BADBOM       0xFFFE
35 
/* Line counter shared by the routines in this file: resetLineNumber() sets it
   to 1, and isWhitespace(), isNewline() and getStringToken() increment it as
   line terminators are consumed. */
static int32_t lineCount = 0;
37 
38 /* Protos */
39 static enum ETokenType getStringToken(UCHARBUF *buf,
40                                       UChar32 initialChar,
41                                       struct UString *token,
42                                       UErrorCode *status);
43 
44 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
45 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
46 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
47 static UBool   isWhitespace          (UChar32 c);
48 static UBool   isNewline             (UChar32 c);
49 
resetLineNumber()50 void resetLineNumber() {
51     lineCount = 1;
52 }
53 
54 /* Read and return the next token from the stream.  If the token is of
55    type eString, fill in the token parameter with the token.  If the
56    token is eError, then the status parameter will contain the
57    specific error.  This will be eItemNotFound at the end of file,
58    indicating that all tokens have been returned.  This method will
59    never return eString twice in a row; instead, multiple adjacent
60    string tokens will be merged into one, with no intervening
61    space. */
getNextToken(UCHARBUF * buf,struct UString * token,uint32_t * linenumber,struct UString * comment,UErrorCode * status)62 enum ETokenType getNextToken(UCHARBUF* buf,
63                              struct UString *token,
64                              uint32_t *linenumber, /* out: linenumber of token */
65                              struct UString *comment,
66                              UErrorCode *status) {
67     enum ETokenType result;
68     UChar32         c;
69 
70     if (U_FAILURE(*status)) {
71         return TOK_ERROR;
72     }
73 
74     /* Skip whitespace */
75     c = getNextChar(buf, TRUE, comment, status);
76 
77     if (U_FAILURE(*status)) {
78         return TOK_ERROR;
79     }
80 
81     *linenumber = lineCount;
82 
83     switch(c) {
84     case BADBOM:
85         return TOK_ERROR;
86     case OPENBRACE:
87         return TOK_OPEN_BRACE;
88     case CLOSEBRACE:
89         return TOK_CLOSE_BRACE;
90     case COMMA:
91         return TOK_COMMA;
92     case U_EOF:
93         return TOK_EOF;
94     case COLON:
95         return TOK_COLON;
96 
97     default:
98         result = getStringToken(buf, c, token, status);
99     }
100 
101     *linenumber = lineCount;
102     return result;
103 }
104 
105 /* Copy a string token into the given UnicodeString.  Upon entry, we
106    have already read the first character of the string token, which is
107    not a whitespace character (but may be a QUOTE or ESCAPE). This
108    function reads all subsequent characters that belong with this
109    string, and copy them into the token parameter. The other
110    important, and slightly convoluted purpose of this function is to
111    merge adjacent strings.  It looks forward a bit, and if the next
112    non comment, non whitespace item is a string, it reads it in as
113    well.  If two adjacent strings are quoted, they are merged without
114    intervening space.  Otherwise a single SPACE character is
115    inserted. */
getStringToken(UCHARBUF * buf,UChar32 initialChar,struct UString * token,UErrorCode * status)116 static enum ETokenType getStringToken(UCHARBUF* buf,
117                                       UChar32 initialChar,
118                                       struct UString *token,
119                                       UErrorCode *status) {
120     UBool    lastStringWasQuoted;
121     UChar32  c;
122     UChar    target[3] = { '\0' };
123     UChar    *pTarget   = target;
124     int      len=0;
125     UBool    isFollowingCharEscaped=FALSE;
126     UBool    isNLUnescaped = FALSE;
127     UChar32  prevC=0;
128 
129     /* We are guaranteed on entry that initialChar is not a whitespace
130        character. If we are at the EOF, or have some other problem, it
131        doesn't matter; we still want to validly return the initialChar
132        (if nothing else) as a string token. */
133 
134     if (U_FAILURE(*status)) {
135         return TOK_ERROR;
136     }
137 
138     /* setup */
139     lastStringWasQuoted = FALSE;
140     c = initialChar;
141     ustr_setlen(token, 0, status);
142 
143     if (U_FAILURE(*status)) {
144         return TOK_ERROR;
145     }
146 
147     for (;;) {
148         if (c == QUOTE) {
149             if (!lastStringWasQuoted && token->fLength > 0) {
150                 ustr_ucat(token, SPACE, status);
151 
152                 if (U_FAILURE(*status)) {
153                     return TOK_ERROR;
154                 }
155             }
156 
157             lastStringWasQuoted = TRUE;
158 
159             for (;;) {
160                 c = ucbuf_getc(buf,status);
161 
162                 /* EOF reached */
163                 if (c == U_EOF) {
164                     return TOK_EOF;
165                 }
166 
167                 /* Unterminated quoted strings */
168                 if (U_FAILURE(*status)) {
169                     return TOK_ERROR;
170                 }
171 
172                 if (c == QUOTE && !isFollowingCharEscaped) {
173                     break;
174                 }
175 
176                 if (c == ESCAPE  && !isFollowingCharEscaped) {
177                     pTarget = target;
178                     c       = unescape(buf, status);
179 
180                     if (c == U_ERR) {
181                         return TOK_ERROR;
182                     }
183                     if(c == CR || c == LF){
184                         isNLUnescaped = TRUE;
185                     }
186                 }
187 
188                 if(c==ESCAPE && !isFollowingCharEscaped){
189                     isFollowingCharEscaped = TRUE;
190                 }else{
191                     U_APPEND_CHAR32(c, pTarget,len);
192                     pTarget = target;
193                     ustr_uscat(token, pTarget,len, status);
194                     isFollowingCharEscaped = FALSE;
195                     len=0;
196                     if(c == CR || c == LF){
197                         if(isNLUnescaped == FALSE && prevC!=CR){
198                             lineCount++;
199                         }
200                         isNLUnescaped = FALSE;
201                     }
202                 }
203 
204                 if (U_FAILURE(*status)) {
205                     return TOK_ERROR;
206                 }
207                 prevC = c;
208             }
209         } else {
210             if (token->fLength > 0) {
211                 ustr_ucat(token, SPACE, status);
212 
213                 if (U_FAILURE(*status)) {
214                     return TOK_ERROR;
215                 }
216             }
217 
218             if(lastStringWasQuoted){
219                 if(getShowWarning()){
220                     warning(lineCount, "Mixing quoted and unquoted strings");
221                 }
222                 if(isStrict()){
223                     return TOK_ERROR;
224                 }
225 
226             }
227 
228             lastStringWasQuoted = FALSE;
229 
230             /* if we reach here we are mixing
231              * quoted and unquoted strings
232              * warn in normal mode and error in
233              * pedantic mode
234              */
235 
236             if (c == ESCAPE) {
237                 pTarget = target;
238                 c       = unescape(buf, status);
239 
240                 /* EOF reached */
241                 if (c == U_EOF) {
242                     return TOK_ERROR;
243                 }
244             }
245 
246             U_APPEND_CHAR32(c, pTarget,len);
247             pTarget = target;
248             ustr_uscat(token, pTarget,len, status);
249             len=0;
250 
251             if (U_FAILURE(*status)) {
252                 return TOK_ERROR;
253             }
254 
255             for (;;) {
256                 /* DON'T skip whitespace */
257                 c = getNextChar(buf, FALSE, NULL, status);
258 
259                 /* EOF reached */
260                 if (c == U_EOF) {
261                     ucbuf_ungetc(c, buf);
262                     return TOK_STRING;
263                 }
264 
265                 if (U_FAILURE(*status)) {
266                     return TOK_STRING;
267                 }
268 
269                 if (c == QUOTE
270                         || c == OPENBRACE
271                         || c == CLOSEBRACE
272                         || c == COMMA
273                         || c == COLON) {
274                     ucbuf_ungetc(c, buf);
275                     break;
276                 }
277 
278                 if (isWhitespace(c)) {
279                     break;
280                 }
281 
282                 if (c == ESCAPE) {
283                     pTarget = target;
284                     c       = unescape(buf, status);
285 
286                     if (c == U_ERR) {
287                         return TOK_ERROR;
288                     }
289                 }
290 
291                 U_APPEND_CHAR32(c, pTarget,len);
292                 pTarget = target;
293                 ustr_uscat(token, pTarget,len, status);
294                 len=0;
295                 if (U_FAILURE(*status)) {
296                     return TOK_ERROR;
297                 }
298             }
299         }
300 
301         /* DO skip whitespace */
302         c = getNextChar(buf, TRUE, NULL, status);
303 
304         if (U_FAILURE(*status)) {
305             return TOK_STRING;
306         }
307 
308         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
309             ucbuf_ungetc(c, buf);
310             return TOK_STRING;
311         }
312     }
313 }
314 
315 /* Retrieve the next character.  If skipwhite is
316    true, whitespace is skipped as well. */
getNextChar(UCHARBUF * buf,UBool skipwhite,struct UString * token,UErrorCode * status)317 static UChar32 getNextChar(UCHARBUF* buf,
318                            UBool skipwhite,
319                            struct UString *token,
320                            UErrorCode *status) {
321     UChar32 c, c2;
322 
323     if (U_FAILURE(*status)) {
324         return U_EOF;
325     }
326 
327     for (;;) {
328         c = ucbuf_getc(buf,status);
329 
330         if (c == U_EOF) {
331             return U_EOF;
332         }
333 
334         if (skipwhite && isWhitespace(c)) {
335             continue;
336         }
337 
338         /* This also handles the get() failing case */
339         if (c != SLASH) {
340             return c;
341         }
342 
343         c = ucbuf_getc(buf,status);
344 
345         if (c == U_EOF) {
346             return U_EOF;
347         }
348 
349         switch (c) {
350         case SLASH:
351             seekUntilNewline(buf, NULL, status);
352             break;
353 
354         case ASTERISK:
355             c2 = ucbuf_getc(buf, status);
356             if(c2== ASTERISK){
357                 /* parse multi-line comment and store it in token*/
358                 seekUntilEndOfComment(buf, token, status);
359             }else{
360                 ucbuf_ungetc(c, buf);
361                 seekUntilEndOfComment(buf, NULL, status);
362             }
363             break;
364 
365         default:
366             ucbuf_ungetc(c, buf);
367             /* If get() failed this is a NOP */
368             return SLASH;
369         }
370 
371     }
372 }
373 
/* Consume characters from buf up to and including the next line terminator
 * (as recognised by isNewline(), which also advances lineCount), or until
 * end of input or a failure status.
 *
 * buf    - input buffer.
 * token  - may be NULL; when non-NULL every character consumed, including
 *          the terminator, is appended to it.  The U_EOF sentinel is never
 *          appended.
 * status - in/out error code; nothing is read if it is already a failure.
 */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 c;

    if (U_FAILURE(*status)) {
        return;
    }

    do {
        c = ucbuf_getc(buf,status);
        /* add the char to token, but never the end-of-input sentinel */
        if(token!=NULL && c != U_EOF){
            ustr_u32cat(token, c, status);
        }
        /* Stop only on a real failure: a warning code in *status must not
           end the scan after a single character. */
    } while (!isNewline(c) && c != U_EOF && !U_FAILURE(*status));
}
391 
/* Consume a block comment whose opener has already been read, stopping
 * after the closing asterisk-slash pair.
 *
 * buf    - input buffer.
 * token  - may be NULL; when non-NULL the comment text is appended to it
 *          (the closing pair and the U_EOF sentinel are not appended).
 * status - in/out error code; nothing is read if it is already a failure.
 *          Set to U_INVALID_FORMAT_ERROR, and an error is reported against
 *          the line where the scan started, if input ends before the
 *          comment is closed.
 *
 * Line terminators inside the comment advance lineCount via isNewline().
 */
static void seekUntilEndOfComment(UCHARBUF *buf,
                                  struct UString *token,
                                  UErrorCode *status) {
    UChar32  c, d;
    uint32_t line;

    if (U_FAILURE(*status)) {
        return;
    }

    /* Remember where the comment started for the error message. */
    line = lineCount;

    do {
        c = ucbuf_getc(buf, status);

        if (c == ASTERISK) {
            d = ucbuf_getc(buf, status);

            if (d != SLASH) {
                /* Not the end of the comment: put the look-ahead back. */
                ucbuf_ungetc(d, buf);
            } else {
                break;
            }
        }
        /* add the char to token, but never the end-of-input sentinel */
        if(token!=NULL && c != U_EOF){
            ustr_u32cat(token, c, status);
        }
        /* increment the lineCount */
        isNewline(c);

        /* Stop only on a real failure: a warning code in *status must not
           end the scan after a single character. */
    } while (c != U_EOF && !U_FAILURE(*status));

    if (c == U_EOF) {
        *status = U_INVALID_FORMAT_ERROR;
        error(line, "unterminated comment detected");
    }
}
430 
unescape(UCHARBUF * buf,UErrorCode * status)431 UChar32 unescape(UCHARBUF *buf,
432                  UErrorCode *status) {
433     if (U_FAILURE(*status)) {
434         return U_EOF;
435     }
436 
437     /* We expect to be called after the ESCAPE has been seen, but
438      * u_fgetcx needs an ESCAPE to do its magic. */
439     ucbuf_ungetc(ESCAPE, buf);
440 
441     return ucbuf_getcx32(buf, status);
442 }
443 
isWhitespace(UChar32 c)444 static UBool isWhitespace(UChar32 c) {
445     switch (c) {
446         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
447     case 0x000A:
448     case 0x2029:
449         lineCount++;
450     case 0x000D:
451     case 0x0020:
452     case 0x0009:
453     case 0xFEFF:
454         return TRUE;
455 
456     default:
457         return FALSE;
458     }
459 }
460 
isNewline(UChar32 c)461 static UBool isNewline(UChar32 c) {
462     switch (c) {
463         /* '\n', '\r', 0x2029 */
464     case 0x000A:
465     case 0x2029:
466         lineCount++;
467     case 0x000D:
468         return TRUE;
469 
470     default:
471         return FALSE;
472     }
473 }
474