• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1998-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File read.c
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   05/26/99    stephen     Creation.
17 *   5/10/01     Ram         removed ustdio dependency
18 *******************************************************************************
19 */
20 
21 #include <stdbool.h>
22 
23 #include "read.h"
24 #include "errmsg.h"
25 #include "toolutil.h"
26 #include "unicode/ustring.h"
27 #include "unicode/utf16.h"
28 
/* Code point values of the characters the tokenizer treats specially.
   They are spelled as numeric values rather than character literals. */
#define OPENBRACE    0x007B  /* left curly bracket: returned as TOK_OPEN_BRACE */
#define CLOSEBRACE   0x007D  /* right curly bracket: returned as TOK_CLOSE_BRACE */
#define COMMA        0x002C  /* comma: returned as TOK_COMMA */
#define QUOTE        0x0022  /* double quote: delimits a quoted string */
#define ESCAPE       0x005C  /* backslash: introduces an escape sequence */
#define SLASH        0x002F  /* slash: may start a comment */
#define ASTERISK     0x002A  /* asterisk: second character of a block comment opener */
#define SPACE        0x0020  /* space: inserted between merged string pieces */
#define COLON        0x003A  /* colon: returned as TOK_COLON */
#define BADBOM       0xFFFE  /* getNextToken() returns TOK_ERROR when it sees this */
#define CR           0x000D  /* carriage return */
#define LF           0x000A  /* line feed */
41 
42 static int32_t lineCount;
43 
44 /* Protos */
45 static enum ETokenType getStringToken(UCHARBUF *buf,
46                                       UChar32 initialChar,
47                                       struct UString *token,
48                                       UErrorCode *status);
49 
50 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
51 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
52 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
53 static UBool   isWhitespace          (UChar32 c);
54 static UBool   isNewline             (UChar32 c);
55 
resetLineNumber()56 U_CFUNC void resetLineNumber() {
57     lineCount = 1;
58 }
59 
60 /* Read and return the next token from the stream.  If the token is of
61    type eString, fill in the token parameter with the token.  If the
62    token is eError, then the status parameter will contain the
63    specific error.  This will be eItemNotFound at the end of file,
64    indicating that all tokens have been returned.  This method will
65    never return eString twice in a row; instead, multiple adjacent
66    string tokens will be merged into one, with no intervening
67    space. */
68 U_CFUNC enum ETokenType
getNextToken(UCHARBUF * buf,struct UString * token,uint32_t * linenumber,struct UString * comment,UErrorCode * status)69 getNextToken(UCHARBUF* buf,
70              struct UString *token,
71              uint32_t *linenumber, /* out: linenumber of token */
72              struct UString *comment,
73              UErrorCode *status) {
74     enum ETokenType result;
75     UChar32         c;
76 
77     if (U_FAILURE(*status)) {
78         return TOK_ERROR;
79     }
80 
81     /* Skip whitespace */
82     c = getNextChar(buf, true, comment, status);
83 
84     if (U_FAILURE(*status)) {
85         return TOK_ERROR;
86     }
87 
88     *linenumber = lineCount;
89 
90     switch(c) {
91     case BADBOM:
92         return TOK_ERROR;
93     case OPENBRACE:
94         return TOK_OPEN_BRACE;
95     case CLOSEBRACE:
96         return TOK_CLOSE_BRACE;
97     case COMMA:
98         return TOK_COMMA;
99     case U_EOF:
100         return TOK_EOF;
101     case COLON:
102         return TOK_COLON;
103 
104     default:
105         result = getStringToken(buf, c, token, status);
106     }
107 
108     *linenumber = lineCount;
109     return result;
110 }
111 
112 /* Copy a string token into the given UnicodeString.  Upon entry, we
113    have already read the first character of the string token, which is
114    not a whitespace character (but may be a QUOTE or ESCAPE). This
115    function reads all subsequent characters that belong with this
116    string, and copy them into the token parameter. The other
117    important, and slightly convoluted purpose of this function is to
118    merge adjacent strings.  It looks forward a bit, and if the next
119    non comment, non whitespace item is a string, it reads it in as
120    well.  If two adjacent strings are quoted, they are merged without
121    intervening space.  Otherwise a single SPACE character is
122    inserted. */
getStringToken(UCHARBUF * buf,UChar32 initialChar,struct UString * token,UErrorCode * status)123 static enum ETokenType getStringToken(UCHARBUF* buf,
124                                       UChar32 initialChar,
125                                       struct UString *token,
126                                       UErrorCode *status) {
127     UBool    lastStringWasQuoted;
128     UChar32  c;
129     UChar    target[3] = { '\0' };
130     UChar    *pTarget   = target;
131     int      len=0;
132     UBool    isFollowingCharEscaped=false;
133     UBool    isNLUnescaped = false;
134     UChar32  prevC=0;
135 
136     /* We are guaranteed on entry that initialChar is not a whitespace
137        character. If we are at the EOF, or have some other problem, it
138        doesn't matter; we still want to validly return the initialChar
139        (if nothing else) as a string token. */
140 
141     if (U_FAILURE(*status)) {
142         return TOK_ERROR;
143     }
144 
145     /* setup */
146     lastStringWasQuoted = false;
147     c = initialChar;
148     ustr_setlen(token, 0, status);
149 
150     if (U_FAILURE(*status)) {
151         return TOK_ERROR;
152     }
153 
154     for (;;) {
155         if (c == QUOTE) {
156             if (!lastStringWasQuoted && token->fLength > 0) {
157                 ustr_ucat(token, SPACE, status);
158 
159                 if (U_FAILURE(*status)) {
160                     return TOK_ERROR;
161                 }
162             }
163 
164             lastStringWasQuoted = true;
165 
166             for (;;) {
167                 c = ucbuf_getc(buf,status);
168 
169                 /* EOF reached */
170                 if (c == U_EOF) {
171                     return TOK_EOF;
172                 }
173 
174                 /* Unterminated quoted strings */
175                 if (U_FAILURE(*status)) {
176                     return TOK_ERROR;
177                 }
178 
179                 if (c == QUOTE && !isFollowingCharEscaped) {
180                     break;
181                 }
182 
183                 if (c == ESCAPE  && !isFollowingCharEscaped) {
184                     pTarget = target;
185                     c       = unescape(buf, status);
186 
187                     if (c == U_ERR) {
188                         return TOK_ERROR;
189                     }
190                     if(c == CR || c == LF){
191                         isNLUnescaped = true;
192                     }
193                 }
194 
195                 if(c==ESCAPE && !isFollowingCharEscaped){
196                     isFollowingCharEscaped = true;
197                 }else{
198                     U_APPEND_CHAR32(c, pTarget,len);
199                     pTarget = target;
200                     ustr_uscat(token, pTarget,len, status);
201                     isFollowingCharEscaped = false;
202                     len=0;
203                     if(c == CR || c == LF){
204                         if(isNLUnescaped == false && prevC!=CR){
205                             lineCount++;
206                         }
207                         isNLUnescaped = false;
208                     }
209                 }
210 
211                 if (U_FAILURE(*status)) {
212                     return TOK_ERROR;
213                 }
214                 prevC = c;
215             }
216         } else {
217             if (token->fLength > 0) {
218                 ustr_ucat(token, SPACE, status);
219 
220                 if (U_FAILURE(*status)) {
221                     return TOK_ERROR;
222                 }
223             }
224 
225             if(lastStringWasQuoted){
226                 if(getShowWarning()){
227                     warning(lineCount, "Mixing quoted and unquoted strings");
228                 }
229                 if(isStrict()){
230                     return TOK_ERROR;
231                 }
232 
233             }
234 
235             lastStringWasQuoted = false;
236 
237             /* if we reach here we are mixing
238              * quoted and unquoted strings
239              * warn in normal mode and error in
240              * pedantic mode
241              */
242 
243             if (c == ESCAPE) {
244                 pTarget = target;
245                 c       = unescape(buf, status);
246 
247                 /* EOF reached */
248                 if (c == U_EOF) {
249                     return TOK_ERROR;
250                 }
251             }
252 
253             U_APPEND_CHAR32(c, pTarget,len);
254             pTarget = target;
255             ustr_uscat(token, pTarget,len, status);
256             len=0;
257 
258             if (U_FAILURE(*status)) {
259                 return TOK_ERROR;
260             }
261 
262             for (;;) {
263                 /* DON'T skip whitespace */
264                 c = getNextChar(buf, false, NULL, status);
265 
266                 /* EOF reached */
267                 if (c == U_EOF) {
268                     ucbuf_ungetc(c, buf);
269                     return TOK_STRING;
270                 }
271 
272                 if (U_FAILURE(*status)) {
273                     return TOK_STRING;
274                 }
275 
276                 if (c == QUOTE
277                         || c == OPENBRACE
278                         || c == CLOSEBRACE
279                         || c == COMMA
280                         || c == COLON) {
281                     ucbuf_ungetc(c, buf);
282                     break;
283                 }
284 
285                 if (isWhitespace(c)) {
286                     break;
287                 }
288 
289                 if (c == ESCAPE) {
290                     pTarget = target;
291                     c       = unescape(buf, status);
292 
293                     if (c == U_ERR) {
294                         return TOK_ERROR;
295                     }
296                 }
297 
298                 U_APPEND_CHAR32(c, pTarget,len);
299                 pTarget = target;
300                 ustr_uscat(token, pTarget,len, status);
301                 len=0;
302                 if (U_FAILURE(*status)) {
303                     return TOK_ERROR;
304                 }
305             }
306         }
307 
308         /* DO skip whitespace */
309         c = getNextChar(buf, true, NULL, status);
310 
311         if (U_FAILURE(*status)) {
312             return TOK_STRING;
313         }
314 
315         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
316             ucbuf_ungetc(c, buf);
317             return TOK_STRING;
318         }
319     }
320 }
321 
322 /* Retrieve the next character.  If skipwhite is
323    true, whitespace is skipped as well. */
getNextChar(UCHARBUF * buf,UBool skipwhite,struct UString * token,UErrorCode * status)324 static UChar32 getNextChar(UCHARBUF* buf,
325                            UBool skipwhite,
326                            struct UString *token,
327                            UErrorCode *status) {
328     UChar32 c, c2;
329 
330     if (U_FAILURE(*status)) {
331         return U_EOF;
332     }
333 
334     for (;;) {
335         c = ucbuf_getc(buf,status);
336 
337         if (c == U_EOF) {
338             return U_EOF;
339         }
340 
341         if (skipwhite && isWhitespace(c)) {
342             continue;
343         }
344 
345         /* This also handles the get() failing case */
346         if (c != SLASH) {
347             return c;
348         }
349 
350         c = ucbuf_getc(buf,status); /* "/c" */
351 
352         if (c == U_EOF) {
353             return U_EOF;
354         }
355 
356         switch (c) {
357         case SLASH:  /* "//" */
358             seekUntilNewline(buf, NULL, status);
359             break;
360 
361         case ASTERISK:  /* " / * " */
362             c2 = ucbuf_getc(buf, status); /* "/ * c" */
363             if(c2 == ASTERISK){  /* "/ * *" */
364                 /* parse multi-line comment and store it in token*/
365                 seekUntilEndOfComment(buf, token, status);
366             } else {
367                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
368                 seekUntilEndOfComment(buf, NULL, status);
369             }
370             break;
371 
372         default:
373             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
374             /* If get() failed this is a NOP */
375             return SLASH;
376         }
377 
378     }
379 }
380 
/* Consume characters from buf up to and including the next character for
   which isNewline() is true.  Reading also stops at U_EOF or as soon as
   *status is no longer U_ZERO_ERROR.  When token is not NULL every
   character read, including the last one, is appended to it.  Does
   nothing when *status holds a failure on entry. */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 ch;

    if (U_FAILURE(*status)) {
        return;
    }

    for (;;) {
        ch = ucbuf_getc(buf, status);

        /* add the char to token */
        if (token != NULL) {
            ustr_u32cat(token, ch, status);
        }

        /* isNewline() also advances the line counter */
        if (isNewline(ch)) {
            break;
        }
        if (ch == U_EOF) {
            break;
        }
        if (*status != U_ZERO_ERROR) {
            break;
        }
    }
}
398 
/* Consume a block comment whose opener has already been read, stopping
   after the closing asterisk-slash pair.  When token is not NULL the
   comment text is appended to it; the closing pair itself is not.  Line
   terminators inside the comment advance the line counter.

   If the input ends before the comment is closed, *status is set to
   U_INVALID_FORMAT_ERROR and an error is reported against the line on
   which this function was entered.  Does nothing when *status holds a
   failure on entry. */
static void seekUntilEndOfComment(UCHARBUF *buf,
                                  struct UString *token,
                                  UErrorCode *status) {
    UChar32  ch;
    UChar32  next;
    uint32_t startLine;

    if (U_FAILURE(*status)) {
        return;
    }

    /* remembered for the error message below */
    startLine = lineCount;

    for (;;) {
        ch = ucbuf_getc(buf, status);

        if (ch == ASTERISK) {
            next = ucbuf_getc(buf, status);

            if (next == SLASH) {
                /* end of comment */
                break;
            }
            /* not the end: give the character back */
            ucbuf_ungetc(next, buf);
        }

        /* add the char to token */
        if (token != NULL) {
            ustr_u32cat(token, ch, status);
        }

        /* called for its side effect: increments the lineCount */
        isNewline(ch);

        if (ch == U_EOF || *status != U_ZERO_ERROR) {
            break;
        }
    }

    if (ch == U_EOF) {
        *status = U_INVALID_FORMAT_ERROR;
        error(startLine, "unterminated comment detected");
    }
}
437 
unescape(UCHARBUF * buf,UErrorCode * status)438 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
439     if (U_FAILURE(*status)) {
440         return U_EOF;
441     }
442 
443     /* We expect to be called after the ESCAPE has been seen, but
444      * u_fgetcx needs an ESCAPE to do its magic. */
445     ucbuf_ungetc(ESCAPE, buf);
446 
447     return ucbuf_getcx32(buf, status);
448 }
449 
isWhitespace(UChar32 c)450 static UBool isWhitespace(UChar32 c) {
451     switch (c) {
452         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
453     case 0x000A:
454     case 0x2029:
455         lineCount++;
456     case 0x000D:
457     case 0x0020:
458     case 0x0009:
459     case 0xFEFF:
460         return true;
461 
462     default:
463         return false;
464     }
465 }
466 
isNewline(UChar32 c)467 static UBool isNewline(UChar32 c) {
468     switch (c) {
469         /* '\n', '\r', 0x2029 */
470     case 0x000A:
471     case 0x2029:
472         lineCount++;
473     case 0x000D:
474         return true;
475 
476     default:
477         return false;
478     }
479 }
480