1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1998-2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 *
9 * File read.c
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 05/26/99 stephen Creation.
15 * 5/10/01 Ram removed ustdio dependency
16 *******************************************************************************
17 */
18
19 #include "read.h"
20 #include "errmsg.h"
21 #include "unicode/ustring.h"
22
/* Code points of the characters the tokenizer treats specially. */
#define OPENBRACE    0x007B  /* '{' */
#define CLOSEBRACE   0x007D  /* '}' */
#define COMMA        0x002C  /* ',' */
#define QUOTE        0x0022  /* '"' */
#define ESCAPE       0x005C  /* '\' */
#define SLASH        0x002F  /* '/' */
#define ASTERISK     0x002A  /* '*' */
#define SPACE        0x0020  /* ' ' */
#define COLON        0x003A  /* ':' */
#define BADBOM       0xFFFE  /* getNextToken() returns TOK_ERROR when a token starts with this */
#define CR           0x000D  /* carriage return */
#define LF           0x000A  /* line feed */

/* Line number of the current read position. Set to 1 by resetLineNumber()
   and incremented by the scanning helpers in this file as they consume
   line breaks. */
static int32_t lineCount;

/* Protos */
static enum ETokenType getStringToken(UCHARBUF *buf,
                                      UChar32 initialChar,
                                      struct UString *token,
                                      UErrorCode *status);

/* Character-level helpers: comment skipping and character classification.
   Note that isWhitespace() and isNewline() also update lineCount. */
static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status);
static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
static UBool isWhitespace (UChar32 c);
static UBool isNewline (UChar32 c);
49
resetLineNumber()50 void resetLineNumber() {
51 lineCount = 1;
52 }
53
54 /* Read and return the next token from the stream. If the token is of
55 type eString, fill in the token parameter with the token. If the
56 token is eError, then the status parameter will contain the
57 specific error. This will be eItemNotFound at the end of file,
58 indicating that all tokens have been returned. This method will
59 never return eString twice in a row; instead, multiple adjacent
60 string tokens will be merged into one, with no intervening
61 space. */
getNextToken(UCHARBUF * buf,struct UString * token,uint32_t * linenumber,struct UString * comment,UErrorCode * status)62 enum ETokenType getNextToken(UCHARBUF* buf,
63 struct UString *token,
64 uint32_t *linenumber, /* out: linenumber of token */
65 struct UString *comment,
66 UErrorCode *status) {
67 enum ETokenType result;
68 UChar32 c;
69
70 if (U_FAILURE(*status)) {
71 return TOK_ERROR;
72 }
73
74 /* Skip whitespace */
75 c = getNextChar(buf, TRUE, comment, status);
76
77 if (U_FAILURE(*status)) {
78 return TOK_ERROR;
79 }
80
81 *linenumber = lineCount;
82
83 switch(c) {
84 case BADBOM:
85 return TOK_ERROR;
86 case OPENBRACE:
87 return TOK_OPEN_BRACE;
88 case CLOSEBRACE:
89 return TOK_CLOSE_BRACE;
90 case COMMA:
91 return TOK_COMMA;
92 case U_EOF:
93 return TOK_EOF;
94 case COLON:
95 return TOK_COLON;
96
97 default:
98 result = getStringToken(buf, c, token, status);
99 }
100
101 *linenumber = lineCount;
102 return result;
103 }
104
105 /* Copy a string token into the given UnicodeString. Upon entry, we
106 have already read the first character of the string token, which is
107 not a whitespace character (but may be a QUOTE or ESCAPE). This
108 function reads all subsequent characters that belong with this
109 string, and copy them into the token parameter. The other
110 important, and slightly convoluted purpose of this function is to
111 merge adjacent strings. It looks forward a bit, and if the next
112 non comment, non whitespace item is a string, it reads it in as
113 well. If two adjacent strings are quoted, they are merged without
114 intervening space. Otherwise a single SPACE character is
115 inserted. */
getStringToken(UCHARBUF * buf,UChar32 initialChar,struct UString * token,UErrorCode * status)116 static enum ETokenType getStringToken(UCHARBUF* buf,
117 UChar32 initialChar,
118 struct UString *token,
119 UErrorCode *status) {
120 UBool lastStringWasQuoted;
121 UChar32 c;
122 UChar target[3] = { '\0' };
123 UChar *pTarget = target;
124 int len=0;
125 UBool isFollowingCharEscaped=FALSE;
126 UBool isNLUnescaped = FALSE;
127 UChar32 prevC=0;
128
129 /* We are guaranteed on entry that initialChar is not a whitespace
130 character. If we are at the EOF, or have some other problem, it
131 doesn't matter; we still want to validly return the initialChar
132 (if nothing else) as a string token. */
133
134 if (U_FAILURE(*status)) {
135 return TOK_ERROR;
136 }
137
138 /* setup */
139 lastStringWasQuoted = FALSE;
140 c = initialChar;
141 ustr_setlen(token, 0, status);
142
143 if (U_FAILURE(*status)) {
144 return TOK_ERROR;
145 }
146
147 for (;;) {
148 if (c == QUOTE) {
149 if (!lastStringWasQuoted && token->fLength > 0) {
150 ustr_ucat(token, SPACE, status);
151
152 if (U_FAILURE(*status)) {
153 return TOK_ERROR;
154 }
155 }
156
157 lastStringWasQuoted = TRUE;
158
159 for (;;) {
160 c = ucbuf_getc(buf,status);
161
162 /* EOF reached */
163 if (c == U_EOF) {
164 return TOK_EOF;
165 }
166
167 /* Unterminated quoted strings */
168 if (U_FAILURE(*status)) {
169 return TOK_ERROR;
170 }
171
172 if (c == QUOTE && !isFollowingCharEscaped) {
173 break;
174 }
175
176 if (c == ESCAPE && !isFollowingCharEscaped) {
177 pTarget = target;
178 c = unescape(buf, status);
179
180 if (c == U_ERR) {
181 return TOK_ERROR;
182 }
183 if(c == CR || c == LF){
184 isNLUnescaped = TRUE;
185 }
186 }
187
188 if(c==ESCAPE && !isFollowingCharEscaped){
189 isFollowingCharEscaped = TRUE;
190 }else{
191 U_APPEND_CHAR32(c, pTarget,len);
192 pTarget = target;
193 ustr_uscat(token, pTarget,len, status);
194 isFollowingCharEscaped = FALSE;
195 len=0;
196 if(c == CR || c == LF){
197 if(isNLUnescaped == FALSE && prevC!=CR){
198 lineCount++;
199 }
200 isNLUnescaped = FALSE;
201 }
202 }
203
204 if (U_FAILURE(*status)) {
205 return TOK_ERROR;
206 }
207 prevC = c;
208 }
209 } else {
210 if (token->fLength > 0) {
211 ustr_ucat(token, SPACE, status);
212
213 if (U_FAILURE(*status)) {
214 return TOK_ERROR;
215 }
216 }
217
218 if(lastStringWasQuoted){
219 if(getShowWarning()){
220 warning(lineCount, "Mixing quoted and unquoted strings");
221 }
222 if(isStrict()){
223 return TOK_ERROR;
224 }
225
226 }
227
228 lastStringWasQuoted = FALSE;
229
230 /* if we reach here we are mixing
231 * quoted and unquoted strings
232 * warn in normal mode and error in
233 * pedantic mode
234 */
235
236 if (c == ESCAPE) {
237 pTarget = target;
238 c = unescape(buf, status);
239
240 /* EOF reached */
241 if (c == U_EOF) {
242 return TOK_ERROR;
243 }
244 }
245
246 U_APPEND_CHAR32(c, pTarget,len);
247 pTarget = target;
248 ustr_uscat(token, pTarget,len, status);
249 len=0;
250
251 if (U_FAILURE(*status)) {
252 return TOK_ERROR;
253 }
254
255 for (;;) {
256 /* DON'T skip whitespace */
257 c = getNextChar(buf, FALSE, NULL, status);
258
259 /* EOF reached */
260 if (c == U_EOF) {
261 ucbuf_ungetc(c, buf);
262 return TOK_STRING;
263 }
264
265 if (U_FAILURE(*status)) {
266 return TOK_STRING;
267 }
268
269 if (c == QUOTE
270 || c == OPENBRACE
271 || c == CLOSEBRACE
272 || c == COMMA
273 || c == COLON) {
274 ucbuf_ungetc(c, buf);
275 break;
276 }
277
278 if (isWhitespace(c)) {
279 break;
280 }
281
282 if (c == ESCAPE) {
283 pTarget = target;
284 c = unescape(buf, status);
285
286 if (c == U_ERR) {
287 return TOK_ERROR;
288 }
289 }
290
291 U_APPEND_CHAR32(c, pTarget,len);
292 pTarget = target;
293 ustr_uscat(token, pTarget,len, status);
294 len=0;
295 if (U_FAILURE(*status)) {
296 return TOK_ERROR;
297 }
298 }
299 }
300
301 /* DO skip whitespace */
302 c = getNextChar(buf, TRUE, NULL, status);
303
304 if (U_FAILURE(*status)) {
305 return TOK_STRING;
306 }
307
308 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
309 ucbuf_ungetc(c, buf);
310 return TOK_STRING;
311 }
312 }
313 }
314
315 /* Retrieve the next character. If skipwhite is
316 true, whitespace is skipped as well. */
getNextChar(UCHARBUF * buf,UBool skipwhite,struct UString * token,UErrorCode * status)317 static UChar32 getNextChar(UCHARBUF* buf,
318 UBool skipwhite,
319 struct UString *token,
320 UErrorCode *status) {
321 UChar32 c, c2;
322
323 if (U_FAILURE(*status)) {
324 return U_EOF;
325 }
326
327 for (;;) {
328 c = ucbuf_getc(buf,status);
329
330 if (c == U_EOF) {
331 return U_EOF;
332 }
333
334 if (skipwhite && isWhitespace(c)) {
335 continue;
336 }
337
338 /* This also handles the get() failing case */
339 if (c != SLASH) {
340 return c;
341 }
342
343 c = ucbuf_getc(buf,status); /* "/c" */
344
345 if (c == U_EOF) {
346 return U_EOF;
347 }
348
349 switch (c) {
350 case SLASH: /* "//" */
351 seekUntilNewline(buf, NULL, status);
352 break;
353
354 case ASTERISK: /* " / * " */
355 c2 = ucbuf_getc(buf, status); /* "/ * c" */
356 if(c2 == ASTERISK){ /* "/ * *" */
357 /* parse multi-line comment and store it in token*/
358 seekUntilEndOfComment(buf, token, status);
359 } else {
360 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
361 seekUntilEndOfComment(buf, NULL, status);
362 }
363 break;
364
365 default:
366 ucbuf_ungetc(c, buf); /* "/c" - put back the c */
367 /* If get() failed this is a NOP */
368 return SLASH;
369 }
370
371 }
372 }
373
/* Consume characters up to and including the next newline (as defined by
 * isNewline(), which also updates lineCount), stopping early at end of
 * input or as soon as status is no longer U_ZERO_ERROR.
 *
 * buf    - input buffer
 * token  - if not NULL, every character consumed is appended to it
 * status - in/out error code; nothing is read if it already indicates
 *          failure
 */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 c;

    if (U_FAILURE(*status)) {
        return;
    }

    do {
        c = ucbuf_getc(buf,status);
        /* add the char to token; the U_EOF sentinel is not a character
           and must not end up in the collected text */
        if(token!=NULL && c != U_EOF){
            ustr_u32cat(token, c, status);
        }
    } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
}
391
/* Consume the body of a block comment, up to and including the closing
 * asterisk-slash pair.
 *
 * buf    - input buffer, positioned just inside the comment
 * token  - if not NULL, the characters of the comment body are appended
 *          to it
 * status - in/out error code; set to U_INVALID_FORMAT_ERROR (and an error
 *          is reported against the line the comment started on) when the
 *          input ends before the comment is closed
 *
 * Line breaks inside the comment are counted through isNewline().
 */
static void seekUntilEndOfComment(UCHARBUF *buf,
                                  struct UString *token,
                                  UErrorCode *status) {
    UChar32  cur;
    UChar32  lookahead;
    uint32_t startLine;

    if (U_FAILURE(*status)) {
        return;
    }

    /* Remember where the comment began for the error message. */
    startLine = lineCount;

    for (;;) {
        cur = ucbuf_getc(buf, status);

        if (cur == ASTERISK) {
            lookahead = ucbuf_getc(buf, status);

            if (lookahead == SLASH) {
                /* Terminator found; neither character is stored. */
                return;
            }
            ucbuf_ungetc(lookahead, buf);
        }

        if (token != NULL) {
            ustr_u32cat(token, cur, status);
        }

        /* Called only for its side effect of advancing lineCount. */
        isNewline(cur);

        if (cur == U_EOF) {
            *status = U_INVALID_FORMAT_ERROR;
            error(startLine, "unterminated comment detected");
            return;
        }

        if (*status != U_ZERO_ERROR) {
            return;
        }
    }
}
430
unescape(UCHARBUF * buf,UErrorCode * status)431 UChar32 unescape(UCHARBUF *buf,
432 UErrorCode *status) {
433 if (U_FAILURE(*status)) {
434 return U_EOF;
435 }
436
437 /* We expect to be called after the ESCAPE has been seen, but
438 * u_fgetcx needs an ESCAPE to do its magic. */
439 ucbuf_ungetc(ESCAPE, buf);
440
441 return ucbuf_getcx32(buf, status);
442 }
443
isWhitespace(UChar32 c)444 static UBool isWhitespace(UChar32 c) {
445 switch (c) {
446 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
447 case 0x000A:
448 case 0x2029:
449 lineCount++;
450 case 0x000D:
451 case 0x0020:
452 case 0x0009:
453 case 0xFEFF:
454 return TRUE;
455
456 default:
457 return FALSE;
458 }
459 }
460
isNewline(UChar32 c)461 static UBool isNewline(UChar32 c) {
462 switch (c) {
463 /* '\n', '\r', 0x2029 */
464 case 0x000A:
465 case 0x2029:
466 lineCount++;
467 case 0x000D:
468 return TRUE;
469
470 default:
471 return FALSE;
472 }
473 }
474