1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1998-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File read.c
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 05/26/99 stephen Creation.
17 * 5/10/01 Ram removed ustdio dependency
18 *******************************************************************************
19 */
20
21 #include "read.h"
22 #include "errmsg.h"
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
25
/* Code points that the tokenizer in this file compares input characters against. */
#define OPENBRACE 0x007B /* '{' */
#define CLOSEBRACE 0x007D /* '}' */
#define COMMA 0x002C /* ',' */
#define QUOTE 0x0022 /* double quote; opens and closes a quoted string */
#define ESCAPE 0x005C /* backslash; introduces an escape sequence */
#define SLASH 0x002F /* '/'; first character of both comment openers */
#define ASTERISK 0x002A /* '*'; second character of a block comment opener */
#define SPACE 0x0020 /* ' '; also the separator inserted when strings are merged */
#define COLON 0x003A /* ':' */
#define BADBOM 0xFFFE /* getNextToken() reports this as TOK_ERROR; presumably a byte-swapped BOM (confirm) */
#define CR 0x000D /* carriage return */
#define LF 0x000A /* line feed */

/* Line number of the input position being tokenized. Set to 1 by
   resetLineNumber() and incremented by isWhitespace(), isNewline() and the
   quoted-string scanner in getStringToken(). */
static int32_t lineCount;
40
/* Forward declarations of the file-local helpers defined further down. */

/* Reads one string token, possibly merged from several adjacent pieces,
   whose first character (initialChar) has already been consumed. */
static enum ETokenType getStringToken(UCHARBUF *buf,
                                      UChar32 initialChar,
                                      struct UString *token,
                                      UErrorCode *status);

/* Next character after skipping comments and, if skipwhite is set, whitespace. */
static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
/* Consumes input through the end of the current line. */
static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status);
/* Consumes input through the terminator of a block comment. */
static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
/* Character classifiers; both also increment lineCount for LF and U+2029. */
static UBool isWhitespace (UChar32 c);
static UBool isNewline (UChar32 c);
52
resetLineNumber()53 U_CFUNC void resetLineNumber() {
54 lineCount = 1;
55 }
56
57 /* Read and return the next token from the stream. If the token is of
58 type eString, fill in the token parameter with the token. If the
59 token is eError, then the status parameter will contain the
60 specific error. This will be eItemNotFound at the end of file,
61 indicating that all tokens have been returned. This method will
62 never return eString twice in a row; instead, multiple adjacent
63 string tokens will be merged into one, with no intervening
64 space. */
65 U_CFUNC enum ETokenType
getNextToken(UCHARBUF * buf,struct UString * token,uint32_t * linenumber,struct UString * comment,UErrorCode * status)66 getNextToken(UCHARBUF* buf,
67 struct UString *token,
68 uint32_t *linenumber, /* out: linenumber of token */
69 struct UString *comment,
70 UErrorCode *status) {
71 enum ETokenType result;
72 UChar32 c;
73
74 if (U_FAILURE(*status)) {
75 return TOK_ERROR;
76 }
77
78 /* Skip whitespace */
79 c = getNextChar(buf, TRUE, comment, status);
80
81 if (U_FAILURE(*status)) {
82 return TOK_ERROR;
83 }
84
85 *linenumber = lineCount;
86
87 switch(c) {
88 case BADBOM:
89 return TOK_ERROR;
90 case OPENBRACE:
91 return TOK_OPEN_BRACE;
92 case CLOSEBRACE:
93 return TOK_CLOSE_BRACE;
94 case COMMA:
95 return TOK_COMMA;
96 case U_EOF:
97 return TOK_EOF;
98 case COLON:
99 return TOK_COLON;
100
101 default:
102 result = getStringToken(buf, c, token, status);
103 }
104
105 *linenumber = lineCount;
106 return result;
107 }
108
109 /* Copy a string token into the given UnicodeString. Upon entry, we
110 have already read the first character of the string token, which is
111 not a whitespace character (but may be a QUOTE or ESCAPE). This
112 function reads all subsequent characters that belong with this
113 string, and copy them into the token parameter. The other
114 important, and slightly convoluted purpose of this function is to
115 merge adjacent strings. It looks forward a bit, and if the next
116 non comment, non whitespace item is a string, it reads it in as
117 well. If two adjacent strings are quoted, they are merged without
118 intervening space. Otherwise a single SPACE character is
119 inserted. */
getStringToken(UCHARBUF * buf,UChar32 initialChar,struct UString * token,UErrorCode * status)120 static enum ETokenType getStringToken(UCHARBUF* buf,
121 UChar32 initialChar,
122 struct UString *token,
123 UErrorCode *status) {
124 UBool lastStringWasQuoted;
125 UChar32 c;
126 UChar target[3] = { '\0' };
127 UChar *pTarget = target;
128 int len=0;
129 UBool isFollowingCharEscaped=FALSE;
130 UBool isNLUnescaped = FALSE;
131 UChar32 prevC=0;
132
133 /* We are guaranteed on entry that initialChar is not a whitespace
134 character. If we are at the EOF, or have some other problem, it
135 doesn't matter; we still want to validly return the initialChar
136 (if nothing else) as a string token. */
137
138 if (U_FAILURE(*status)) {
139 return TOK_ERROR;
140 }
141
142 /* setup */
143 lastStringWasQuoted = FALSE;
144 c = initialChar;
145 ustr_setlen(token, 0, status);
146
147 if (U_FAILURE(*status)) {
148 return TOK_ERROR;
149 }
150
151 for (;;) {
152 if (c == QUOTE) {
153 if (!lastStringWasQuoted && token->fLength > 0) {
154 ustr_ucat(token, SPACE, status);
155
156 if (U_FAILURE(*status)) {
157 return TOK_ERROR;
158 }
159 }
160
161 lastStringWasQuoted = TRUE;
162
163 for (;;) {
164 c = ucbuf_getc(buf,status);
165
166 /* EOF reached */
167 if (c == U_EOF) {
168 return TOK_EOF;
169 }
170
171 /* Unterminated quoted strings */
172 if (U_FAILURE(*status)) {
173 return TOK_ERROR;
174 }
175
176 if (c == QUOTE && !isFollowingCharEscaped) {
177 break;
178 }
179
180 if (c == ESCAPE && !isFollowingCharEscaped) {
181 pTarget = target;
182 c = unescape(buf, status);
183
184 if (c == U_ERR) {
185 return TOK_ERROR;
186 }
187 if(c == CR || c == LF){
188 isNLUnescaped = TRUE;
189 }
190 }
191
192 if(c==ESCAPE && !isFollowingCharEscaped){
193 isFollowingCharEscaped = TRUE;
194 }else{
195 U_APPEND_CHAR32(c, pTarget,len);
196 pTarget = target;
197 ustr_uscat(token, pTarget,len, status);
198 isFollowingCharEscaped = FALSE;
199 len=0;
200 if(c == CR || c == LF){
201 if(isNLUnescaped == FALSE && prevC!=CR){
202 lineCount++;
203 }
204 isNLUnescaped = FALSE;
205 }
206 }
207
208 if (U_FAILURE(*status)) {
209 return TOK_ERROR;
210 }
211 prevC = c;
212 }
213 } else {
214 if (token->fLength > 0) {
215 ustr_ucat(token, SPACE, status);
216
217 if (U_FAILURE(*status)) {
218 return TOK_ERROR;
219 }
220 }
221
222 if(lastStringWasQuoted){
223 if(getShowWarning()){
224 warning(lineCount, "Mixing quoted and unquoted strings");
225 }
226 if(isStrict()){
227 return TOK_ERROR;
228 }
229
230 }
231
232 lastStringWasQuoted = FALSE;
233
234 /* if we reach here we are mixing
235 * quoted and unquoted strings
236 * warn in normal mode and error in
237 * pedantic mode
238 */
239
240 if (c == ESCAPE) {
241 pTarget = target;
242 c = unescape(buf, status);
243
244 /* EOF reached */
245 if (c == U_EOF) {
246 return TOK_ERROR;
247 }
248 }
249
250 U_APPEND_CHAR32(c, pTarget,len);
251 pTarget = target;
252 ustr_uscat(token, pTarget,len, status);
253 len=0;
254
255 if (U_FAILURE(*status)) {
256 return TOK_ERROR;
257 }
258
259 for (;;) {
260 /* DON'T skip whitespace */
261 c = getNextChar(buf, FALSE, NULL, status);
262
263 /* EOF reached */
264 if (c == U_EOF) {
265 ucbuf_ungetc(c, buf);
266 return TOK_STRING;
267 }
268
269 if (U_FAILURE(*status)) {
270 return TOK_STRING;
271 }
272
273 if (c == QUOTE
274 || c == OPENBRACE
275 || c == CLOSEBRACE
276 || c == COMMA
277 || c == COLON) {
278 ucbuf_ungetc(c, buf);
279 break;
280 }
281
282 if (isWhitespace(c)) {
283 break;
284 }
285
286 if (c == ESCAPE) {
287 pTarget = target;
288 c = unescape(buf, status);
289
290 if (c == U_ERR) {
291 return TOK_ERROR;
292 }
293 }
294
295 U_APPEND_CHAR32(c, pTarget,len);
296 pTarget = target;
297 ustr_uscat(token, pTarget,len, status);
298 len=0;
299 if (U_FAILURE(*status)) {
300 return TOK_ERROR;
301 }
302 }
303 }
304
305 /* DO skip whitespace */
306 c = getNextChar(buf, TRUE, NULL, status);
307
308 if (U_FAILURE(*status)) {
309 return TOK_STRING;
310 }
311
312 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
313 ucbuf_ungetc(c, buf);
314 return TOK_STRING;
315 }
316 }
317 }
318
319 /* Retrieve the next character. If skipwhite is
320 true, whitespace is skipped as well. */
getNextChar(UCHARBUF * buf,UBool skipwhite,struct UString * token,UErrorCode * status)321 static UChar32 getNextChar(UCHARBUF* buf,
322 UBool skipwhite,
323 struct UString *token,
324 UErrorCode *status) {
325 UChar32 c, c2;
326
327 if (U_FAILURE(*status)) {
328 return U_EOF;
329 }
330
331 for (;;) {
332 c = ucbuf_getc(buf,status);
333
334 if (c == U_EOF) {
335 return U_EOF;
336 }
337
338 if (skipwhite && isWhitespace(c)) {
339 continue;
340 }
341
342 /* This also handles the get() failing case */
343 if (c != SLASH) {
344 return c;
345 }
346
347 c = ucbuf_getc(buf,status); /* "/c" */
348
349 if (c == U_EOF) {
350 return U_EOF;
351 }
352
353 switch (c) {
354 case SLASH: /* "//" */
355 seekUntilNewline(buf, NULL, status);
356 break;
357
358 case ASTERISK: /* " / * " */
359 c2 = ucbuf_getc(buf, status); /* "/ * c" */
360 if(c2 == ASTERISK){ /* "/ * *" */
361 /* parse multi-line comment and store it in token*/
362 seekUntilEndOfComment(buf, token, status);
363 } else {
364 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
365 seekUntilEndOfComment(buf, NULL, status);
366 }
367 break;
368
369 default:
370 ucbuf_ungetc(c, buf); /* "/c" - put back the c */
371 /* If get() failed this is a NOP */
372 return SLASH;
373 }
374
375 }
376 }
377
/* Consume characters up to and including the next line terminator
   (LF, CR or U+2029, as recognised by isNewline()), or up to the end of
   input.  When token is not NULL every character consumed, terminator
   included, is appended to it.  Does nothing if *status is already a
   failure; stops early if it becomes one.

   isNewline() advances lineCount as a side effect when the terminator
   is LF or U+2029. */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 c;

    if (U_FAILURE(*status)) {
        return;
    }

    do {
        c = ucbuf_getc(buf,status);
        /* add the char to token, but never the end-of-input sentinel */
        if(token!=NULL && c != U_EOF){
            ustr_u32cat(token, c, status);
        }
        /* Only a failure ends the scan early.  A warning status must not:
           comparing against U_ZERO_ERROR stopped after one character
           whenever a warning was pending. */
    } while (!isNewline(c) && c != U_EOF && !U_FAILURE(*status));
}
395
/* Consume a block comment whose opener has already been read, up to
   and including the closing star-slash pair.  When token is not NULL
   the comment text (terminator excluded) is appended to it.  lineCount
   is advanced for line breaks inside the comment through isNewline().

   If the input ends before the terminator, *status is set to
   U_INVALID_FORMAT_ERROR and an error is reported against the line on
   which the scan started.  Does nothing if *status is already a failure
   on entry. */
static void seekUntilEndOfComment(UCHARBUF *buf,
                                  struct UString *token,
                                  UErrorCode *status) {
    UChar32  c, d;
    uint32_t line;

    if (U_FAILURE(*status)) {
        return;
    }

    /* Remember where the comment started for the error message. */
    line = lineCount;

    do {
        c = ucbuf_getc(buf, status);

        /* End of input: leave the loop without storing the sentinel in
           token; the unterminated comment is reported below. */
        if (c == U_EOF) {
            break;
        }

        if (c == ASTERISK) {
            d = ucbuf_getc(buf, status);

            if (d != SLASH) {
                /* Lone asterisk: d is ordinary comment text. */
                ucbuf_ungetc(d, buf);
            } else {
                break;
            }
        }
        /* add the char to token */
        if(token!=NULL){
            ustr_u32cat(token, c, status);
        }
        /* increment the lineCount */
        isNewline(c);

        /* Only a failure ends the scan early.  A warning status must
           not: comparing against U_ZERO_ERROR returned after a single
           character, leaving the rest of the comment in the input. */
    } while (!U_FAILURE(*status));

    if (c == U_EOF) {
        *status = U_INVALID_FORMAT_ERROR;
        error(line, "unterminated comment detected");
    }
}
434
unescape(UCHARBUF * buf,UErrorCode * status)435 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
436 if (U_FAILURE(*status)) {
437 return U_EOF;
438 }
439
440 /* We expect to be called after the ESCAPE has been seen, but
441 * u_fgetcx needs an ESCAPE to do its magic. */
442 ucbuf_ungetc(ESCAPE, buf);
443
444 return ucbuf_getcx32(buf, status);
445 }
446
isWhitespace(UChar32 c)447 static UBool isWhitespace(UChar32 c) {
448 switch (c) {
449 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
450 case 0x000A:
451 case 0x2029:
452 lineCount++;
453 case 0x000D:
454 case 0x0020:
455 case 0x0009:
456 case 0xFEFF:
457 return TRUE;
458
459 default:
460 return FALSE;
461 }
462 }
463
isNewline(UChar32 c)464 static UBool isNewline(UChar32 c) {
465 switch (c) {
466 /* '\n', '\r', 0x2029 */
467 case 0x000A:
468 case 0x2029:
469 lineCount++;
470 case 0x000D:
471 return TRUE;
472
473 default:
474 return FALSE;
475 }
476 }
477