1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1998-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File read.c
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 05/26/99 stephen Creation.
17 * 5/10/01 Ram removed ustdio dependency
18 *******************************************************************************
19 */
20
21 #include "read.h"
22 #include "errmsg.h"
23 #include "toolutil.h"
24 #include "unicode/ustring.h"
25 #include "unicode/utf16.h"
26
/* Code points that have a special meaning to the tokenizer. */
#define OPENBRACE 0x007B    /* left curly bracket: returned as TOK_OPEN_BRACE */
#define CLOSEBRACE 0x007D   /* right curly bracket: returned as TOK_CLOSE_BRACE */
#define COMMA 0x002C        /* comma: returned as TOK_COMMA */
#define QUOTE 0x0022        /* double quote: delimits quoted string segments */
#define ESCAPE 0x005C       /* backslash: introduces an escape sequence */
#define SLASH 0x002F        /* solidus: may start a line or block comment */
#define ASTERISK 0x002A     /* asterisk: second character of a block comment opener */
#define SPACE 0x0020        /* separator inserted between merged string segments */
#define COLON 0x003A        /* colon: returned as TOK_COLON */
#define BADBOM 0xFFFE       /* 0xFEFF with its bytes swapped; getNextToken() answers TOK_ERROR */
#define CR 0x000D           /* carriage return */
#define LF 0x000A           /* line feed */

/* Current line number of the input. resetLineNumber() sets it to 1;
   isWhitespace() and isNewline() increment it for LF and U+2029, and
   getStringToken() increments it for literal line breaks inside quoted
   strings. */
static int32_t lineCount;
41
/* Protos */

/* Reads one (possibly merged) string token whose first character,
   initialChar, has already been consumed by the caller. */
static enum ETokenType getStringToken(UCHARBUF *buf,
                                      UChar32 initialChar,
                                      struct UString *token,
                                      UErrorCode *status);

/* Returns the next character, skipping comments and, when skipwhite is
   set, whitespace; a doc comment is collected into token if non-NULL. */
static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
/* Consumes input through the next line terminator (or end of input). */
static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status);
/* Consumes input through the terminator of a block comment. */
static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
/* Whitespace test; also advances lineCount for LF and U+2029. */
static UBool isWhitespace (UChar32 c);
/* Line-terminator test; also advances lineCount for LF and U+2029. */
static UBool isNewline (UChar32 c);
53
resetLineNumber()54 U_CFUNC void resetLineNumber() {
55 lineCount = 1;
56 }
57
58 /* Read and return the next token from the stream. If the token is of
59 type eString, fill in the token parameter with the token. If the
60 token is eError, then the status parameter will contain the
61 specific error. This will be eItemNotFound at the end of file,
62 indicating that all tokens have been returned. This method will
63 never return eString twice in a row; instead, multiple adjacent
64 string tokens will be merged into one, with no intervening
65 space. */
66 U_CFUNC enum ETokenType
getNextToken(UCHARBUF * buf,struct UString * token,uint32_t * linenumber,struct UString * comment,UErrorCode * status)67 getNextToken(UCHARBUF* buf,
68 struct UString *token,
69 uint32_t *linenumber, /* out: linenumber of token */
70 struct UString *comment,
71 UErrorCode *status) {
72 enum ETokenType result;
73 UChar32 c;
74
75 if (U_FAILURE(*status)) {
76 return TOK_ERROR;
77 }
78
79 /* Skip whitespace */
80 c = getNextChar(buf, TRUE, comment, status);
81
82 if (U_FAILURE(*status)) {
83 return TOK_ERROR;
84 }
85
86 *linenumber = lineCount;
87
88 switch(c) {
89 case BADBOM:
90 return TOK_ERROR;
91 case OPENBRACE:
92 return TOK_OPEN_BRACE;
93 case CLOSEBRACE:
94 return TOK_CLOSE_BRACE;
95 case COMMA:
96 return TOK_COMMA;
97 case U_EOF:
98 return TOK_EOF;
99 case COLON:
100 return TOK_COLON;
101
102 default:
103 result = getStringToken(buf, c, token, status);
104 }
105
106 *linenumber = lineCount;
107 return result;
108 }
109
110 /* Copy a string token into the given UnicodeString. Upon entry, we
111 have already read the first character of the string token, which is
112 not a whitespace character (but may be a QUOTE or ESCAPE). This
113 function reads all subsequent characters that belong with this
114 string, and copy them into the token parameter. The other
115 important, and slightly convoluted purpose of this function is to
116 merge adjacent strings. It looks forward a bit, and if the next
117 non comment, non whitespace item is a string, it reads it in as
118 well. If two adjacent strings are quoted, they are merged without
119 intervening space. Otherwise a single SPACE character is
120 inserted. */
getStringToken(UCHARBUF * buf,UChar32 initialChar,struct UString * token,UErrorCode * status)121 static enum ETokenType getStringToken(UCHARBUF* buf,
122 UChar32 initialChar,
123 struct UString *token,
124 UErrorCode *status) {
125 UBool lastStringWasQuoted;
126 UChar32 c;
127 UChar target[3] = { '\0' };
128 UChar *pTarget = target;
129 int len=0;
130 UBool isFollowingCharEscaped=FALSE;
131 UBool isNLUnescaped = FALSE;
132 UChar32 prevC=0;
133
134 /* We are guaranteed on entry that initialChar is not a whitespace
135 character. If we are at the EOF, or have some other problem, it
136 doesn't matter; we still want to validly return the initialChar
137 (if nothing else) as a string token. */
138
139 if (U_FAILURE(*status)) {
140 return TOK_ERROR;
141 }
142
143 /* setup */
144 lastStringWasQuoted = FALSE;
145 c = initialChar;
146 ustr_setlen(token, 0, status);
147
148 if (U_FAILURE(*status)) {
149 return TOK_ERROR;
150 }
151
152 for (;;) {
153 if (c == QUOTE) {
154 if (!lastStringWasQuoted && token->fLength > 0) {
155 ustr_ucat(token, SPACE, status);
156
157 if (U_FAILURE(*status)) {
158 return TOK_ERROR;
159 }
160 }
161
162 lastStringWasQuoted = TRUE;
163
164 for (;;) {
165 c = ucbuf_getc(buf,status);
166
167 /* EOF reached */
168 if (c == U_EOF) {
169 return TOK_EOF;
170 }
171
172 /* Unterminated quoted strings */
173 if (U_FAILURE(*status)) {
174 return TOK_ERROR;
175 }
176
177 if (c == QUOTE && !isFollowingCharEscaped) {
178 break;
179 }
180
181 if (c == ESCAPE && !isFollowingCharEscaped) {
182 pTarget = target;
183 c = unescape(buf, status);
184
185 if (c == U_ERR) {
186 return TOK_ERROR;
187 }
188 if(c == CR || c == LF){
189 isNLUnescaped = TRUE;
190 }
191 }
192
193 if(c==ESCAPE && !isFollowingCharEscaped){
194 isFollowingCharEscaped = TRUE;
195 }else{
196 U_APPEND_CHAR32(c, pTarget,len);
197 pTarget = target;
198 ustr_uscat(token, pTarget,len, status);
199 isFollowingCharEscaped = FALSE;
200 len=0;
201 if(c == CR || c == LF){
202 if(isNLUnescaped == FALSE && prevC!=CR){
203 lineCount++;
204 }
205 isNLUnescaped = FALSE;
206 }
207 }
208
209 if (U_FAILURE(*status)) {
210 return TOK_ERROR;
211 }
212 prevC = c;
213 }
214 } else {
215 if (token->fLength > 0) {
216 ustr_ucat(token, SPACE, status);
217
218 if (U_FAILURE(*status)) {
219 return TOK_ERROR;
220 }
221 }
222
223 if(lastStringWasQuoted){
224 if(getShowWarning()){
225 warning(lineCount, "Mixing quoted and unquoted strings");
226 }
227 if(isStrict()){
228 return TOK_ERROR;
229 }
230
231 }
232
233 lastStringWasQuoted = FALSE;
234
235 /* if we reach here we are mixing
236 * quoted and unquoted strings
237 * warn in normal mode and error in
238 * pedantic mode
239 */
240
241 if (c == ESCAPE) {
242 pTarget = target;
243 c = unescape(buf, status);
244
245 /* EOF reached */
246 if (c == U_EOF) {
247 return TOK_ERROR;
248 }
249 }
250
251 U_APPEND_CHAR32(c, pTarget,len);
252 pTarget = target;
253 ustr_uscat(token, pTarget,len, status);
254 len=0;
255
256 if (U_FAILURE(*status)) {
257 return TOK_ERROR;
258 }
259
260 for (;;) {
261 /* DON'T skip whitespace */
262 c = getNextChar(buf, FALSE, NULL, status);
263
264 /* EOF reached */
265 if (c == U_EOF) {
266 ucbuf_ungetc(c, buf);
267 return TOK_STRING;
268 }
269
270 if (U_FAILURE(*status)) {
271 return TOK_STRING;
272 }
273
274 if (c == QUOTE
275 || c == OPENBRACE
276 || c == CLOSEBRACE
277 || c == COMMA
278 || c == COLON) {
279 ucbuf_ungetc(c, buf);
280 break;
281 }
282
283 if (isWhitespace(c)) {
284 break;
285 }
286
287 if (c == ESCAPE) {
288 pTarget = target;
289 c = unescape(buf, status);
290
291 if (c == U_ERR) {
292 return TOK_ERROR;
293 }
294 }
295
296 U_APPEND_CHAR32(c, pTarget,len);
297 pTarget = target;
298 ustr_uscat(token, pTarget,len, status);
299 len=0;
300 if (U_FAILURE(*status)) {
301 return TOK_ERROR;
302 }
303 }
304 }
305
306 /* DO skip whitespace */
307 c = getNextChar(buf, TRUE, NULL, status);
308
309 if (U_FAILURE(*status)) {
310 return TOK_STRING;
311 }
312
313 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
314 ucbuf_ungetc(c, buf);
315 return TOK_STRING;
316 }
317 }
318 }
319
320 /* Retrieve the next character. If skipwhite is
321 true, whitespace is skipped as well. */
getNextChar(UCHARBUF * buf,UBool skipwhite,struct UString * token,UErrorCode * status)322 static UChar32 getNextChar(UCHARBUF* buf,
323 UBool skipwhite,
324 struct UString *token,
325 UErrorCode *status) {
326 UChar32 c, c2;
327
328 if (U_FAILURE(*status)) {
329 return U_EOF;
330 }
331
332 for (;;) {
333 c = ucbuf_getc(buf,status);
334
335 if (c == U_EOF) {
336 return U_EOF;
337 }
338
339 if (skipwhite && isWhitespace(c)) {
340 continue;
341 }
342
343 /* This also handles the get() failing case */
344 if (c != SLASH) {
345 return c;
346 }
347
348 c = ucbuf_getc(buf,status); /* "/c" */
349
350 if (c == U_EOF) {
351 return U_EOF;
352 }
353
354 switch (c) {
355 case SLASH: /* "//" */
356 seekUntilNewline(buf, NULL, status);
357 break;
358
359 case ASTERISK: /* " / * " */
360 c2 = ucbuf_getc(buf, status); /* "/ * c" */
361 if(c2 == ASTERISK){ /* "/ * *" */
362 /* parse multi-line comment and store it in token*/
363 seekUntilEndOfComment(buf, token, status);
364 } else {
365 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
366 seekUntilEndOfComment(buf, NULL, status);
367 }
368 break;
369
370 default:
371 ucbuf_ungetc(c, buf); /* "/c" - put back the c */
372 /* If get() failed this is a NOP */
373 return SLASH;
374 }
375
376 }
377 }
378
/* Consume input up to and including the next line terminator, as
 * classified by isNewline(), or up to the end of the input.
 *
 * If token is non-NULL every character consumed, terminator included, is
 * appended to it. The end-of-input sentinel U_EOF is never appended.
 * isNewline() advances the line counter for LF and U+2029.
 * Does nothing if *status already holds a failure; stops as soon as
 * *status is no longer U_ZERO_ERROR.
 */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 c;

    if (U_FAILURE(*status)) {
        return;
    }

    for (;;) {
        c = ucbuf_getc(buf,status);

        /* Check for the end of input before touching token, so that the
           sentinel is not stored as if it were text. */
        if (c == U_EOF) {
            break;
        }

        /* add the char to token */
        if(token!=NULL){
            ustr_u32cat(token, c, status);
        }

        if (isNewline(c) || *status != U_ZERO_ERROR) {
            break;
        }
    }
}
396
/* Consume the body of a block comment, up to and including the
 * terminating asterisk-slash pair. The comment opener has already been
 * read by the caller.
 *
 * If token is non-NULL every character of the body is appended to it;
 * the terminator and the end-of-input sentinel U_EOF are not.
 * isNewline() is called on every character so that the line counter
 * stays correct across multi-line comments.
 *
 * If the input ends before the terminator, *status is set to
 * U_INVALID_FORMAT_ERROR and an error is reported against the line on
 * which this function was entered. Does nothing if *status already holds
 * a failure; stops silently as soon as *status is no longer U_ZERO_ERROR.
 */
static void seekUntilEndOfComment(UCHARBUF *buf,
                                  struct UString *token,
                                  UErrorCode *status) {
    UChar32 c, d;
    uint32_t line;

    if (U_FAILURE(*status)) {
        return;
    }

    /* Remember where the comment began for the error message. */
    line = lineCount;

    for (;;) {
        c = ucbuf_getc(buf, status);

        /* Check for the end of input before touching token, so that the
           sentinel is not stored as if it were comment text. */
        if (c == U_EOF) {
            *status = U_INVALID_FORMAT_ERROR;
            error(line, "unterminated comment detected");
            return;
        }

        if (c == ASTERISK) {
            d = ucbuf_getc(buf, status);

            if (d == SLASH) {
                /* terminator found */
                return;
            }
            /* Lone asterisk: the character after it is ordinary text. */
            ucbuf_ungetc(d, buf);
        }

        /* add the char to token */
        if(token!=NULL){
            ustr_u32cat(token, c, status);
        }

        /* increment the lineCount */
        isNewline(c);

        if (*status != U_ZERO_ERROR) {
            return;
        }
    }
}
435
unescape(UCHARBUF * buf,UErrorCode * status)436 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
437 if (U_FAILURE(*status)) {
438 return U_EOF;
439 }
440
441 /* We expect to be called after the ESCAPE has been seen, but
442 * u_fgetcx needs an ESCAPE to do its magic. */
443 ucbuf_ungetc(ESCAPE, buf);
444
445 return ucbuf_getcx32(buf, status);
446 }
447
isWhitespace(UChar32 c)448 static UBool isWhitespace(UChar32 c) {
449 switch (c) {
450 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
451 case 0x000A:
452 case 0x2029:
453 lineCount++;
454 case 0x000D:
455 case 0x0020:
456 case 0x0009:
457 case 0xFEFF:
458 return TRUE;
459
460 default:
461 return FALSE;
462 }
463 }
464
isNewline(UChar32 c)465 static UBool isNewline(UChar32 c) {
466 switch (c) {
467 /* '\n', '\r', 0x2029 */
468 case 0x000A:
469 case 0x2029:
470 lineCount++;
471 case 0x000D:
472 return TRUE;
473
474 default:
475 return FALSE;
476 }
477 }
478