1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1998-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File read.c
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 05/26/99 stephen Creation.
17 * 5/10/01 Ram removed ustdio dependency
18 *******************************************************************************
19 */
20
21 #include <stdbool.h>
22
23 #include "read.h"
24 #include "errmsg.h"
25 #include "toolutil.h"
26 #include "unicode/ustring.h"
27 #include "unicode/utf16.h"
28
/* Code point values of the characters the tokenizer treats specially. */
#define OPENBRACE    0x007B  /* '{'  */
#define CLOSEBRACE   0x007D  /* '}'  */
#define COMMA        0x002C  /* ','  */
#define QUOTE        0x0022  /* '"'  */
#define ESCAPE       0x005C  /* '\\' */
#define SLASH        0x002F  /* '/'  */
#define ASTERISK     0x002A  /* '*'  */
#define SPACE        0x0020  /* ' '  */
#define COLON        0x003A  /* ':'  */
#define BADBOM       0xFFFE  /* U+FFFE, the byte-swapped form of the BOM U+FEFF */
#define CR           0x000D  /* carriage return */
#define LF           0x000A  /* line feed */
41
/* Line number of the input position currently being tokenized.
   resetLineNumber() sets it to 1; the scanning helpers in this file
   increment it as they consume line breaks. */
static int32_t lineCount;
43
44 /* Protos */
45 static enum ETokenType getStringToken(UCHARBUF *buf,
46 UChar32 initialChar,
47 struct UString *token,
48 UErrorCode *status);
49
50 static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
51 static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status);
52 static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
53 static UBool isWhitespace (UChar32 c);
54 static UBool isNewline (UChar32 c);
55
resetLineNumber()56 U_CFUNC void resetLineNumber() {
57 lineCount = 1;
58 }
59
60 /* Read and return the next token from the stream. If the token is of
61 type eString, fill in the token parameter with the token. If the
62 token is eError, then the status parameter will contain the
63 specific error. This will be eItemNotFound at the end of file,
64 indicating that all tokens have been returned. This method will
65 never return eString twice in a row; instead, multiple adjacent
66 string tokens will be merged into one, with no intervening
67 space. */
68 U_CFUNC enum ETokenType
getNextToken(UCHARBUF * buf,struct UString * token,uint32_t * linenumber,struct UString * comment,UErrorCode * status)69 getNextToken(UCHARBUF* buf,
70 struct UString *token,
71 uint32_t *linenumber, /* out: linenumber of token */
72 struct UString *comment,
73 UErrorCode *status) {
74 enum ETokenType result;
75 UChar32 c;
76
77 if (U_FAILURE(*status)) {
78 return TOK_ERROR;
79 }
80
81 /* Skip whitespace */
82 c = getNextChar(buf, true, comment, status);
83
84 if (U_FAILURE(*status)) {
85 return TOK_ERROR;
86 }
87
88 *linenumber = lineCount;
89
90 switch(c) {
91 case BADBOM:
92 return TOK_ERROR;
93 case OPENBRACE:
94 return TOK_OPEN_BRACE;
95 case CLOSEBRACE:
96 return TOK_CLOSE_BRACE;
97 case COMMA:
98 return TOK_COMMA;
99 case U_EOF:
100 return TOK_EOF;
101 case COLON:
102 return TOK_COLON;
103
104 default:
105 result = getStringToken(buf, c, token, status);
106 }
107
108 *linenumber = lineCount;
109 return result;
110 }
111
112 /* Copy a string token into the given UnicodeString. Upon entry, we
113 have already read the first character of the string token, which is
114 not a whitespace character (but may be a QUOTE or ESCAPE). This
115 function reads all subsequent characters that belong with this
116 string, and copy them into the token parameter. The other
117 important, and slightly convoluted purpose of this function is to
118 merge adjacent strings. It looks forward a bit, and if the next
119 non comment, non whitespace item is a string, it reads it in as
120 well. If two adjacent strings are quoted, they are merged without
121 intervening space. Otherwise a single SPACE character is
122 inserted. */
getStringToken(UCHARBUF * buf,UChar32 initialChar,struct UString * token,UErrorCode * status)123 static enum ETokenType getStringToken(UCHARBUF* buf,
124 UChar32 initialChar,
125 struct UString *token,
126 UErrorCode *status) {
127 UBool lastStringWasQuoted;
128 UChar32 c;
129 UChar target[3] = { '\0' };
130 UChar *pTarget = target;
131 int len=0;
132 UBool isFollowingCharEscaped=false;
133 UBool isNLUnescaped = false;
134 UChar32 prevC=0;
135
136 /* We are guaranteed on entry that initialChar is not a whitespace
137 character. If we are at the EOF, or have some other problem, it
138 doesn't matter; we still want to validly return the initialChar
139 (if nothing else) as a string token. */
140
141 if (U_FAILURE(*status)) {
142 return TOK_ERROR;
143 }
144
145 /* setup */
146 lastStringWasQuoted = false;
147 c = initialChar;
148 ustr_setlen(token, 0, status);
149
150 if (U_FAILURE(*status)) {
151 return TOK_ERROR;
152 }
153
154 for (;;) {
155 if (c == QUOTE) {
156 if (!lastStringWasQuoted && token->fLength > 0) {
157 ustr_ucat(token, SPACE, status);
158
159 if (U_FAILURE(*status)) {
160 return TOK_ERROR;
161 }
162 }
163
164 lastStringWasQuoted = true;
165
166 for (;;) {
167 c = ucbuf_getc(buf,status);
168
169 /* EOF reached */
170 if (c == U_EOF) {
171 return TOK_EOF;
172 }
173
174 /* Unterminated quoted strings */
175 if (U_FAILURE(*status)) {
176 return TOK_ERROR;
177 }
178
179 if (c == QUOTE && !isFollowingCharEscaped) {
180 break;
181 }
182
183 if (c == ESCAPE && !isFollowingCharEscaped) {
184 pTarget = target;
185 c = unescape(buf, status);
186
187 if (c == U_ERR) {
188 return TOK_ERROR;
189 }
190 if(c == CR || c == LF){
191 isNLUnescaped = true;
192 }
193 }
194
195 if(c==ESCAPE && !isFollowingCharEscaped){
196 isFollowingCharEscaped = true;
197 }else{
198 U_APPEND_CHAR32(c, pTarget,len);
199 pTarget = target;
200 ustr_uscat(token, pTarget,len, status);
201 isFollowingCharEscaped = false;
202 len=0;
203 if(c == CR || c == LF){
204 if(isNLUnescaped == false && prevC!=CR){
205 lineCount++;
206 }
207 isNLUnescaped = false;
208 }
209 }
210
211 if (U_FAILURE(*status)) {
212 return TOK_ERROR;
213 }
214 prevC = c;
215 }
216 } else {
217 if (token->fLength > 0) {
218 ustr_ucat(token, SPACE, status);
219
220 if (U_FAILURE(*status)) {
221 return TOK_ERROR;
222 }
223 }
224
225 if(lastStringWasQuoted){
226 if(getShowWarning()){
227 warning(lineCount, "Mixing quoted and unquoted strings");
228 }
229 if(isStrict()){
230 return TOK_ERROR;
231 }
232
233 }
234
235 lastStringWasQuoted = false;
236
237 /* if we reach here we are mixing
238 * quoted and unquoted strings
239 * warn in normal mode and error in
240 * pedantic mode
241 */
242
243 if (c == ESCAPE) {
244 pTarget = target;
245 c = unescape(buf, status);
246
247 /* EOF reached */
248 if (c == U_EOF) {
249 return TOK_ERROR;
250 }
251 }
252
253 U_APPEND_CHAR32(c, pTarget,len);
254 pTarget = target;
255 ustr_uscat(token, pTarget,len, status);
256 len=0;
257
258 if (U_FAILURE(*status)) {
259 return TOK_ERROR;
260 }
261
262 for (;;) {
263 /* DON'T skip whitespace */
264 c = getNextChar(buf, false, NULL, status);
265
266 /* EOF reached */
267 if (c == U_EOF) {
268 ucbuf_ungetc(c, buf);
269 return TOK_STRING;
270 }
271
272 if (U_FAILURE(*status)) {
273 return TOK_STRING;
274 }
275
276 if (c == QUOTE
277 || c == OPENBRACE
278 || c == CLOSEBRACE
279 || c == COMMA
280 || c == COLON) {
281 ucbuf_ungetc(c, buf);
282 break;
283 }
284
285 if (isWhitespace(c)) {
286 break;
287 }
288
289 if (c == ESCAPE) {
290 pTarget = target;
291 c = unescape(buf, status);
292
293 if (c == U_ERR) {
294 return TOK_ERROR;
295 }
296 }
297
298 U_APPEND_CHAR32(c, pTarget,len);
299 pTarget = target;
300 ustr_uscat(token, pTarget,len, status);
301 len=0;
302 if (U_FAILURE(*status)) {
303 return TOK_ERROR;
304 }
305 }
306 }
307
308 /* DO skip whitespace */
309 c = getNextChar(buf, true, NULL, status);
310
311 if (U_FAILURE(*status)) {
312 return TOK_STRING;
313 }
314
315 if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
316 ucbuf_ungetc(c, buf);
317 return TOK_STRING;
318 }
319 }
320 }
321
322 /* Retrieve the next character. If skipwhite is
323 true, whitespace is skipped as well. */
getNextChar(UCHARBUF * buf,UBool skipwhite,struct UString * token,UErrorCode * status)324 static UChar32 getNextChar(UCHARBUF* buf,
325 UBool skipwhite,
326 struct UString *token,
327 UErrorCode *status) {
328 UChar32 c, c2;
329
330 if (U_FAILURE(*status)) {
331 return U_EOF;
332 }
333
334 for (;;) {
335 c = ucbuf_getc(buf,status);
336
337 if (c == U_EOF) {
338 return U_EOF;
339 }
340
341 if (skipwhite && isWhitespace(c)) {
342 continue;
343 }
344
345 /* This also handles the get() failing case */
346 if (c != SLASH) {
347 return c;
348 }
349
350 c = ucbuf_getc(buf,status); /* "/c" */
351
352 if (c == U_EOF) {
353 return U_EOF;
354 }
355
356 switch (c) {
357 case SLASH: /* "//" */
358 seekUntilNewline(buf, NULL, status);
359 break;
360
361 case ASTERISK: /* " / * " */
362 c2 = ucbuf_getc(buf, status); /* "/ * c" */
363 if(c2 == ASTERISK){ /* "/ * *" */
364 /* parse multi-line comment and store it in token*/
365 seekUntilEndOfComment(buf, token, status);
366 } else {
367 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
368 seekUntilEndOfComment(buf, NULL, status);
369 }
370 break;
371
372 default:
373 ucbuf_ungetc(c, buf); /* "/c" - put back the c */
374 /* If get() failed this is a NOP */
375 return SLASH;
376 }
377
378 }
379 }
380
/* Consume characters from buf up to and including the next line
   terminator (as recognized by isNewline(), which also advances
   lineCount), or until the input ends or status holds a failure.

   buf    - input to read from
   token  - may be NULL (the only call in this file passes NULL);
            otherwise every character consumed, including the
            terminator, is appended to it
   status - in/out ICU error code; nothing is read if it already holds
            a failure */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 c;

    if (U_FAILURE(*status)) {
        return;
    }

    do {
        c = ucbuf_getc(buf,status);
        /* add the char to token, but never the end-of-input sentinel */
        if(token!=NULL && c != U_EOF){
            ustr_u32cat(token, c, status);
        }
        /* Stop only on a real failure: a warning status must not cut the
           scan short, which is also what the guard at entry tests for. */
    } while (!isNewline(c) && c != U_EOF && !U_FAILURE(*status));
}
398
seekUntilEndOfComment(UCHARBUF * buf,struct UString * token,UErrorCode * status)399 static void seekUntilEndOfComment(UCHARBUF *buf,
400 struct UString *token,
401 UErrorCode *status) {
402 UChar32 c, d;
403 uint32_t line;
404
405 if (U_FAILURE(*status)) {
406 return;
407 }
408
409 line = lineCount;
410
411 do {
412 c = ucbuf_getc(buf, status);
413
414 if (c == ASTERISK) {
415 d = ucbuf_getc(buf, status);
416
417 if (d != SLASH) {
418 ucbuf_ungetc(d, buf);
419 } else {
420 break;
421 }
422 }
423 /* add the char to token */
424 if(token!=NULL){
425 ustr_u32cat(token, c, status);
426 }
427 /* increment the lineCount */
428 isNewline(c);
429
430 } while (c != U_EOF && *status == U_ZERO_ERROR);
431
432 if (c == U_EOF) {
433 *status = U_INVALID_FORMAT_ERROR;
434 error(line, "unterminated comment detected");
435 }
436 }
437
unescape(UCHARBUF * buf,UErrorCode * status)438 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
439 if (U_FAILURE(*status)) {
440 return U_EOF;
441 }
442
443 /* We expect to be called after the ESCAPE has been seen, but
444 * u_fgetcx needs an ESCAPE to do its magic. */
445 ucbuf_ungetc(ESCAPE, buf);
446
447 return ucbuf_getcx32(buf, status);
448 }
449
isWhitespace(UChar32 c)450 static UBool isWhitespace(UChar32 c) {
451 switch (c) {
452 /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
453 case 0x000A:
454 case 0x2029:
455 lineCount++;
456 case 0x000D:
457 case 0x0020:
458 case 0x0009:
459 case 0xFEFF:
460 return true;
461
462 default:
463 return false;
464 }
465 }
466
isNewline(UChar32 c)467 static UBool isNewline(UChar32 c) {
468 switch (c) {
469 /* '\n', '\r', 0x2029 */
470 case 0x000A:
471 case 0x2029:
472 lineCount++;
473 case 0x000D:
474 return true;
475
476 default:
477 return false;
478 }
479 }
480