1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2000-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: uparse.c
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2000apr18
16 * created by: Markus W. Scherer
17 *
18 * This file provides a parser for files that are delimited by one single
19 * character like ';' or TAB. Example: the Unicode Character Properties files
20 * like UnicodeData.txt are semicolon-delimited.
21 */
22
23 #include "unicode/utypes.h"
24 #include "unicode/uchar.h"
25 #include "unicode/ustring.h"
26 #include "unicode/utf16.h"
27 #include "cstring.h"
28 #include "filestrm.h"
29 #include "uparse.h"
30 #include "ustr_imp.h"
31
32 #include <stdio.h>
33
34 U_CAPI const char * U_EXPORT2
u_skipWhitespace(const char * s)35 u_skipWhitespace(const char *s) {
36 while(U_IS_INV_WHITESPACE(*s)) {
37 ++s;
38 }
39 return s;
40 }
41
42 U_CAPI char * U_EXPORT2
u_rtrim(char * s)43 u_rtrim(char *s) {
44 char *end=uprv_strchr(s, 0);
45 while(s<end && U_IS_INV_WHITESPACE(*(end-1))) {
46 *--end = 0;
47 }
48 return end;
49 }
50
51 /*
52 * If the string starts with # @missing: then return the pointer to the
53 * following non-whitespace character.
54 * Otherwise return the original pointer.
55 * Unicode 5.0 adds such lines in some data files to document
56 * default property values.
57 * Poor man's regex for variable amounts of white space.
58 */
59 static const char *
getMissingLimit(const char * s)60 getMissingLimit(const char *s) {
61 const char *s0=s;
62 if(
63 *(s=u_skipWhitespace(s))=='#' &&
64 *(s=u_skipWhitespace(s+1))=='@' &&
65 0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) &&
66 *(s=u_skipWhitespace(s+7))==':'
67 ) {
68 return u_skipWhitespace(s+1);
69 } else {
70 return s0;
71 }
72 }
73
74 U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char * filename,char delimiter,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)75 u_parseDelimitedFile(const char *filename, char delimiter,
76 char *fields[][2], int32_t fieldCount,
77 UParseLineFn *lineFn, void *context,
78 UErrorCode *pErrorCode) {
79 FileStream *file;
80 char line[10000];
81 char *start, *limit;
82 int32_t i, length;
83
84 if(U_FAILURE(*pErrorCode)) {
85 return;
86 }
87
88 if(fields==nullptr || lineFn==nullptr || fieldCount<=0) {
89 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
90 return;
91 }
92
93 if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) {
94 filename=nullptr;
95 file=T_FileStream_stdin();
96 } else {
97 file=T_FileStream_open(filename, "r");
98 }
99 if(file==nullptr) {
100 *pErrorCode=U_FILE_ACCESS_ERROR;
101 return;
102 }
103
104 while(T_FileStream_readLine(file, line, sizeof(line))!=nullptr) {
105 /* remove trailing newline characters */
106 length=(int32_t)(u_rtrim(line)-line);
107
108 /*
109 * detect a line with # @missing:
110 * start parsing after that, or else from the beginning of the line
111 * set the default warning for @missing lines
112 */
113 start=(char *)getMissingLimit(line);
114 if(start==line) {
115 *pErrorCode=U_ZERO_ERROR;
116 } else {
117 *pErrorCode=U_USING_DEFAULT_WARNING;
118 }
119
120 /* skip this line if it is empty or a comment */
121 if(*start==0 || *start=='#') {
122 continue;
123 }
124
125 /* remove in-line comments */
126 limit=uprv_strchr(start, '#');
127 if(limit!=nullptr) {
128 /* get white space before the pound sign */
129 while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
130 --limit;
131 }
132
133 /* truncate the line */
134 *limit=0;
135 }
136
137 /* skip lines with only whitespace */
138 if(u_skipWhitespace(start)[0]==0) {
139 continue;
140 }
141
142 /* for each field, call the corresponding field function */
143 for(i=0; i<fieldCount; ++i) {
144 /* set the limit pointer of this field */
145 limit=start;
146 while(*limit!=delimiter && *limit!=0) {
147 ++limit;
148 }
149
150 /* set the field start and limit in the fields array */
151 fields[i][0]=start;
152 fields[i][1]=limit;
153
154 /* set start to the beginning of the next field, if any */
155 start=limit;
156 if(*start!=0) {
157 ++start;
158 } else if(i+1<fieldCount) {
159 *pErrorCode=U_PARSE_ERROR;
160 limit=line+length;
161 i=fieldCount;
162 break;
163 }
164 }
165
166 /* too few fields? */
167 if(U_FAILURE(*pErrorCode)) {
168 break;
169 }
170
171 /* call the field function */
172 lineFn(context, fields, fieldCount, pErrorCode);
173 if(U_FAILURE(*pErrorCode)) {
174 break;
175 }
176 }
177
178 if(filename!=nullptr) {
179 T_FileStream_close(file);
180 }
181 }
182
183 /*
184 * parse a list of code points
185 * store them as a UTF-32 string in dest[destCapacity]
186 * return the number of code points
187 */
188 U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char * s,uint32_t * dest,int32_t destCapacity,UErrorCode * pErrorCode)189 u_parseCodePoints(const char *s,
190 uint32_t *dest, int32_t destCapacity,
191 UErrorCode *pErrorCode) {
192 char *end;
193 uint32_t value;
194 int32_t count;
195
196 if(U_FAILURE(*pErrorCode)) {
197 return 0;
198 }
199 if(s==nullptr || destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
200 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
201 return 0;
202 }
203
204 count=0;
205 for(;;) {
206 s=u_skipWhitespace(s);
207 if(*s==';' || *s==0) {
208 return count;
209 }
210
211 /* read one code point */
212 value=(uint32_t)uprv_strtoul(s, &end, 16);
213 if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
214 *pErrorCode=U_PARSE_ERROR;
215 return 0;
216 }
217
218 /* append it to the destination array */
219 if(count<destCapacity) {
220 dest[count++]=value;
221 } else {
222 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
223 }
224
225 /* go to the following characters */
226 s=end;
227 }
228 }
229
230 /*
231 * parse a list of code points
232 * store them as a string in dest[destCapacity]
233 * set the first code point in *pFirst
234 * @return The length of the string in numbers of UChars.
235 */
236 U_CAPI int32_t U_EXPORT2
u_parseString(const char * s,char16_t * dest,int32_t destCapacity,uint32_t * pFirst,UErrorCode * pErrorCode)237 u_parseString(const char *s,
238 char16_t *dest, int32_t destCapacity,
239 uint32_t *pFirst,
240 UErrorCode *pErrorCode) {
241 char *end;
242 uint32_t value;
243 int32_t destLength;
244
245 if(U_FAILURE(*pErrorCode)) {
246 return 0;
247 }
248 if(s==nullptr || destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
249 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
250 return 0;
251 }
252
253 if(pFirst!=nullptr) {
254 *pFirst=0xffffffff;
255 }
256
257 destLength=0;
258 for(;;) {
259 s=u_skipWhitespace(s);
260 if(*s==';' || *s==0) {
261 if(destLength<destCapacity) {
262 dest[destLength]=0;
263 } else if(destLength==destCapacity) {
264 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
265 } else {
266 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
267 }
268 return destLength;
269 }
270
271 /* read one code point */
272 value=(uint32_t)uprv_strtoul(s, &end, 16);
273 if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
274 *pErrorCode=U_PARSE_ERROR;
275 return 0;
276 }
277
278 /* store the first code point */
279 if(pFirst!=nullptr) {
280 *pFirst=value;
281 pFirst=nullptr;
282 }
283
284 /* append it to the destination array */
285 if((destLength+U16_LENGTH(value))<=destCapacity) {
286 U16_APPEND_UNSAFE(dest, destLength, value);
287 } else {
288 destLength+=U16_LENGTH(value);
289 }
290
291 /* go to the following characters */
292 s=end;
293 }
294 }
295
296 /* read a range like start or start..end */
297 U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char * s,uint32_t * pStart,uint32_t * pEnd,const char ** terminator,UErrorCode * pErrorCode)298 u_parseCodePointRangeAnyTerminator(const char *s,
299 uint32_t *pStart, uint32_t *pEnd,
300 const char **terminator,
301 UErrorCode *pErrorCode) {
302 char *end;
303 uint32_t value;
304
305 if(U_FAILURE(*pErrorCode)) {
306 return 0;
307 }
308 if(s==nullptr || pStart==nullptr || pEnd==nullptr) {
309 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
310 return 0;
311 }
312
313 /* read the start code point */
314 s=u_skipWhitespace(s);
315 value=(uint32_t)uprv_strtoul(s, &end, 16);
316 if(end<=s || value>=0x110000) {
317 *pErrorCode=U_PARSE_ERROR;
318 return 0;
319 }
320 *pStart=*pEnd=value;
321
322 /* is there a "..end"? */
323 s=u_skipWhitespace(end);
324 if(*s!='.' || s[1]!='.') {
325 *terminator=end;
326 return 1;
327 }
328 s=u_skipWhitespace(s+2);
329
330 /* read the end code point */
331 value=(uint32_t)uprv_strtoul(s, &end, 16);
332 if(end<=s || value>=0x110000) {
333 *pErrorCode=U_PARSE_ERROR;
334 return 0;
335 }
336 *pEnd=value;
337
338 /* is this a valid range? */
339 if(value<*pStart) {
340 *pErrorCode=U_PARSE_ERROR;
341 return 0;
342 }
343
344 *terminator=end;
345 return value-*pStart+1;
346 }
347
348 U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char * s,uint32_t * pStart,uint32_t * pEnd,UErrorCode * pErrorCode)349 u_parseCodePointRange(const char *s,
350 uint32_t *pStart, uint32_t *pEnd,
351 UErrorCode *pErrorCode) {
352 const char *terminator;
353 int32_t rangeLength=
354 u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
355 if(U_SUCCESS(*pErrorCode)) {
356 terminator=u_skipWhitespace(terminator);
357 if(*terminator!=';' && *terminator!=0) {
358 *pErrorCode=U_PARSE_ERROR;
359 return 0;
360 }
361 }
362 return rangeLength;
363 }
364
365 U_CAPI int32_t U_EXPORT2
u_parseUTF8(const char * source,int32_t sLen,char * dest,int32_t destCapacity,UErrorCode * status)366 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
367 const char *read = source;
368 int32_t i = 0;
369 unsigned int value = 0;
370 if(sLen == -1) {
371 sLen = (int32_t)strlen(source);
372 }
373
374 while(read < source+sLen) {
375 sscanf(read, "%2x", &value);
376 if(i < destCapacity) {
377 dest[i] = (char)value;
378 }
379 i++;
380 read += 2;
381 }
382 return u_terminateChars(dest, destCapacity, i, status);
383 }
384