• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
*******************************************************************************
*
*   Copyright (C) 2000-2007, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uparse.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000apr18
*   created by: Markus W. Scherer
*
*   This file provides a parser for files that are delimited by one single
*   character like ';' or TAB. Example: the Unicode Character Properties files
*   like UnicodeData.txt are semicolon-delimited.
*/
20 
21 #include "unicode/utypes.h"
22 #include "cstring.h"
23 #include "filestrm.h"
24 #include "uparse.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ustring.h"
27 #include "ustr_imp.h"
28 
29 #include <stdio.h>
30 
31 U_CAPI const char * U_EXPORT2
u_skipWhitespace(const char * s)32 u_skipWhitespace(const char *s) {
33     while(*s==' ' || *s=='\t') {
34         ++s;
35     }
36     return s;
37 }
38 
/*
 * Recognize a "# @missing:" prefix, with optional spaces/TABs before and
 * between its pieces ('#', '@', "missing", ':').
 * If the whole prefix is present, return a pointer to the first
 * non-whitespace character after the colon.
 * If any piece is absent, return the original pointer s unchanged.
 * Unicode 5.0 adds such lines in some data files to document
 * default property values.
 */
static const char *
getMissingLimit(const char *s) {
    const char *p;

    p=u_skipWhitespace(s);
    if(*p!='#') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(*p!='@') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(0!=strncmp(p, "missing", 7)) {
        return s;
    }

    /* 7 is the length of "missing" */
    p=u_skipWhitespace(p+7);
    if(*p!=':') {
        return s;
    }

    return u_skipWhitespace(p+1);
}
61 
62 U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char * filename,char delimiter,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)63 u_parseDelimitedFile(const char *filename, char delimiter,
64                      char *fields[][2], int32_t fieldCount,
65                      UParseLineFn *lineFn, void *context,
66                      UErrorCode *pErrorCode) {
67     FileStream *file;
68     char line[300];
69     char *start, *limit;
70     int32_t i, length;
71 
72     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
73         return;
74     }
75 
76     if(fields==NULL || lineFn==NULL || fieldCount<=0) {
77         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
78         return;
79     }
80 
81     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
82         filename=NULL;
83         file=T_FileStream_stdin();
84     } else {
85         file=T_FileStream_open(filename, "r");
86     }
87     if(file==NULL) {
88         *pErrorCode=U_FILE_ACCESS_ERROR;
89         return;
90     }
91 
92     while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
93         length=(int32_t)uprv_strlen(line);
94 
95         /* remove trailing newline characters */
96         while(length>0 && (line[length-1]=='\r' || line[length-1]=='\n')) {
97             line[--length]=0;
98         }
99 
100         /*
101          * detect a line with # @missing:
102          * start parsing after that, or else from the beginning of the line
103          * set the default warning for @missing lines
104          */
105         start=(char *)getMissingLimit(line);
106         if(start==line) {
107             *pErrorCode=U_ZERO_ERROR;
108         } else {
109             *pErrorCode=U_USING_DEFAULT_WARNING;
110         }
111 
112         /* skip this line if it is empty or a comment */
113         if(*start==0 || *start=='#') {
114             continue;
115         }
116 
117         /* remove in-line comments */
118         limit=uprv_strchr(start, '#');
119         if(limit!=NULL) {
120             /* get white space before the pound sign */
121             while(limit>start && (*(limit-1)==' ' || *(limit-1)=='\t')) {
122                 --limit;
123             }
124 
125             /* truncate the line */
126             *limit=0;
127         }
128 
129         /* skip lines with only whitespace */
130         if(u_skipWhitespace(start)[0]==0) {
131             continue;
132         }
133 
134         /* for each field, call the corresponding field function */
135         for(i=0; i<fieldCount; ++i) {
136             /* set the limit pointer of this field */
137             limit=start;
138             while(*limit!=delimiter && *limit!=0) {
139                 ++limit;
140             }
141 
142             /* set the field start and limit in the fields array */
143             fields[i][0]=start;
144             fields[i][1]=limit;
145 
146             /* set start to the beginning of the next field, if any */
147             start=limit;
148             if(*start!=0) {
149                 ++start;
150             } else if(i+1<fieldCount) {
151                 *pErrorCode=U_PARSE_ERROR;
152                 limit=line+length;
153                 i=fieldCount;
154                 break;
155             }
156         }
157 
158         /* error in a field function? */
159         if(U_FAILURE(*pErrorCode)) {
160             break;
161         }
162 
163         /* call the field function */
164         lineFn(context, fields, fieldCount, pErrorCode);
165         if(U_FAILURE(*pErrorCode)) {
166             break;
167         }
168     }
169 
170     if(filename!=NULL) {
171         T_FileStream_close(file);
172     }
173 }
174 
175 /*
176  * parse a list of code points
177  * store them as a UTF-32 string in dest[destCapacity]
178  * return the number of code points
179  */
180 U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char * s,uint32_t * dest,int32_t destCapacity,UErrorCode * pErrorCode)181 u_parseCodePoints(const char *s,
182                   uint32_t *dest, int32_t destCapacity,
183                   UErrorCode *pErrorCode) {
184     char *end;
185     uint32_t value;
186     int32_t count;
187 
188     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
189         return 0;
190     }
191     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
192         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
193         return 0;
194     }
195 
196     count=0;
197     for(;;) {
198         s=u_skipWhitespace(s);
199         if(*s==';' || *s==0) {
200             return count;
201         }
202 
203         /* read one code point */
204         value=(uint32_t)uprv_strtoul(s, &end, 16);
205         if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) {
206             *pErrorCode=U_PARSE_ERROR;
207             return 0;
208         }
209 
210         /* append it to the destination array */
211         if(count<destCapacity) {
212             dest[count++]=value;
213         } else {
214             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
215         }
216 
217         /* go to the following characters */
218         s=end;
219     }
220 }
221 
222 /*
223  * parse a list of code points
224  * store them as a string in dest[destCapacity]
225  * set the first code point in *pFirst
226  * @return The length of the string in numbers of UChars.
227  */
228 U_CAPI int32_t U_EXPORT2
u_parseString(const char * s,UChar * dest,int32_t destCapacity,uint32_t * pFirst,UErrorCode * pErrorCode)229 u_parseString(const char *s,
230               UChar *dest, int32_t destCapacity,
231               uint32_t *pFirst,
232               UErrorCode *pErrorCode) {
233     char *end;
234     uint32_t value;
235     int32_t destLength;
236 
237     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
238         return 0;
239     }
240     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
241         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
242     }
243 
244     if(pFirst!=NULL) {
245         *pFirst=0xffffffff;
246     }
247 
248     destLength=0;
249     for(;;) {
250         s=u_skipWhitespace(s);
251         if(*s==';' || *s==0) {
252             if(destLength<destCapacity) {
253                 dest[destLength]=0;
254             } else if(destLength==destCapacity) {
255                 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
256             } else {
257                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
258             }
259             return destLength;
260         }
261 
262         /* read one code point */
263         value=(uint32_t)uprv_strtoul(s, &end, 16);
264         if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) {
265             *pErrorCode=U_PARSE_ERROR;
266             return 0;
267         }
268 
269         /* store the first code point */
270         if(destLength==0 && pFirst!=NULL) {
271             *pFirst=value;
272         }
273 
274         /* append it to the destination array */
275         if((destLength+UTF_CHAR_LENGTH(value))<=destCapacity) {
276             UTF_APPEND_CHAR_UNSAFE(dest, destLength, value);
277         } else {
278             destLength+=UTF_CHAR_LENGTH(value);
279         }
280 
281         /* go to the following characters */
282         s=end;
283     }
284 }
285 
286 /* read a range like start or start..end */
287 U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char * s,uint32_t * pStart,uint32_t * pEnd,UErrorCode * pErrorCode)288 u_parseCodePointRange(const char *s,
289                       uint32_t *pStart, uint32_t *pEnd,
290                       UErrorCode *pErrorCode) {
291     char *end;
292     uint32_t value;
293 
294     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
295         return 0;
296     }
297     if(s==NULL || pStart==NULL || pEnd==NULL) {
298         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
299         return 0;
300     }
301 
302     s=u_skipWhitespace(s);
303     if(*s==';' || *s==0) {
304         *pErrorCode=U_PARSE_ERROR;
305         return 0;
306     }
307 
308     /* read the start code point */
309     value=(uint32_t)uprv_strtoul(s, &end, 16);
310     if(end<=s || (*end!=' ' && *end!='\t' && *end!='.' && *end!=';') || value>=0x110000) {
311         *pErrorCode=U_PARSE_ERROR;
312         return 0;
313     }
314     *pStart=*pEnd=value;
315 
316     /* is there a "..end"? */
317     s=u_skipWhitespace(end);
318     if(*s==';' || *s==0) {
319         return 1;
320     }
321 
322     if(*s!='.' || s[1]!='.') {
323         *pErrorCode=U_PARSE_ERROR;
324         return 0;
325     }
326     s+=2;
327 
328     /* read the end code point */
329     value=(uint32_t)uprv_strtoul(s, &end, 16);
330     if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) {
331         *pErrorCode=U_PARSE_ERROR;
332         return 0;
333     }
334     *pEnd=value;
335 
336     /* is this a valid range? */
337     if(value<*pStart) {
338         *pErrorCode=U_PARSE_ERROR;
339         return 0;
340     }
341 
342     /* no garbage after that? */
343     s=u_skipWhitespace(end);
344     if(*s==';' || *s==0) {
345         return value-*pStart+1;
346     } else {
347         *pErrorCode=U_PARSE_ERROR;
348         return 0;
349     }
350 }
351 
352 U_CAPI int32_t U_EXPORT2
u_parseUTF8(const char * source,int32_t sLen,char * dest,int32_t destCapacity,UErrorCode * status)353 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
354     const char *read = source;
355     int32_t i = 0;
356     unsigned int value = 0;
357     if(sLen == -1) {
358         sLen = (int32_t)strlen(source);
359     }
360 
361     while(read < source+sLen) {
362         sscanf(read, "%2x", &value);
363         if(i < destCapacity) {
364             dest[i] = (char)value;
365         }
366         i++;
367         read += 2;
368     }
369     return u_terminateChars(dest, destCapacity, i, status);
370 }
371