• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2000-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  uparse.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2000apr18
14 *   created by: Markus W. Scherer
15 *
16 *   This file provides a parser for files that are delimited by one single
17 *   character like ';' or TAB. Example: the Unicode Character Properties files
18 *   like UnicodeData.txt are semicolon-delimited.
19 */
20 
21 #include "unicode/utypes.h"
22 #include "cstring.h"
23 #include "filestrm.h"
24 #include "uparse.h"
25 #include "unicode/uchar.h"
26 #include "unicode/ustring.h"
27 #include "ustr_imp.h"
28 
29 #include <stdio.h>
30 
31 U_CAPI const char * U_EXPORT2
u_skipWhitespace(const char * s)32 u_skipWhitespace(const char *s) {
33     while(U_IS_INV_WHITESPACE(*s)) {
34         ++s;
35     }
36     return s;
37 }
38 
39 U_CAPI char * U_EXPORT2
u_rtrim(char * s)40 u_rtrim(char *s) {
41     char *end=uprv_strchr(s, 0);
42     while(s<end && U_IS_INV_WHITESPACE(*(end-1))) {
43         *--end = 0;
44     }
45     return end;
46 }
47 
/*
 * Recognize a "# @missing:" prefix, allowing any amount of invariant
 * white space before and between the tokens '#', '@', "missing" and ':'.
 * Unicode 5.0 adds such lines in some data files to document
 * default property values.
 *
 * Returns a pointer to the first non-whitespace character after the ':'
 * if the prefix is present, otherwise the original pointer s unchanged.
 */
static const char *
getMissingLimit(const char *s) {
    const char *p;

    p=u_skipWhitespace(s);
    if(*p!='#') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(*p!='@') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(0!=strncmp(p, "missing", 7)) {
        return s;
    }

    p=u_skipWhitespace(p+7);
    if(*p!=':') {
        return s;
    }

    return u_skipWhitespace(p+1);
}
70 
71 U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char * filename,char delimiter,char * fields[][2],int32_t fieldCount,UParseLineFn * lineFn,void * context,UErrorCode * pErrorCode)72 u_parseDelimitedFile(const char *filename, char delimiter,
73                      char *fields[][2], int32_t fieldCount,
74                      UParseLineFn *lineFn, void *context,
75                      UErrorCode *pErrorCode) {
76     FileStream *file;
77     char line[300];
78     char *start, *limit;
79     int32_t i, length;
80 
81     if(U_FAILURE(*pErrorCode)) {
82         return;
83     }
84 
85     if(fields==NULL || lineFn==NULL || fieldCount<=0) {
86         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
87         return;
88     }
89 
90     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
91         filename=NULL;
92         file=T_FileStream_stdin();
93     } else {
94         file=T_FileStream_open(filename, "r");
95     }
96     if(file==NULL) {
97         *pErrorCode=U_FILE_ACCESS_ERROR;
98         return;
99     }
100 
101     while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
102         /* remove trailing newline characters */
103         length=(int32_t)(u_rtrim(line)-line);
104 
105         /*
106          * detect a line with # @missing:
107          * start parsing after that, or else from the beginning of the line
108          * set the default warning for @missing lines
109          */
110         start=(char *)getMissingLimit(line);
111         if(start==line) {
112             *pErrorCode=U_ZERO_ERROR;
113         } else {
114             *pErrorCode=U_USING_DEFAULT_WARNING;
115         }
116 
117         /* skip this line if it is empty or a comment */
118         if(*start==0 || *start=='#') {
119             continue;
120         }
121 
122         /* remove in-line comments */
123         limit=uprv_strchr(start, '#');
124         if(limit!=NULL) {
125             /* get white space before the pound sign */
126             while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
127                 --limit;
128             }
129 
130             /* truncate the line */
131             *limit=0;
132         }
133 
134         /* skip lines with only whitespace */
135         if(u_skipWhitespace(start)[0]==0) {
136             continue;
137         }
138 
139         /* for each field, call the corresponding field function */
140         for(i=0; i<fieldCount; ++i) {
141             /* set the limit pointer of this field */
142             limit=start;
143             while(*limit!=delimiter && *limit!=0) {
144                 ++limit;
145             }
146 
147             /* set the field start and limit in the fields array */
148             fields[i][0]=start;
149             fields[i][1]=limit;
150 
151             /* set start to the beginning of the next field, if any */
152             start=limit;
153             if(*start!=0) {
154                 ++start;
155             } else if(i+1<fieldCount) {
156                 *pErrorCode=U_PARSE_ERROR;
157                 limit=line+length;
158                 i=fieldCount;
159                 break;
160             }
161         }
162 
163         /* error in a field function? */
164         if(U_FAILURE(*pErrorCode)) {
165             break;
166         }
167 
168         /* call the field function */
169         lineFn(context, fields, fieldCount, pErrorCode);
170         if(U_FAILURE(*pErrorCode)) {
171             break;
172         }
173     }
174 
175     if(filename!=NULL) {
176         T_FileStream_close(file);
177     }
178 }
179 
180 /*
181  * parse a list of code points
182  * store them as a UTF-32 string in dest[destCapacity]
183  * return the number of code points
184  */
185 U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char * s,uint32_t * dest,int32_t destCapacity,UErrorCode * pErrorCode)186 u_parseCodePoints(const char *s,
187                   uint32_t *dest, int32_t destCapacity,
188                   UErrorCode *pErrorCode) {
189     char *end;
190     uint32_t value;
191     int32_t count;
192 
193     if(U_FAILURE(*pErrorCode)) {
194         return 0;
195     }
196     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
197         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
198         return 0;
199     }
200 
201     count=0;
202     for(;;) {
203         s=u_skipWhitespace(s);
204         if(*s==';' || *s==0) {
205             return count;
206         }
207 
208         /* read one code point */
209         value=(uint32_t)uprv_strtoul(s, &end, 16);
210         if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
211             *pErrorCode=U_PARSE_ERROR;
212             return 0;
213         }
214 
215         /* append it to the destination array */
216         if(count<destCapacity) {
217             dest[count++]=value;
218         } else {
219             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
220         }
221 
222         /* go to the following characters */
223         s=end;
224     }
225 }
226 
227 /*
228  * parse a list of code points
229  * store them as a string in dest[destCapacity]
230  * set the first code point in *pFirst
231  * @return The length of the string in numbers of UChars.
232  */
233 U_CAPI int32_t U_EXPORT2
u_parseString(const char * s,UChar * dest,int32_t destCapacity,uint32_t * pFirst,UErrorCode * pErrorCode)234 u_parseString(const char *s,
235               UChar *dest, int32_t destCapacity,
236               uint32_t *pFirst,
237               UErrorCode *pErrorCode) {
238     char *end;
239     uint32_t value;
240     int32_t destLength;
241 
242     if(U_FAILURE(*pErrorCode)) {
243         return 0;
244     }
245     if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
246         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
247     }
248 
249     if(pFirst!=NULL) {
250         *pFirst=0xffffffff;
251     }
252 
253     destLength=0;
254     for(;;) {
255         s=u_skipWhitespace(s);
256         if(*s==';' || *s==0) {
257             if(destLength<destCapacity) {
258                 dest[destLength]=0;
259             } else if(destLength==destCapacity) {
260                 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
261             } else {
262                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
263             }
264             return destLength;
265         }
266 
267         /* read one code point */
268         value=(uint32_t)uprv_strtoul(s, &end, 16);
269         if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
270             *pErrorCode=U_PARSE_ERROR;
271             return 0;
272         }
273 
274         /* store the first code point */
275         if(pFirst!=NULL) {
276             *pFirst=value;
277             pFirst=NULL;
278         }
279 
280         /* append it to the destination array */
281         if((destLength+U16_LENGTH(value))<=destCapacity) {
282             U16_APPEND_UNSAFE(dest, destLength, value);
283         } else {
284             destLength+=U16_LENGTH(value);
285         }
286 
287         /* go to the following characters */
288         s=end;
289     }
290 }
291 
292 /* read a range like start or start..end */
293 U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char * s,uint32_t * pStart,uint32_t * pEnd,const char ** terminator,UErrorCode * pErrorCode)294 u_parseCodePointRangeAnyTerminator(const char *s,
295                                    uint32_t *pStart, uint32_t *pEnd,
296                                    const char **terminator,
297                                    UErrorCode *pErrorCode) {
298     char *end;
299     uint32_t value;
300 
301     if(U_FAILURE(*pErrorCode)) {
302         return 0;
303     }
304     if(s==NULL || pStart==NULL || pEnd==NULL) {
305         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
306         return 0;
307     }
308 
309     /* read the start code point */
310     s=u_skipWhitespace(s);
311     value=(uint32_t)uprv_strtoul(s, &end, 16);
312     if(end<=s || value>=0x110000) {
313         *pErrorCode=U_PARSE_ERROR;
314         return 0;
315     }
316     *pStart=*pEnd=value;
317 
318     /* is there a "..end"? */
319     s=u_skipWhitespace(end);
320     if(*s!='.' || s[1]!='.') {
321         *terminator=end;
322         return 1;
323     }
324     s=u_skipWhitespace(s+2);
325 
326     /* read the end code point */
327     value=(uint32_t)uprv_strtoul(s, &end, 16);
328     if(end<=s || value>=0x110000) {
329         *pErrorCode=U_PARSE_ERROR;
330         return 0;
331     }
332     *pEnd=value;
333 
334     /* is this a valid range? */
335     if(value<*pStart) {
336         *pErrorCode=U_PARSE_ERROR;
337         return 0;
338     }
339 
340     *terminator=end;
341     return value-*pStart+1;
342 }
343 
344 U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char * s,uint32_t * pStart,uint32_t * pEnd,UErrorCode * pErrorCode)345 u_parseCodePointRange(const char *s,
346                       uint32_t *pStart, uint32_t *pEnd,
347                       UErrorCode *pErrorCode) {
348     const char *terminator;
349     int32_t rangeLength=
350         u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
351     if(U_SUCCESS(*pErrorCode)) {
352         terminator=u_skipWhitespace(terminator);
353         if(*terminator!=';' && *terminator!=0) {
354             *pErrorCode=U_PARSE_ERROR;
355             return 0;
356         }
357     }
358     return rangeLength;
359 }
360 
361 U_CAPI int32_t U_EXPORT2
u_parseUTF8(const char * source,int32_t sLen,char * dest,int32_t destCapacity,UErrorCode * status)362 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
363     const char *read = source;
364     int32_t i = 0;
365     unsigned int value = 0;
366     if(sLen == -1) {
367         sLen = (int32_t)strlen(source);
368     }
369 
370     while(read < source+sLen) {
371         sscanf(read, "%2x", &value);
372         if(i < destCapacity) {
373             dest[i] = (char)value;
374         }
375         i++;
376         read += 2;
377     }
378     return u_terminateChars(dest, destCapacity, i, status);
379 }
380