• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2002-2006, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  props2.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002feb24
14 *   created by: Markus W. Scherer
15 *
16 *   Parse more Unicode Character Database files and store
17 *   additional Unicode character properties in bit set vectors.
18 */
19 
20 #include <stdio.h>
21 #include "unicode/utypes.h"
22 #include "unicode/uchar.h"
23 #include "unicode/uscript.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "utrie.h"
27 #include "uprops.h"
28 #include "propsvec.h"
29 #include "uparse.h"
30 #include "writesrc.h"
31 #include "genprops.h"
32 
/*
 * Number of elements of a true array (not of a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * single primary expression wherever it is used (for example after sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
34 
35 /* data --------------------------------------------------------------------- */
36 
37 static UNewTrie *trie;
38 uint32_t *pv;
39 static int32_t pvCount;
40 
41 /* miscellaneous ------------------------------------------------------------ */
42 
/*
 * Trim a field in place.
 *
 * Skips leading white space (via u_skipWhitespace()), backs the limit up over
 * trailing spaces and tabs, and writes a NUL terminator at the resulting end.
 *
 * @param s     start of the field
 * @param limit one past the last character of the field; the byte at the
 *              (possibly moved) limit is overwritten with 0
 * @return pointer to the first non-white-space character of the field
 */
static char *
trimTerminateField(char *s, char *limit) {
    char *first=(char *)u_skipWhitespace(s);
    char *end=limit;

    /* walk backwards while the character before the end is a space or tab */
    for(; first<end; --end) {
        char last=end[-1];
        if(last!=' ' && last!='\t') {
            break;
        }
    }
    *end=0;

    return first;
}
56 
57 static void
parseTwoFieldFile(char * filename,char * basename,const char * ucdFile,const char * suffix,UParseLineFn * lineFn,UErrorCode * pErrorCode)58 parseTwoFieldFile(char *filename, char *basename,
59                   const char *ucdFile, const char *suffix,
60                   UParseLineFn *lineFn,
61                   UErrorCode *pErrorCode) {
62     char *fields[2][2];
63 
64     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
65         return;
66     }
67 
68     writeUCDFilename(basename, ucdFile, suffix);
69 
70     u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
71     if(U_FAILURE(*pErrorCode)) {
72         fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
73     }
74 }
75 
76 static void U_CALLCONV
77 ageLineFn(void *context,
78           char *fields[][2], int32_t fieldCount,
79           UErrorCode *pErrorCode);
80 
81 static void
parseMultiFieldFile(char * filename,char * basename,const char * ucdFile,const char * suffix,int32_t fieldCount,UParseLineFn * lineFn,UErrorCode * pErrorCode)82 parseMultiFieldFile(char *filename, char *basename,
83                     const char *ucdFile, const char *suffix,
84                     int32_t fieldCount,
85                     UParseLineFn *lineFn,
86                     UErrorCode *pErrorCode) {
87     char *fields[20][2];
88 
89     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
90         return;
91     }
92 
93     writeUCDFilename(basename, ucdFile, suffix);
94 
95     u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode);
96     if(U_FAILURE(*pErrorCode)) {
97         fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
98     }
99 }
100 
101 static void U_CALLCONV
102 numericLineFn(void *context,
103               char *fields[][2], int32_t fieldCount,
104               UErrorCode *pErrorCode);
105 
106 /* parse files with single enumerated properties ---------------------------- */
107 
108 struct SingleEnum {
109     const char *ucdFile, *propName;
110     UProperty prop;
111     int32_t vecWord, vecShift;
112     uint32_t vecMask;
113 };
114 typedef struct SingleEnum SingleEnum;
115 
116 static void
117 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
118                     const SingleEnum *sen,
119                     UErrorCode *pErrorCode);
120 
121 static const SingleEnum scriptSingleEnum={
122     "Scripts", "script",
123     UCHAR_SCRIPT,
124     0, 0, UPROPS_SCRIPT_MASK
125 };
126 
127 static const SingleEnum blockSingleEnum={
128     "Blocks", "block",
129     UCHAR_BLOCK,
130     0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
131 };
132 
133 static const SingleEnum graphemeClusterBreakSingleEnum={
134     "GraphemeBreakProperty", "Grapheme_Cluster_Break",
135     UCHAR_GRAPHEME_CLUSTER_BREAK,
136     2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK
137 };
138 
139 static const SingleEnum wordBreakSingleEnum={
140     "WordBreakProperty", "Word_Break",
141     UCHAR_WORD_BREAK,
142     2, UPROPS_WB_SHIFT, UPROPS_WB_MASK
143 };
144 
145 static const SingleEnum sentenceBreakSingleEnum={
146     "SentenceBreakProperty", "Sentence_Break",
147     UCHAR_SENTENCE_BREAK,
148     2, UPROPS_SB_SHIFT, UPROPS_SB_MASK
149 };
150 
151 static const SingleEnum lineBreakSingleEnum={
152     "LineBreak", "line break",
153     UCHAR_LINE_BREAK,
154     0, UPROPS_LB_SHIFT, UPROPS_LB_MASK
155 };
156 
157 static const SingleEnum eawSingleEnum={
158     "EastAsianWidth", "east asian width",
159     UCHAR_EAST_ASIAN_WIDTH,
160     0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
161 };
162 
163 static void U_CALLCONV
singleEnumLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)164 singleEnumLineFn(void *context,
165                  char *fields[][2], int32_t fieldCount,
166                  UErrorCode *pErrorCode) {
167     const SingleEnum *sen;
168     char *s;
169     uint32_t start, limit, uv;
170     int32_t value;
171 
172     sen=(const SingleEnum *)context;
173 
174     u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
175     if(U_FAILURE(*pErrorCode)) {
176         fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
177         exit(*pErrorCode);
178     }
179     ++limit;
180 
181     /* parse property alias */
182     s=trimTerminateField(fields[1][0], fields[1][1]);
183     value=u_getPropertyValueEnum(sen->prop, s);
184     if(value<0) {
185         if(sen->prop==UCHAR_BLOCK) {
186             if(isToken("Greek", s)) {
187                 value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
188             } else if(isToken("Combining Marks for Symbols", s)) {
189                 value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
190             } else if(isToken("Private Use", s)) {
191                 value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
192             }
193         }
194     }
195     if(value<0) {
196         fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n",
197                         sen->propName, sen->ucdFile, s);
198         exit(U_PARSE_ERROR);
199     }
200 
201     uv=(uint32_t)(value<<sen->vecShift);
202     if((uv&sen->vecMask)!=uv) {
203         fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n",
204                         sen->propName, (int)uv, s);
205         exit(U_INTERNAL_PROGRAM_ERROR);
206     }
207 
208     if(!upvec_setValue(pv, start, limit, sen->vecWord, uv, sen->vecMask, pErrorCode)) {
209         fprintf(stderr, "genprops error: unable to set %s code: %s\n",
210                         sen->propName, u_errorName(*pErrorCode));
211         exit(*pErrorCode);
212     }
213 }
214 
215 static void
parseSingleEnumFile(char * filename,char * basename,const char * suffix,const SingleEnum * sen,UErrorCode * pErrorCode)216 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
217                     const SingleEnum *sen,
218                     UErrorCode *pErrorCode) {
219     char *fields[2][2];
220 
221     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
222         return;
223     }
224 
225     writeUCDFilename(basename, sen->ucdFile, suffix);
226 
227     u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
228     if(U_FAILURE(*pErrorCode)) {
229         fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
230     }
231 }
232 
233 /* parse files with multiple binary properties ------------------------------ */
234 
/* One binary property: its name in the UCD file and its bit in the properties vectors. */
typedef struct Binary {
    const char *propName;   /* name compared with isToken() against field 1 */
    int32_t vecWord;        /* vector word argument passed to upvec_setValue() */
    int32_t vecShift;       /* bit number; U_MASK(vecShift) is used as both value and mask */
} Binary;

/* One UCD file together with the binary properties that are read from it. */
typedef struct Binaries {
    const char *ucdFile;    /* UCD file base name; printed as "<ucdFile>.txt" in messages */
    const Binary *binaries; /* table of recognized properties */
    int32_t binariesCount;  /* number of entries in binaries[] */
} Binaries;
247 
248 static const Binary
249 propListNames[]={
250     { "White_Space",                        1, UPROPS_WHITE_SPACE },
251     { "Dash",                               1, UPROPS_DASH },
252     { "Hyphen",                             1, UPROPS_HYPHEN },
253     { "Quotation_Mark",                     1, UPROPS_QUOTATION_MARK },
254     { "Terminal_Punctuation",               1, UPROPS_TERMINAL_PUNCTUATION },
255     { "Hex_Digit",                          1, UPROPS_HEX_DIGIT },
256     { "ASCII_Hex_Digit",                    1, UPROPS_ASCII_HEX_DIGIT },
257     { "Ideographic",                        1, UPROPS_IDEOGRAPHIC },
258     { "Diacritic",                          1, UPROPS_DIACRITIC },
259     { "Extender",                           1, UPROPS_EXTENDER },
260     { "Noncharacter_Code_Point",            1, UPROPS_NONCHARACTER_CODE_POINT },
261     { "Grapheme_Link",                      1, UPROPS_GRAPHEME_LINK },
262     { "IDS_Binary_Operator",                1, UPROPS_IDS_BINARY_OPERATOR },
263     { "IDS_Trinary_Operator",               1, UPROPS_IDS_TRINARY_OPERATOR },
264     { "Radical",                            1, UPROPS_RADICAL },
265     { "Unified_Ideograph",                  1, UPROPS_UNIFIED_IDEOGRAPH },
266     { "Deprecated",                         1, UPROPS_DEPRECATED },
267     { "Logical_Order_Exception",            1, UPROPS_LOGICAL_ORDER_EXCEPTION },
268 
269     /* new properties in Unicode 4.0.1 */
270     { "STerm",                              2, UPROPS_V2_S_TERM },
271     { "Variation_Selector",                 2, UPROPS_V2_VARIATION_SELECTOR },
272 
273     /* new properties in Unicode 4.1 */
274     { "Pattern_Syntax",                     2, UPROPS_V2_PATTERN_SYNTAX },
275     { "Pattern_White_Space",                2, UPROPS_V2_PATTERN_WHITE_SPACE }
276 };
277 
278 static const Binaries
279 propListBinaries={
280     "PropList", propListNames, LENGTHOF(propListNames)
281 };
282 
283 static const Binary
284 derCorePropsNames[]={
285     { "XID_Start",                          1, UPROPS_XID_START },
286     { "XID_Continue",                       1, UPROPS_XID_CONTINUE },
287 
288     /* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
289     { "Math",                               1, UPROPS_MATH },
290     { "Alphabetic",                         1, UPROPS_ALPHABETIC },
291     { "Grapheme_Extend",                    1, UPROPS_GRAPHEME_EXTEND },
292     { "Default_Ignorable_Code_Point",       1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
293 
294     /* new properties bits in ICU 2.6/format version 3.2 */
295     { "ID_Start",                           1, UPROPS_ID_START },
296     { "ID_Continue",                        1, UPROPS_ID_CONTINUE },
297     { "Grapheme_Base",                      1, UPROPS_GRAPHEME_BASE },
298 
299     /*
300      * Unicode 5/ICU 3.6 moves Grapheme_Link from PropList.txt
301      * to DerivedCoreProperties.txt and deprecates it.
302      */
303     { "Grapheme_Link",                      1, UPROPS_GRAPHEME_LINK }
304 };
305 
306 static const Binaries
307 derCorePropsBinaries={
308     "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
309 };
310 
/*
 * Names of unrecognized properties seen in the file that is currently being
 * parsed; reset by parseBinariesFile() and printed there in verbose mode.
 */
static char ignoredProps[100][64];
static int32_t ignoredPropsCount;

/*
 * Remember the name of an ignored property, once per distinct name.
 *
 * The field is trimmed and NUL-terminated in place. The table is a fixed-size
 * array, so the name is cut in place to the row capacity (63 characters plus
 * the terminator) and additional names are dropped once all rows are in use;
 * this list only feeds a verbose diagnostic, so losing entries is harmless
 * while writing past the table is not.
 *
 * @param s     start of the property name field (modified in place)
 * @param limit one past the end of the field
 */
static void
addIgnoredProp(char *s, char *limit) {
    const int32_t maxCount=(int32_t)(sizeof(ignoredProps)/sizeof(ignoredProps[0]));
    const int32_t maxLength=(int32_t)sizeof(ignoredProps[0])-1;
    int32_t i, length;

    s=trimTerminateField(s, limit);

    /* cut an over-long name so that it fits into one row including its terminator */
    for(length=0; s[length]!=0; ++length) {
        if(length==maxLength) {
            s[length]=0;
            break;
        }
    }

    /* already recorded? */
    for(i=0; i<ignoredPropsCount; ++i) {
        if(0==uprv_strcmp(ignoredProps[i], s)) {
            return;
        }
    }

    /* table full: silently drop further names */
    if(ignoredPropsCount>=maxCount) {
        return;
    }
    uprv_strcpy(ignoredProps[ignoredPropsCount++], s);
}
326 
327 static void U_CALLCONV
binariesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)328 binariesLineFn(void *context,
329                char *fields[][2], int32_t fieldCount,
330                UErrorCode *pErrorCode) {
331     const Binaries *bin;
332     char *s;
333     uint32_t start, limit, uv;
334     int32_t i;
335 
336     bin=(const Binaries *)context;
337 
338     u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
339     if(U_FAILURE(*pErrorCode)) {
340         fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
341         exit(*pErrorCode);
342     }
343     ++limit;
344 
345     /* parse binary property name */
346     s=(char *)u_skipWhitespace(fields[1][0]);
347     for(i=0;; ++i) {
348         if(i==bin->binariesCount) {
349             /* ignore unrecognized properties */
350             if(beVerbose) {
351                 addIgnoredProp(s, fields[1][1]);
352             }
353             return;
354         }
355         if(isToken(bin->binaries[i].propName, s)) {
356             break;
357         }
358     }
359 
360     if(bin->binaries[i].vecShift>=32) {
361         fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
362                         (int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName);
363         exit(U_INTERNAL_PROGRAM_ERROR);
364     }
365     uv=U_MASK(bin->binaries[i].vecShift);
366 
367     if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, uv, uv, pErrorCode)) {
368         fprintf(stderr, "genprops error: unable to set %s code: %s\n",
369                         bin->binaries[i].propName, u_errorName(*pErrorCode));
370         exit(*pErrorCode);
371     }
372 }
373 
374 static void
parseBinariesFile(char * filename,char * basename,const char * suffix,const Binaries * bin,UErrorCode * pErrorCode)375 parseBinariesFile(char *filename, char *basename, const char *suffix,
376                   const Binaries *bin,
377                   UErrorCode *pErrorCode) {
378     char *fields[2][2];
379     int32_t i;
380 
381     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
382         return;
383     }
384 
385     writeUCDFilename(basename, bin->ucdFile, suffix);
386 
387     ignoredPropsCount=0;
388 
389     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
390     if(U_FAILURE(*pErrorCode)) {
391         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
392     }
393 
394     if(beVerbose) {
395         for(i=0; i<ignoredPropsCount; ++i) {
396             printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
397         }
398     }
399 }
400 
401 /* -------------------------------------------------------------------------- */
402 
403 U_CFUNC void
initAdditionalProperties()404 initAdditionalProperties() {
405     pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
406 }
407 
408 U_CFUNC void
exitAdditionalProperties()409 exitAdditionalProperties() {
410     utrie_close(trie);
411     upvec_close(pv);
412 }
413 
414 U_CFUNC void
generateAdditionalProperties(char * filename,const char * suffix,UErrorCode * pErrorCode)415 generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
416     char *basename;
417 
418     basename=filename+uprv_strlen(filename);
419 
420     /* process various UCD .txt files */
421 
422     /* add Han numeric types & values */
423     parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
424 
425     parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
426 
427     /*
428      * UTR 24 says:
429      * Section 2:
430      *   "Common - For characters that may be used
431      *             within multiple scripts,
432      *             or any unassigned code points."
433      *
434      * Section 4:
435      *   "The value COMMON is the default value,
436      *    given to all code points that are not
437      *    explicitly mentioned in the data file."
438      *
439      * COMMON==USCRIPT_COMMON==0 - nothing to do
440      */
441     parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
442 
443     parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
444 
445     parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
446 
447     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode);
448 
449     parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode);
450 
451     parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode);
452 
453     parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode);
454 
455     /*
456      * LineBreak-4.0.0.txt:
457      *  - All code points, assigned and unassigned, that are not listed
458      *         explicitly are given the value "XX".
459      *
460      * XX==U_LB_UNKNOWN==0 - nothing to do
461      */
462     parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
463 
464     /*
465      * Preset East Asian Width defaults:
466      *
467      * http://www.unicode.org/reports/tr11/#Unassigned
468      * 7.1 Unassigned and Private Use characters
469      *
470      * All unassigned characters are by default classified as non-East Asian neutral,
471      * except for the range U+20000 to U+2FFFD,
472      * since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W).
473      * All Private use characters are by default classified as ambiguous,
474      * since their definition depends on context.
475      *
476      * N for all ==0 - nothing to do
477      * A for Private Use
478      * W for plane 2
479      */
480     *pErrorCode=U_ZERO_ERROR;
481     if( !upvec_setValue(pv, 0xe000, 0xf900, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
482         !upvec_setValue(pv, 0xf0000, 0xffffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
483         !upvec_setValue(pv, 0x100000, 0x10fffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
484         !upvec_setValue(pv, 0x20000, 0x2fffe, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)
485     ) {
486         fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
487         exit(*pErrorCode);
488     }
489 
490     /* parse EastAsianWidth.txt */
491     parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);
492 
493     trie=utrie_open(NULL, NULL, 50000, 0, 0, TRUE);
494     if(trie==NULL) {
495         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
496         upvec_close(pv);
497         return;
498     }
499 
500     pvCount=upvec_compact(pv, upvec_compactToTrieHandler, trie, pErrorCode);
501     if(U_FAILURE(*pErrorCode)) {
502         fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
503         exit(*pErrorCode);
504     }
505 }
506 
507 /* DerivedAge.txt ----------------------------------------------------------- */
508 
509 static void U_CALLCONV
ageLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)510 ageLineFn(void *context,
511           char *fields[][2], int32_t fieldCount,
512           UErrorCode *pErrorCode) {
513     char *s, *end;
514     uint32_t value, start, limit, version;
515 
516     u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
517     if(U_FAILURE(*pErrorCode)) {
518         fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
519         exit(*pErrorCode);
520     }
521     ++limit;
522 
523     /* ignore "unassigned" (the default is already set to 0.0) */
524     s=(char *)u_skipWhitespace(fields[1][0]);
525     if(0==uprv_strncmp(s, "unassigned", 10)) {
526         return;
527     }
528 
529     /* parse version number */
530     value=(uint32_t)uprv_strtoul(s, &end, 10);
531     if(s==end || value==0 || value>15 || (*end!='.' && *end!=' ' && *end!='\t' && *end!=0)) {
532         fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
533         *pErrorCode=U_PARSE_ERROR;
534         exit(U_PARSE_ERROR);
535     }
536     version=value<<4;
537 
538     /* parse minor version number */
539     if(*end=='.') {
540         s=(char *)u_skipWhitespace(end+1);
541         value=(uint32_t)uprv_strtoul(s, &end, 10);
542         if(s==end || value>15 || (*end!=' ' && *end!='\t' && *end!=0)) {
543             fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
544             *pErrorCode=U_PARSE_ERROR;
545             exit(U_PARSE_ERROR);
546         }
547         version|=value;
548     }
549 
550     if(!upvec_setValue(pv, start, limit, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode)) {
551         fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
552         exit(*pErrorCode);
553     }
554 }
555 
556 /* DerivedNumericValues.txt ------------------------------------------------- */
557 
558 static void U_CALLCONV
numericLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)559 numericLineFn(void *context,
560               char *fields[][2], int32_t fieldCount,
561               UErrorCode *pErrorCode) {
562     Props newProps={ 0 };
563     char *s, *end;
564     uint32_t start, limit, value, oldProps32;
565     int32_t oldType;
566     char c;
567     UBool isFraction;
568 
569     /* get the code point range */
570     u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
571     if(U_FAILURE(*pErrorCode)) {
572         fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]);
573         exit(*pErrorCode);
574     }
575     ++limit;
576 
577     /* check if the numeric value is a fraction (this code does not handle any) */
578     isFraction=FALSE;
579     s=uprv_strchr(fields[1][0], '.');
580     if(s!=NULL) {
581         end=s+1;
582         while('0'<=(c=*end++) && c<='9') {
583             if(c!='0') {
584                 isFraction=TRUE;
585                 break;
586             }
587         }
588     }
589 
590     if(isFraction) {
591         value=0;
592     } else {
593         /* parse numeric value */
594         s=(char *)u_skipWhitespace(fields[1][0]);
595 
596         /* try large powers of 10 first, may otherwise overflow strtoul() */
597         if(0==uprv_strncmp(s, "10000000000", 11)) {
598             /* large powers of 10 are encoded in a special way, see store.c */
599             uint8_t exp=0;
600 
601             end=s;
602             while(*(++end)=='0') {
603                 ++exp;
604             }
605             value=1;
606             newProps.exponent=exp;
607         } else {
608             /* normal number parsing */
609             value=(uint32_t)uprv_strtoul(s, &end, 10);
610         }
611         if(end<=s || (*end!='.' && u_skipWhitespace(end)!=fields[1][1]) || value>=0x80000000) {
612             fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]);
613             exit(U_PARSE_ERROR);
614         }
615     }
616 
617     /*
618      * Unicode 4.0.1 removes the third column that used to list the numeric type.
619      * Assume that either the data is the same as in UnicodeData.txt,
620      * or else that the numeric type is "numeric".
621      * This should work because we only expect to add numeric values for
622      * Han characters; for those, UnicodeData.txt lists only ranges without
623      * specific properties for single characters.
624      */
625 
626     /* set the new numeric type and value */
627     newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
628     newProps.numericValue=(int32_t)value;       /* newly parsed numeric value */
629     /* the exponent may have been set above */
630     value=makeProps(&newProps);
631 
632     for(; start<limit; ++start) {
633         oldProps32=getProps(start);
634         oldType=(int32_t)GET_NUMERIC_TYPE(oldProps32);
635 
636         if(isFraction) {
637             if(oldType!=0) {
638                 /* this code point was already listed with its numeric value in UnicodeData.txt */
639                 continue;
640             } else {
641                 fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
642                 exit(U_PARSE_ERROR);
643             }
644         }
645 
646         /*
647          * For simplicity, and because we only expect to set numeric values for Han characters,
648          * for now we only allow to set these values for Lo characters.
649          */
650         if(oldType==0 && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
651             fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
652             exit(U_PARSE_ERROR);
653         }
654 
655         /* verify that we do not change an existing value (fractions were excluded above) */
656         if(oldType!=0) {
657             /* the code point already has a value stored */
658             if((oldProps32&0xff00)!=(value&0xff00)) {
659                 fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
660                 exit(U_PARSE_ERROR);
661             }
662             /* same value, continue */
663         } else {
664             /* the code point is getting a new numeric value */
665             if(beVerbose) {
666                 printf("adding U+%04x numeric type %d value 0x%04x from %s\n", (int)start, U_NT_NUMERIC, (int)value, fields[0][0]);
667             }
668 
669             addProps(start, value|GET_CATEGORY(oldProps32));
670         }
671     }
672 }
673 
674 /* data serialization ------------------------------------------------------- */
675 
676 U_CFUNC int32_t
writeAdditionalData(FILE * f,uint8_t * p,int32_t capacity,int32_t indexes[UPROPS_INDEX_COUNT])677 writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
678     int32_t length;
679     UErrorCode errorCode;
680 
681     errorCode=U_ZERO_ERROR;
682     length=utrie_serialize(trie, p, capacity, NULL, TRUE, &errorCode);
683     if(U_FAILURE(errorCode)) {
684         fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
685         exit(errorCode);
686     }
687     if(p!=NULL) {
688         if(beVerbose) {
689             printf("size in bytes of additional props trie:%5u\n", (int)length);
690         }
691         if(f!=NULL) {
692             UTrie trie2={ NULL };
693             utrie_unserialize(&trie2, p, length, &errorCode);
694             if(U_FAILURE(errorCode)) {
695                 fprintf(
696                     stderr,
697                     "genprops error: failed to utrie_unserialize(trie for additional properties) - %s\n",
698                     u_errorName(errorCode));
699                 exit(errorCode);
700             }
701             usrc_writeUTrieArrays(f,
702                 "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
703                 &trie2,
704                 "\n};\n\n");
705             usrc_writeUTrieStruct(f,
706                 "static const UTrie propsVectorsTrie={\n",
707                 &trie2, "propsVectorsTrie_index", NULL, NULL,
708                 "};\n\n");
709         }
710 
711         p+=length;
712         capacity-=length;
713 
714         /* set indexes */
715         indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
716             indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
717         indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
718         indexes[UPROPS_RESERVED_INDEX]=
719             indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
720 
721         indexes[UPROPS_MAX_VALUES_INDEX]=
722             (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
723             (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
724             (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
725             ((int32_t)USCRIPT_CODE_LIMIT-1);
726         indexes[UPROPS_MAX_VALUES_2_INDEX]=
727             (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
728             (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
729             (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
730             ((int32_t)U_DT_COUNT-1);
731     }
732 
733     if(p!=NULL && (pvCount*4)<=capacity) {
734         if(f!=NULL) {
735             usrc_writeArray(f,
736                 "static const uint32_t propsVectors[%ld]={\n",
737                 pv, 32, pvCount,
738                 "};\n\n");
739             fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
740             fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);
741         } else {
742             uprv_memcpy(p, pv, pvCount*4);
743         }
744         if(beVerbose) {
745             printf("number of additional props vectors:    %5u\n", (int)pvCount/UPROPS_VECTOR_WORDS);
746             printf("number of 32-bit words per vector:     %5u\n", UPROPS_VECTOR_WORDS);
747         }
748     }
749     length+=pvCount*4;
750 
751     return length;
752 }
753