• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2002-2009, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  props2.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002feb24
14 *   created by: Markus W. Scherer
15 *
16 *   Parse more Unicode Character Database files and store
17 *   additional Unicode character properties in bit set vectors.
18 */
19 
20 #include <stdio.h>
21 #include "unicode/utypes.h"
22 #include "unicode/uchar.h"
23 #include "unicode/uscript.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "utrie.h"
27 #include "uprops.h"
28 #include "propsvec.h"
29 #include "uparse.h"
30 #include "writesrc.h"
31 #include "genprops.h"
32 
33 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
34 
35 /* data --------------------------------------------------------------------- */
36 
37 static UNewTrie *newTrie;
38 UPropsVectors *pv;
39 
40 /* miscellaneous ------------------------------------------------------------ */
41 
/*
 * Trim a field in place.
 * Skips leading white space (via u_skipWhitespace()), drops trailing
 * spaces and tabs before limit, and writes a NUL terminator at the
 * new end of the field.
 *
 * @param s     start of the field
 * @param limit one past the last character of the field; the byte at the
 *              trimmed limit is overwritten with 0
 * @return pointer to the first non-white-space character
 */
static char *
trimTerminateField(char *s, char *limit) {
    char *first=(char *)u_skipWhitespace(s);
    char *last=limit;

    /* walk backwards over trailing blanks */
    for(; first<last; --last) {
        char c=last[-1];
        if(c!=' ' && c!='\t') {
            break;
        }
    }
    *last=0;

    return first;
}
55 
56 static void
parseTwoFieldFile(char * filename,char * basename,const char * ucdFile,const char * suffix,UParseLineFn * lineFn,UErrorCode * pErrorCode)57 parseTwoFieldFile(char *filename, char *basename,
58                   const char *ucdFile, const char *suffix,
59                   UParseLineFn *lineFn,
60                   UErrorCode *pErrorCode) {
61     char *fields[2][2];
62 
63     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
64         return;
65     }
66 
67     writeUCDFilename(basename, ucdFile, suffix);
68 
69     u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
70     if(U_FAILURE(*pErrorCode)) {
71         fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
72     }
73 }
74 
75 static void U_CALLCONV
76 ageLineFn(void *context,
77           char *fields[][2], int32_t fieldCount,
78           UErrorCode *pErrorCode);
79 
80 static void
parseMultiFieldFile(char * filename,char * basename,const char * ucdFile,const char * suffix,int32_t fieldCount,UParseLineFn * lineFn,UErrorCode * pErrorCode)81 parseMultiFieldFile(char *filename, char *basename,
82                     const char *ucdFile, const char *suffix,
83                     int32_t fieldCount,
84                     UParseLineFn *lineFn,
85                     UErrorCode *pErrorCode) {
86     char *fields[20][2];
87 
88     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
89         return;
90     }
91 
92     writeUCDFilename(basename, ucdFile, suffix);
93 
94     u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode);
95     if(U_FAILURE(*pErrorCode)) {
96         fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
97     }
98 }
99 
100 static void U_CALLCONV
101 numericLineFn(void *context,
102               char *fields[][2], int32_t fieldCount,
103               UErrorCode *pErrorCode);
104 
105 /* parse files with single enumerated properties ---------------------------- */
106 
107 struct SingleEnum {
108     const char *ucdFile, *propName;
109     UProperty prop;
110     int32_t vecWord, vecShift;
111     uint32_t vecMask;
112 };
113 typedef struct SingleEnum SingleEnum;
114 
115 static void
116 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
117                     const SingleEnum *sen,
118                     UErrorCode *pErrorCode);
119 
120 static const SingleEnum scriptSingleEnum={
121     "Scripts", "script",
122     UCHAR_SCRIPT,
123     0, 0, UPROPS_SCRIPT_MASK
124 };
125 
126 static const SingleEnum blockSingleEnum={
127     "Blocks", "block",
128     UCHAR_BLOCK,
129     0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
130 };
131 
132 static const SingleEnum graphemeClusterBreakSingleEnum={
133     "GraphemeBreakProperty", "Grapheme_Cluster_Break",
134     UCHAR_GRAPHEME_CLUSTER_BREAK,
135     2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK
136 };
137 
138 static const SingleEnum wordBreakSingleEnum={
139     "WordBreakProperty", "Word_Break",
140     UCHAR_WORD_BREAK,
141     2, UPROPS_WB_SHIFT, UPROPS_WB_MASK
142 };
143 
144 static const SingleEnum sentenceBreakSingleEnum={
145     "SentenceBreakProperty", "Sentence_Break",
146     UCHAR_SENTENCE_BREAK,
147     2, UPROPS_SB_SHIFT, UPROPS_SB_MASK
148 };
149 
150 static const SingleEnum lineBreakSingleEnum={
151     "LineBreak", "line break",
152     UCHAR_LINE_BREAK,
153     UPROPS_LB_VWORD, UPROPS_LB_SHIFT, UPROPS_LB_MASK
154 };
155 
156 static const SingleEnum eawSingleEnum={
157     "EastAsianWidth", "east asian width",
158     UCHAR_EAST_ASIAN_WIDTH,
159     0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
160 };
161 
162 static void U_CALLCONV
singleEnumLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)163 singleEnumLineFn(void *context,
164                  char *fields[][2], int32_t fieldCount,
165                  UErrorCode *pErrorCode) {
166     const SingleEnum *sen;
167     char *s;
168     uint32_t start, end, uv;
169     int32_t value;
170 
171     sen=(const SingleEnum *)context;
172 
173     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
174     if(U_FAILURE(*pErrorCode)) {
175         fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
176         exit(*pErrorCode);
177     }
178 
179     /* parse property alias */
180     s=trimTerminateField(fields[1][0], fields[1][1]);
181     value=u_getPropertyValueEnum(sen->prop, s);
182     if(value<0) {
183         if(sen->prop==UCHAR_BLOCK) {
184             if(isToken("Greek", s)) {
185                 value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
186             } else if(isToken("Combining Marks for Symbols", s)) {
187                 value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
188             } else if(isToken("Private Use", s)) {
189                 value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
190             }
191         }
192     }
193     if(value<0) {
194         fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n",
195                         sen->propName, sen->ucdFile, s);
196         exit(U_PARSE_ERROR);
197     }
198 
199     uv=(uint32_t)(value<<sen->vecShift);
200     if((uv&sen->vecMask)!=uv) {
201         fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n",
202                         sen->propName, (int)uv, s);
203         exit(U_INTERNAL_PROGRAM_ERROR);
204     }
205 
206     if(start==0 && end==0x10ffff) {
207         /* Also set bits for initialValue and errorValue. */
208         end=UPVEC_MAX_CP;
209     }
210     upvec_setValue(pv, start, end, sen->vecWord, uv, sen->vecMask, pErrorCode);
211     if(U_FAILURE(*pErrorCode)) {
212         fprintf(stderr, "genprops error: unable to set %s code: %s\n",
213                         sen->propName, u_errorName(*pErrorCode));
214         exit(*pErrorCode);
215     }
216 }
217 
218 static void
parseSingleEnumFile(char * filename,char * basename,const char * suffix,const SingleEnum * sen,UErrorCode * pErrorCode)219 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
220                     const SingleEnum *sen,
221                     UErrorCode *pErrorCode) {
222     char *fields[2][2];
223 
224     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
225         return;
226     }
227 
228     writeUCDFilename(basename, sen->ucdFile, suffix);
229 
230     u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
231     if(U_FAILURE(*pErrorCode)) {
232         fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
233     }
234 }
235 
236 /* parse files with multiple binary properties ------------------------------ */
237 
/* One binary property: its name in the UCD file and the vector bit that represents it. */
struct Binary {
    const char *propName;   /* property name as matched with isToken() */
    int32_t vecWord;        /* index of the vector word that holds the bit */
    int32_t vecShift;       /* bit number inside that word; must be <32 */
};
typedef struct Binary Binary;
243 
244 struct Binaries {
245     const char *ucdFile;
246     const Binary *binaries;
247     int32_t binariesCount;
248 };
249 typedef struct Binaries Binaries;
250 
251 static const Binary
252 propListNames[]={
253     { "White_Space",                        1, UPROPS_WHITE_SPACE },
254     { "Dash",                               1, UPROPS_DASH },
255     { "Hyphen",                             1, UPROPS_HYPHEN },
256     { "Quotation_Mark",                     1, UPROPS_QUOTATION_MARK },
257     { "Terminal_Punctuation",               1, UPROPS_TERMINAL_PUNCTUATION },
258     { "Hex_Digit",                          1, UPROPS_HEX_DIGIT },
259     { "ASCII_Hex_Digit",                    1, UPROPS_ASCII_HEX_DIGIT },
260     { "Ideographic",                        1, UPROPS_IDEOGRAPHIC },
261     { "Diacritic",                          1, UPROPS_DIACRITIC },
262     { "Extender",                           1, UPROPS_EXTENDER },
263     { "Noncharacter_Code_Point",            1, UPROPS_NONCHARACTER_CODE_POINT },
264     { "Grapheme_Link",                      1, UPROPS_GRAPHEME_LINK },
265     { "IDS_Binary_Operator",                1, UPROPS_IDS_BINARY_OPERATOR },
266     { "IDS_Trinary_Operator",               1, UPROPS_IDS_TRINARY_OPERATOR },
267     { "Radical",                            1, UPROPS_RADICAL },
268     { "Unified_Ideograph",                  1, UPROPS_UNIFIED_IDEOGRAPH },
269     { "Deprecated",                         1, UPROPS_DEPRECATED },
270     { "Logical_Order_Exception",            1, UPROPS_LOGICAL_ORDER_EXCEPTION },
271 
272     /* new properties in Unicode 4.0.1 */
273     { "STerm",                              1, UPROPS_S_TERM },
274     { "Variation_Selector",                 1, UPROPS_VARIATION_SELECTOR },
275 
276     /* new properties in Unicode 4.1 */
277     { "Pattern_Syntax",                     1, UPROPS_PATTERN_SYNTAX },
278     { "Pattern_White_Space",                1, UPROPS_PATTERN_WHITE_SPACE }
279 };
280 
281 static const Binaries
282 propListBinaries={
283     "PropList", propListNames, LENGTHOF(propListNames)
284 };
285 
286 static const Binary
287 derCorePropsNames[]={
288     { "XID_Start",                          1, UPROPS_XID_START },
289     { "XID_Continue",                       1, UPROPS_XID_CONTINUE },
290 
291     /* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
292     { "Math",                               1, UPROPS_MATH },
293     { "Alphabetic",                         1, UPROPS_ALPHABETIC },
294     { "Grapheme_Extend",                    1, UPROPS_GRAPHEME_EXTEND },
295     { "Default_Ignorable_Code_Point",       1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
296 
297     /* new properties bits in ICU 2.6/format version 3.2 */
298     { "ID_Start",                           1, UPROPS_ID_START },
299     { "ID_Continue",                        1, UPROPS_ID_CONTINUE },
300     { "Grapheme_Base",                      1, UPROPS_GRAPHEME_BASE },
301 
302     /*
303      * Unicode 5/ICU 3.6 moves Grapheme_Link from PropList.txt
304      * to DerivedCoreProperties.txt and deprecates it.
305      */
306     { "Grapheme_Link",                      1, UPROPS_GRAPHEME_LINK }
307 };
308 
309 static const Binaries
310 derCorePropsBinaries={
311     "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
312 };
313 
/* names of unrecognized properties seen in the current file, for verbose reporting */
static char ignoredProps[100][64];
static int32_t ignoredPropsCount;

/*
 * Remember the name of an ignored (unrecognized) property, once.
 * The field is trimmed and NUL-terminated in place.
 * A name that does not fit into one table slot is truncated (in place, so
 * that the duplicate check keeps working), and names are silently dropped
 * once the table is full; the table is only used for diagnostics.
 *
 * @param s     start of the property name field
 * @param limit limit of the property name field
 */
static void
addIgnoredProp(char *s, char *limit) {
    int32_t i;

    s=trimTerminateField(s, limit);

    /* never copy more than one slot can hold, including the terminating NUL */
    if(uprv_strlen(s)>=sizeof(ignoredProps[0])) {
        s[sizeof(ignoredProps[0])-1]=0;
    }

    for(i=0; i<ignoredPropsCount; ++i) {
        if(0==uprv_strcmp(ignoredProps[i], s)) {
            return;
        }
    }

    /* do not write past the end of the table */
    if(ignoredPropsCount>=LENGTHOF(ignoredProps)) {
        return;
    }
    uprv_strcpy(ignoredProps[ignoredPropsCount++], s);
}
329 
330 static void U_CALLCONV
binariesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)331 binariesLineFn(void *context,
332                char *fields[][2], int32_t fieldCount,
333                UErrorCode *pErrorCode) {
334     const Binaries *bin;
335     char *s;
336     uint32_t start, end, uv;
337     int32_t i;
338 
339     bin=(const Binaries *)context;
340 
341     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
342     if(U_FAILURE(*pErrorCode)) {
343         fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
344         exit(*pErrorCode);
345     }
346 
347     /* parse binary property name */
348     s=(char *)u_skipWhitespace(fields[1][0]);
349     for(i=0;; ++i) {
350         if(i==bin->binariesCount) {
351             /* ignore unrecognized properties */
352             if(beVerbose) {
353                 addIgnoredProp(s, fields[1][1]);
354             }
355             return;
356         }
357         if(isToken(bin->binaries[i].propName, s)) {
358             break;
359         }
360     }
361 
362     if(bin->binaries[i].vecShift>=32) {
363         fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
364                         (int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName);
365         exit(U_INTERNAL_PROGRAM_ERROR);
366     }
367     uv=U_MASK(bin->binaries[i].vecShift);
368 
369     if(start==0 && end==0x10ffff) {
370         /* Also set bits for initialValue and errorValue. */
371         end=UPVEC_MAX_CP;
372     }
373     upvec_setValue(pv, start, end, bin->binaries[i].vecWord, uv, uv, pErrorCode);
374     if(U_FAILURE(*pErrorCode)) {
375         fprintf(stderr, "genprops error: unable to set %s code: %s\n",
376                         bin->binaries[i].propName, u_errorName(*pErrorCode));
377         exit(*pErrorCode);
378     }
379 }
380 
381 static void
parseBinariesFile(char * filename,char * basename,const char * suffix,const Binaries * bin,UErrorCode * pErrorCode)382 parseBinariesFile(char *filename, char *basename, const char *suffix,
383                   const Binaries *bin,
384                   UErrorCode *pErrorCode) {
385     char *fields[2][2];
386     int32_t i;
387 
388     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
389         return;
390     }
391 
392     writeUCDFilename(basename, bin->ucdFile, suffix);
393 
394     ignoredPropsCount=0;
395 
396     u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
397     if(U_FAILURE(*pErrorCode)) {
398         fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
399     }
400 
401     if(beVerbose) {
402         for(i=0; i<ignoredPropsCount; ++i) {
403             printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
404         }
405     }
406 }
407 
408 /* -------------------------------------------------------------------------- */
409 
410 U_CFUNC void
initAdditionalProperties()411 initAdditionalProperties() {
412     UErrorCode errorCode=U_ZERO_ERROR;
413     pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
414     if(U_FAILURE(errorCode)) {
415         fprintf(stderr, "error: upvec_open() failed - %s\n", u_errorName(errorCode));
416         exit(errorCode);
417     }
418 }
419 
420 U_CFUNC void
exitAdditionalProperties()421 exitAdditionalProperties() {
422     utrie_close(newTrie);
423     upvec_close(pv);
424 }
425 
426 U_CFUNC void
generateAdditionalProperties(char * filename,const char * suffix,UErrorCode * pErrorCode)427 generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
428     char *basename;
429 
430     basename=filename+uprv_strlen(filename);
431 
432     /* process various UCD .txt files */
433 
434     /* add Han numeric types & values */
435     parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
436 
437     parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
438 
439     /*
440      * UTR 24 says:
441      * Section 2:
442      *   "Common - For characters that may be used
443      *             within multiple scripts,
444      *             or any unassigned code points."
445      *
446      * Section 4:
447      *   "The value COMMON is the default value,
448      *    given to all code points that are not
449      *    explicitly mentioned in the data file."
450      *
451      * COMMON==USCRIPT_COMMON==0 - nothing to do
452      */
453     parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
454 
455     parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
456 
457     parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
458 
459     parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode);
460 
461     parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode);
462 
463     parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode);
464 
465     parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode);
466 
467     /*
468      * LineBreak-4.0.0.txt:
469      *  - All code points, assigned and unassigned, that are not listed
470      *         explicitly are given the value "XX".
471      *
472      * XX==U_LB_UNKNOWN==0 - nothing to do
473      */
474     parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
475 
476     /*
477      * Preset East Asian Width defaults:
478      *
479      * http://www.unicode.org/reports/tr11/#Unassigned
480      * 7.1 Unassigned and Private Use characters
481      *
482      * All unassigned characters are by default classified as non-East Asian neutral,
483      * except for the range U+20000 to U+2FFFD,
484      * since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W).
485      * All Private use characters are by default classified as ambiguous,
486      * since their definition depends on context.
487      *
488      * N for all ==0 - nothing to do
489      * A for Private Use
490      * W for plane 2
491      */
492     *pErrorCode=U_ZERO_ERROR;
493     upvec_setValue(pv, 0xe000, 0xf8ff, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
494     upvec_setValue(pv, 0xf0000, 0xffffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
495     upvec_setValue(pv, 0x100000, 0x10fffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
496     upvec_setValue(pv, 0x20000, 0x2fffd, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
497     if(U_FAILURE(*pErrorCode)) {
498         fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
499         exit(*pErrorCode);
500     }
501 
502     /* parse EastAsianWidth.txt */
503     parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);
504 
505     {
506         UPVecToUTrieContext toUTrie={ NULL, 50000 /* capacity */, 0, TRUE /* latin1Linear */ };
507         upvec_compact(pv, upvec_compactToUTrieHandler, &toUTrie, pErrorCode);
508         if(U_FAILURE(*pErrorCode)) {
509             fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n",
510                     u_errorName(*pErrorCode));
511             exit(*pErrorCode);
512         }
513         newTrie=toUTrie.newTrie;
514     }
515 }
516 
517 /* DerivedAge.txt ----------------------------------------------------------- */
518 
519 static void U_CALLCONV
ageLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)520 ageLineFn(void *context,
521           char *fields[][2], int32_t fieldCount,
522           UErrorCode *pErrorCode) {
523     char *s, *numberLimit;
524     uint32_t value, start, end, version;
525 
526     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
527     if(U_FAILURE(*pErrorCode)) {
528         fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
529         exit(*pErrorCode);
530     }
531 
532     /* ignore "unassigned" (the default is already set to 0.0) */
533     s=(char *)u_skipWhitespace(fields[1][0]);
534     if(0==uprv_strncmp(s, "unassigned", 10)) {
535         return;
536     }
537 
538     /* parse version number */
539     value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
540     if(s==numberLimit || value==0 || value>15 || (*numberLimit!='.' && *numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) {
541         fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
542         *pErrorCode=U_PARSE_ERROR;
543         exit(U_PARSE_ERROR);
544     }
545     version=value<<4;
546 
547     /* parse minor version number */
548     if(*numberLimit=='.') {
549         s=(char *)u_skipWhitespace(numberLimit+1);
550         value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
551         if(s==numberLimit || value>15 || (*numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) {
552             fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
553             *pErrorCode=U_PARSE_ERROR;
554             exit(U_PARSE_ERROR);
555         }
556         version|=value;
557     }
558 
559     if(start==0 && end==0x10ffff) {
560         /* Also set bits for initialValue and errorValue. */
561         end=UPVEC_MAX_CP;
562     }
563     upvec_setValue(pv, start, end, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode);
564     if(U_FAILURE(*pErrorCode)) {
565         fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
566         exit(*pErrorCode);
567     }
568 }
569 
570 /* DerivedNumericValues.txt ------------------------------------------------- */
571 
572 static void U_CALLCONV
numericLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)573 numericLineFn(void *context,
574               char *fields[][2], int32_t fieldCount,
575               UErrorCode *pErrorCode) {
576     Props newProps={ 0 };
577     char *s, *numberLimit;
578     uint32_t start, end, value, oldProps32;
579     char c;
580     UBool isFraction;
581 
582     /* get the code point range */
583     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
584     if(U_FAILURE(*pErrorCode)) {
585         fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]);
586         exit(*pErrorCode);
587     }
588 
589     /*
590      * Ignore the
591      * # @missing: 0000..10FFFF; NaN
592      * line from Unicode 5.1's DerivedNumericValues.txt:
593      * The following code cannot parse "NaN", and we don't want to overwrite
594      * the numeric values for all characters after reading most
595      * from UnicodeData.txt already.
596      */
597     if(start==0 && end==0x10ffff) {
598         return;
599     }
600 
601     /* check if the numeric value is a fraction (this code does not handle any) */
602     isFraction=FALSE;
603     s=uprv_strchr(fields[1][0], '.');
604     if(s!=NULL) {
605         numberLimit=s+1;
606         while('0'<=(c=*numberLimit++) && c<='9') {
607             if(c!='0') {
608                 isFraction=TRUE;
609                 break;
610             }
611         }
612     }
613 
614     if(isFraction) {
615         value=0;
616     } else {
617         /* parse numeric value */
618         s=(char *)u_skipWhitespace(fields[1][0]);
619 
620         /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
621         if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
622             /* large integers are encoded in a special way, see store.c */
623             uint8_t exp=0;
624 
625             value=s[0]-'0';
626             numberLimit=s;
627             while(*(++numberLimit)=='0') {
628                 ++exp;
629             }
630             newProps.exponent=exp;
631         } else {
632             /* normal number parsing */
633             value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
634         }
635         if(numberLimit<=s || (*numberLimit!='.' && u_skipWhitespace(numberLimit)!=fields[1][1]) || value>=0x80000000) {
636             fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]);
637             exit(U_PARSE_ERROR);
638         }
639     }
640 
641     /*
642      * Unicode 4.0.1 removes the third column that used to list the numeric type.
643      * Assume that either the data is the same as in UnicodeData.txt,
644      * or else that the numeric type is "numeric".
645      * This should work because we only expect to add numeric values for
646      * Han characters; for those, UnicodeData.txt lists only ranges without
647      * specific properties for single characters.
648      */
649 
650     /* set the new numeric value */
651     newProps.code=start;
652     newProps.numericValue=(int32_t)value;       /* newly parsed numeric value */
653     /* the exponent may have been set above */
654 
655     for(; start<=end; ++start) {
656         uint32_t newProps32;
657         int32_t oldNtv;
658         oldProps32=getProps(start);
659         oldNtv=(int32_t)GET_NUMERIC_TYPE_VALUE(oldProps32);
660 
661         if(isFraction) {
662             if(UPROPS_NTV_FRACTION_START<=oldNtv && oldNtv<UPROPS_NTV_LARGE_START) {
663                 /* this code point was already listed with its numeric value in UnicodeData.txt */
664                 continue;
665             } else {
666                 fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
667                 exit(U_PARSE_ERROR);
668             }
669         }
670 
671         /*
672          * For simplicity, and because we only expect to set numeric values for Han characters,
673          * for now we only allow to set these values for Lo characters.
674          */
675         if(oldNtv==UPROPS_NTV_NONE && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
676             fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
677             exit(U_PARSE_ERROR);
678         }
679 
680         /* verify that we do not change an existing value (fractions were excluded above) */
681         if(oldNtv!=UPROPS_NTV_NONE) {
682             /* the code point already has a value stored */
683             newProps.numericType=UPROPS_NTV_GET_TYPE(oldNtv);
684             newProps32=makeProps(&newProps);
685             if(oldNtv!=GET_NUMERIC_TYPE_VALUE(newProps32)) {
686                 fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
687                 exit(U_PARSE_ERROR);
688             }
689             /* same value, continue */
690         } else {
691             /* the code point is getting a new numeric value */
692             newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
693             newProps32=makeProps(&newProps);
694             if(beVerbose) {
695                 printf("adding U+%04x numeric type %d encoded-numeric-type-value 0x%03x from %s\n",
696                        (int)start, U_NT_NUMERIC, (int)GET_NUMERIC_TYPE_VALUE(newProps32), fields[0][0]);
697             }
698 
699             addProps(start, newProps32|GET_CATEGORY(oldProps32));
700         }
701     }
702 }
703 
704 /* data serialization ------------------------------------------------------- */
705 
706 U_CFUNC int32_t
writeAdditionalData(FILE * f,uint8_t * p,int32_t capacity,int32_t indexes[UPROPS_INDEX_COUNT])707 writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
708     const uint32_t *pvArray;
709     int32_t pvRows, pvCount;
710     int32_t length;
711     UErrorCode errorCode;
712 
713     pvArray=upvec_getArray(pv, &pvRows, NULL);
714     pvCount=pvRows*UPROPS_VECTOR_WORDS;
715 
716     errorCode=U_ZERO_ERROR;
717     length=utrie_serialize(newTrie, p, capacity, NULL, TRUE, &errorCode);
718     if(U_FAILURE(errorCode)) {
719         fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
720         exit(errorCode);
721     }
722     if(p!=NULL) {
723         if(beVerbose) {
724             printf("size in bytes of additional props trie:%5u\n", (int)length);
725         }
726         if(f!=NULL) {
727             UTrie trie={ NULL };
728             UTrie2 *trie2;
729 
730             utrie_unserialize(&trie, p, length, &errorCode);
731             if(U_FAILURE(errorCode)) {
732                 fprintf(
733                     stderr,
734                     "genprops error: failed to utrie_unserialize(trie for additional properties) - %s\n",
735                     u_errorName(errorCode));
736                 exit(errorCode);
737             }
738 
739             /* use UTrie2 */
740             trie2=utrie2_fromUTrie(&trie, trie.initialValue, &errorCode);
741             if(U_FAILURE(errorCode)) {
742                 fprintf(
743                     stderr,
744                     "genprops error: utrie2_fromUTrie() failed - %s\n",
745                     u_errorName(errorCode));
746                 exit(errorCode);
747             }
748             {
749                 /* delete lead surrogate code unit values */
750                 UChar lead;
751                 trie2=utrie2_cloneAsThawed(trie2, &errorCode);
752                 for(lead=0xd800; lead<0xdc00; ++lead) {
753                     utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
754                 }
755                 utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
756                 if(U_FAILURE(errorCode)) {
757                     fprintf(
758                         stderr,
759                         "genbidi error: deleting lead surrogate code unit values failed - %s\n",
760                         u_errorName(errorCode));
761                     exit(errorCode);
762                 }
763             }
764 
765             usrc_writeUTrie2Arrays(f,
766                 "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
767                 trie2,
768                 "\n};\n\n");
769             usrc_writeUTrie2Struct(f,
770                 "static const UTrie2 propsVectorsTrie={\n",
771                 trie2, "propsVectorsTrie_index", NULL,
772                 "};\n\n");
773 
774             utrie2_close(trie2);
775         }
776 
777         p+=length;
778         capacity-=length;
779 
780         /* set indexes */
781         indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
782             indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
783         indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
784         indexes[UPROPS_RESERVED_INDEX]=
785             indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
786 
787         indexes[UPROPS_MAX_VALUES_INDEX]=
788             (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
789             (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
790             (((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
791         indexes[UPROPS_MAX_VALUES_2_INDEX]=
792             (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
793             (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
794             (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
795             (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
796             ((int32_t)U_DT_COUNT-1);
797     }
798 
799     if(p!=NULL && (pvCount*4)<=capacity) {
800         if(f!=NULL) {
801             usrc_writeArray(f,
802                 "static const uint32_t propsVectors[%ld]={\n",
803                 pvArray, 32, pvCount,
804                 "};\n\n");
805             fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
806             fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);
807         } else {
808             uprv_memcpy(p, pvArray, pvCount*4);
809         }
810         if(beVerbose) {
811             printf("number of additional props vectors:    %5u\n", (int)pvRows);
812             printf("number of 32-bit words per vector:     %5u\n", UPROPS_VECTOR_WORDS);
813         }
814     }
815     length+=pvCount*4;
816 
817     return length;
818 }
819