1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: props2.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002feb24
14 * created by: Markus W. Scherer
15 *
16 * Parse more Unicode Character Database files and store
17 * additional Unicode character properties in bit set vectors.
18 */
19
20 #include <stdio.h>
21 #include "unicode/utypes.h"
22 #include "unicode/uchar.h"
23 #include "unicode/uscript.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "utrie.h"
27 #include "uprops.h"
28 #include "propsvec.h"
29 #include "uparse.h"
30 #include "writesrc.h"
31 #include "genprops.h"
32
/*
 * Number of elements of a true array (not of a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * single primary expression wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
34
35 /* data --------------------------------------------------------------------- */
36
37 static UNewTrie *trie;
38 uint32_t *pv;
39 static int32_t pvCount;
40
41 /* miscellaneous ------------------------------------------------------------ */
42
/*
 * Trim a field in place.
 * Leading whitespace is skipped with u_skipWhitespace(); trailing spaces and
 * tabs before limit are cut off by writing a NUL at the new end.
 * Note that the byte at the (possibly moved-back) limit is overwritten.
 *
 * @param s     start of the field
 * @param limit just behind the last character of the field
 * @return the start of the trimmed, NUL-terminated field
 */
static char *
trimTerminateField(char *s, char *limit) {
    char *first=(char *)u_skipWhitespace(s);
    char *end=limit;

    /* walk backwards over trailing blanks, but never past the field start */
    while(first<end) {
        char last=end[-1];
        if(last!=' ' && last!='\t') {
            break;
        }
        --end;
    }
    *end=0;

    return first;
}
56
57 static void
parseTwoFieldFile(char * filename,char * basename,const char * ucdFile,const char * suffix,UParseLineFn * lineFn,UErrorCode * pErrorCode)58 parseTwoFieldFile(char *filename, char *basename,
59 const char *ucdFile, const char *suffix,
60 UParseLineFn *lineFn,
61 UErrorCode *pErrorCode) {
62 char *fields[2][2];
63
64 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
65 return;
66 }
67
68 writeUCDFilename(basename, ucdFile, suffix);
69
70 u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
71 if(U_FAILURE(*pErrorCode)) {
72 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
73 }
74 }
75
76 static void U_CALLCONV
77 ageLineFn(void *context,
78 char *fields[][2], int32_t fieldCount,
79 UErrorCode *pErrorCode);
80
81 static void
parseMultiFieldFile(char * filename,char * basename,const char * ucdFile,const char * suffix,int32_t fieldCount,UParseLineFn * lineFn,UErrorCode * pErrorCode)82 parseMultiFieldFile(char *filename, char *basename,
83 const char *ucdFile, const char *suffix,
84 int32_t fieldCount,
85 UParseLineFn *lineFn,
86 UErrorCode *pErrorCode) {
87 char *fields[20][2];
88
89 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
90 return;
91 }
92
93 writeUCDFilename(basename, ucdFile, suffix);
94
95 u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode);
96 if(U_FAILURE(*pErrorCode)) {
97 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
98 }
99 }
100
101 static void U_CALLCONV
102 numericLineFn(void *context,
103 char *fields[][2], int32_t fieldCount,
104 UErrorCode *pErrorCode);
105
106 /* parse files with single enumerated properties ---------------------------- */
107
108 struct SingleEnum {
109 const char *ucdFile, *propName;
110 UProperty prop;
111 int32_t vecWord, vecShift;
112 uint32_t vecMask;
113 };
114 typedef struct SingleEnum SingleEnum;
115
116 static void
117 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
118 const SingleEnum *sen,
119 UErrorCode *pErrorCode);
120
121 static const SingleEnum scriptSingleEnum={
122 "Scripts", "script",
123 UCHAR_SCRIPT,
124 0, 0, UPROPS_SCRIPT_MASK
125 };
126
127 static const SingleEnum blockSingleEnum={
128 "Blocks", "block",
129 UCHAR_BLOCK,
130 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
131 };
132
133 static const SingleEnum graphemeClusterBreakSingleEnum={
134 "GraphemeBreakProperty", "Grapheme_Cluster_Break",
135 UCHAR_GRAPHEME_CLUSTER_BREAK,
136 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK
137 };
138
139 static const SingleEnum wordBreakSingleEnum={
140 "WordBreakProperty", "Word_Break",
141 UCHAR_WORD_BREAK,
142 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK
143 };
144
145 static const SingleEnum sentenceBreakSingleEnum={
146 "SentenceBreakProperty", "Sentence_Break",
147 UCHAR_SENTENCE_BREAK,
148 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK
149 };
150
151 static const SingleEnum lineBreakSingleEnum={
152 "LineBreak", "line break",
153 UCHAR_LINE_BREAK,
154 0, UPROPS_LB_SHIFT, UPROPS_LB_MASK
155 };
156
157 static const SingleEnum eawSingleEnum={
158 "EastAsianWidth", "east asian width",
159 UCHAR_EAST_ASIAN_WIDTH,
160 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
161 };
162
163 static void U_CALLCONV
singleEnumLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)164 singleEnumLineFn(void *context,
165 char *fields[][2], int32_t fieldCount,
166 UErrorCode *pErrorCode) {
167 const SingleEnum *sen;
168 char *s;
169 uint32_t start, limit, uv;
170 int32_t value;
171
172 sen=(const SingleEnum *)context;
173
174 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
175 if(U_FAILURE(*pErrorCode)) {
176 fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
177 exit(*pErrorCode);
178 }
179 ++limit;
180
181 /* parse property alias */
182 s=trimTerminateField(fields[1][0], fields[1][1]);
183 value=u_getPropertyValueEnum(sen->prop, s);
184 if(value<0) {
185 if(sen->prop==UCHAR_BLOCK) {
186 if(isToken("Greek", s)) {
187 value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
188 } else if(isToken("Combining Marks for Symbols", s)) {
189 value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
190 } else if(isToken("Private Use", s)) {
191 value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
192 }
193 }
194 }
195 if(value<0) {
196 fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n",
197 sen->propName, sen->ucdFile, s);
198 exit(U_PARSE_ERROR);
199 }
200
201 uv=(uint32_t)(value<<sen->vecShift);
202 if((uv&sen->vecMask)!=uv) {
203 fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n",
204 sen->propName, (int)uv, s);
205 exit(U_INTERNAL_PROGRAM_ERROR);
206 }
207
208 if(!upvec_setValue(pv, start, limit, sen->vecWord, uv, sen->vecMask, pErrorCode)) {
209 fprintf(stderr, "genprops error: unable to set %s code: %s\n",
210 sen->propName, u_errorName(*pErrorCode));
211 exit(*pErrorCode);
212 }
213 }
214
215 static void
parseSingleEnumFile(char * filename,char * basename,const char * suffix,const SingleEnum * sen,UErrorCode * pErrorCode)216 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
217 const SingleEnum *sen,
218 UErrorCode *pErrorCode) {
219 char *fields[2][2];
220
221 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
222 return;
223 }
224
225 writeUCDFilename(basename, sen->ucdFile, suffix);
226
227 u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
228 if(U_FAILURE(*pErrorCode)) {
229 fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
230 }
231 }
232
233 /* parse files with multiple binary properties ------------------------------ */
234
/* One binary property: its name in the UCD file and the bit that stores it. */
typedef struct Binary {
    const char *propName;   /* property name as matched with isToken() */
    int32_t vecWord;        /* column passed to upvec_setValue() */
    int32_t vecShift;       /* bit number; turned into a one-bit mask with U_MASK() */
} Binary;

/* One UCD file together with the binary properties that are taken from it. */
typedef struct Binaries {
    const char *ucdFile;    /* file name without path, version suffix and ".txt" */
    const Binary *binaries; /* table of recognized properties */
    int32_t binariesCount;  /* number of entries in binaries[] */
} Binaries;
247
248 static const Binary
249 propListNames[]={
250 { "White_Space", 1, UPROPS_WHITE_SPACE },
251 { "Dash", 1, UPROPS_DASH },
252 { "Hyphen", 1, UPROPS_HYPHEN },
253 { "Quotation_Mark", 1, UPROPS_QUOTATION_MARK },
254 { "Terminal_Punctuation", 1, UPROPS_TERMINAL_PUNCTUATION },
255 { "Hex_Digit", 1, UPROPS_HEX_DIGIT },
256 { "ASCII_Hex_Digit", 1, UPROPS_ASCII_HEX_DIGIT },
257 { "Ideographic", 1, UPROPS_IDEOGRAPHIC },
258 { "Diacritic", 1, UPROPS_DIACRITIC },
259 { "Extender", 1, UPROPS_EXTENDER },
260 { "Noncharacter_Code_Point", 1, UPROPS_NONCHARACTER_CODE_POINT },
261 { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK },
262 { "IDS_Binary_Operator", 1, UPROPS_IDS_BINARY_OPERATOR },
263 { "IDS_Trinary_Operator", 1, UPROPS_IDS_TRINARY_OPERATOR },
264 { "Radical", 1, UPROPS_RADICAL },
265 { "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH },
266 { "Deprecated", 1, UPROPS_DEPRECATED },
267 { "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
268
269 /* new properties in Unicode 4.0.1 */
270 { "STerm", 2, UPROPS_V2_S_TERM },
271 { "Variation_Selector", 2, UPROPS_V2_VARIATION_SELECTOR },
272
273 /* new properties in Unicode 4.1 */
274 { "Pattern_Syntax", 2, UPROPS_V2_PATTERN_SYNTAX },
275 { "Pattern_White_Space", 2, UPROPS_V2_PATTERN_WHITE_SPACE }
276 };
277
278 static const Binaries
279 propListBinaries={
280 "PropList", propListNames, LENGTHOF(propListNames)
281 };
282
283 static const Binary
284 derCorePropsNames[]={
285 { "XID_Start", 1, UPROPS_XID_START },
286 { "XID_Continue", 1, UPROPS_XID_CONTINUE },
287
288 /* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
289 { "Math", 1, UPROPS_MATH },
290 { "Alphabetic", 1, UPROPS_ALPHABETIC },
291 { "Grapheme_Extend", 1, UPROPS_GRAPHEME_EXTEND },
292 { "Default_Ignorable_Code_Point", 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
293
294 /* new properties bits in ICU 2.6/format version 3.2 */
295 { "ID_Start", 1, UPROPS_ID_START },
296 { "ID_Continue", 1, UPROPS_ID_CONTINUE },
297 { "Grapheme_Base", 1, UPROPS_GRAPHEME_BASE },
298
299 /*
300 * Unicode 5/ICU 3.6 moves Grapheme_Link from PropList.txt
301 * to DerivedCoreProperties.txt and deprecates it.
302 */
303 { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK }
304 };
305
306 static const Binaries
307 derCorePropsBinaries={
308 "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
309 };
310
/*
 * Names of unrecognized properties that were seen in the file being parsed.
 * Only used for the verbose-mode report in parseBinariesFile().
 */
static char ignoredProps[100][64];
static int32_t ignoredPropsCount;

/*
 * Remember the property name in [s, limit[ unless it is already listed.
 * The field is trimmed and NUL-terminated in place by trimTerminateField().
 *
 * The list has a fixed capacity: a name is dropped without being recorded
 * if the table is full or if the name, including its terminating NUL, does
 * not fit into one entry. This only makes the diagnostic report incomplete
 * but prevents writing beyond the ignoredProps[] storage.
 */
static void
addIgnoredProp(char *s, char *limit) {
    int32_t i;

    s=trimTerminateField(s, limit);
    for(i=0; i<ignoredPropsCount; ++i) {
        if(0==uprv_strcmp(ignoredProps[i], s)) {
            return;
        }
    }

    if( ignoredPropsCount>=(int32_t)(sizeof(ignoredProps)/sizeof(ignoredProps[0])) ||
        uprv_strlen(s)>=sizeof(ignoredProps[0])
    ) {
        return;
    }
    uprv_strcpy(ignoredProps[ignoredPropsCount++], s);
}
326
327 static void U_CALLCONV
binariesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)328 binariesLineFn(void *context,
329 char *fields[][2], int32_t fieldCount,
330 UErrorCode *pErrorCode) {
331 const Binaries *bin;
332 char *s;
333 uint32_t start, limit, uv;
334 int32_t i;
335
336 bin=(const Binaries *)context;
337
338 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
339 if(U_FAILURE(*pErrorCode)) {
340 fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
341 exit(*pErrorCode);
342 }
343 ++limit;
344
345 /* parse binary property name */
346 s=(char *)u_skipWhitespace(fields[1][0]);
347 for(i=0;; ++i) {
348 if(i==bin->binariesCount) {
349 /* ignore unrecognized properties */
350 if(beVerbose) {
351 addIgnoredProp(s, fields[1][1]);
352 }
353 return;
354 }
355 if(isToken(bin->binaries[i].propName, s)) {
356 break;
357 }
358 }
359
360 if(bin->binaries[i].vecShift>=32) {
361 fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
362 (int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName);
363 exit(U_INTERNAL_PROGRAM_ERROR);
364 }
365 uv=U_MASK(bin->binaries[i].vecShift);
366
367 if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, uv, uv, pErrorCode)) {
368 fprintf(stderr, "genprops error: unable to set %s code: %s\n",
369 bin->binaries[i].propName, u_errorName(*pErrorCode));
370 exit(*pErrorCode);
371 }
372 }
373
374 static void
parseBinariesFile(char * filename,char * basename,const char * suffix,const Binaries * bin,UErrorCode * pErrorCode)375 parseBinariesFile(char *filename, char *basename, const char *suffix,
376 const Binaries *bin,
377 UErrorCode *pErrorCode) {
378 char *fields[2][2];
379 int32_t i;
380
381 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
382 return;
383 }
384
385 writeUCDFilename(basename, bin->ucdFile, suffix);
386
387 ignoredPropsCount=0;
388
389 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
390 if(U_FAILURE(*pErrorCode)) {
391 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
392 }
393
394 if(beVerbose) {
395 for(i=0; i<ignoredPropsCount; ++i) {
396 printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
397 }
398 }
399 }
400
401 /* -------------------------------------------------------------------------- */
402
403 U_CFUNC void
initAdditionalProperties()404 initAdditionalProperties() {
405 pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
406 }
407
408 U_CFUNC void
exitAdditionalProperties()409 exitAdditionalProperties() {
410 utrie_close(trie);
411 upvec_close(pv);
412 }
413
414 U_CFUNC void
generateAdditionalProperties(char * filename,const char * suffix,UErrorCode * pErrorCode)415 generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
416 char *basename;
417
418 basename=filename+uprv_strlen(filename);
419
420 /* process various UCD .txt files */
421
422 /* add Han numeric types & values */
423 parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
424
425 parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
426
427 /*
428 * UTR 24 says:
429 * Section 2:
430 * "Common - For characters that may be used
431 * within multiple scripts,
432 * or any unassigned code points."
433 *
434 * Section 4:
435 * "The value COMMON is the default value,
436 * given to all code points that are not
437 * explicitly mentioned in the data file."
438 *
439 * COMMON==USCRIPT_COMMON==0 - nothing to do
440 */
441 parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
442
443 parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
444
445 parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
446
447 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode);
448
449 parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode);
450
451 parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode);
452
453 parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode);
454
455 /*
456 * LineBreak-4.0.0.txt:
457 * - All code points, assigned and unassigned, that are not listed
458 * explicitly are given the value "XX".
459 *
460 * XX==U_LB_UNKNOWN==0 - nothing to do
461 */
462 parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
463
464 /*
465 * Preset East Asian Width defaults:
466 *
467 * http://www.unicode.org/reports/tr11/#Unassigned
468 * 7.1 Unassigned and Private Use characters
469 *
470 * All unassigned characters are by default classified as non-East Asian neutral,
471 * except for the range U+20000 to U+2FFFD,
472 * since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W).
473 * All Private use characters are by default classified as ambiguous,
474 * since their definition depends on context.
475 *
476 * N for all ==0 - nothing to do
477 * A for Private Use
478 * W for plane 2
479 */
480 *pErrorCode=U_ZERO_ERROR;
481 if( !upvec_setValue(pv, 0xe000, 0xf900, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
482 !upvec_setValue(pv, 0xf0000, 0xffffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
483 !upvec_setValue(pv, 0x100000, 0x10fffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
484 !upvec_setValue(pv, 0x20000, 0x2fffe, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)
485 ) {
486 fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
487 exit(*pErrorCode);
488 }
489
490 /* parse EastAsianWidth.txt */
491 parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);
492
493 trie=utrie_open(NULL, NULL, 50000, 0, 0, TRUE);
494 if(trie==NULL) {
495 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
496 upvec_close(pv);
497 return;
498 }
499
500 pvCount=upvec_compact(pv, upvec_compactToTrieHandler, trie, pErrorCode);
501 if(U_FAILURE(*pErrorCode)) {
502 fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
503 exit(*pErrorCode);
504 }
505 }
506
507 /* DerivedAge.txt ----------------------------------------------------------- */
508
509 static void U_CALLCONV
ageLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)510 ageLineFn(void *context,
511 char *fields[][2], int32_t fieldCount,
512 UErrorCode *pErrorCode) {
513 char *s, *end;
514 uint32_t value, start, limit, version;
515
516 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
517 if(U_FAILURE(*pErrorCode)) {
518 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
519 exit(*pErrorCode);
520 }
521 ++limit;
522
523 /* ignore "unassigned" (the default is already set to 0.0) */
524 s=(char *)u_skipWhitespace(fields[1][0]);
525 if(0==uprv_strncmp(s, "unassigned", 10)) {
526 return;
527 }
528
529 /* parse version number */
530 value=(uint32_t)uprv_strtoul(s, &end, 10);
531 if(s==end || value==0 || value>15 || (*end!='.' && *end!=' ' && *end!='\t' && *end!=0)) {
532 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
533 *pErrorCode=U_PARSE_ERROR;
534 exit(U_PARSE_ERROR);
535 }
536 version=value<<4;
537
538 /* parse minor version number */
539 if(*end=='.') {
540 s=(char *)u_skipWhitespace(end+1);
541 value=(uint32_t)uprv_strtoul(s, &end, 10);
542 if(s==end || value>15 || (*end!=' ' && *end!='\t' && *end!=0)) {
543 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
544 *pErrorCode=U_PARSE_ERROR;
545 exit(U_PARSE_ERROR);
546 }
547 version|=value;
548 }
549
550 if(!upvec_setValue(pv, start, limit, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode)) {
551 fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
552 exit(*pErrorCode);
553 }
554 }
555
556 /* DerivedNumericValues.txt ------------------------------------------------- */
557
558 static void U_CALLCONV
numericLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)559 numericLineFn(void *context,
560 char *fields[][2], int32_t fieldCount,
561 UErrorCode *pErrorCode) {
562 Props newProps={ 0 };
563 char *s, *end;
564 uint32_t start, limit, value, oldProps32;
565 int32_t oldType;
566 char c;
567 UBool isFraction;
568
569 /* get the code point range */
570 u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
571 if(U_FAILURE(*pErrorCode)) {
572 fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]);
573 exit(*pErrorCode);
574 }
575 ++limit;
576
577 /* check if the numeric value is a fraction (this code does not handle any) */
578 isFraction=FALSE;
579 s=uprv_strchr(fields[1][0], '.');
580 if(s!=NULL) {
581 end=s+1;
582 while('0'<=(c=*end++) && c<='9') {
583 if(c!='0') {
584 isFraction=TRUE;
585 break;
586 }
587 }
588 }
589
590 if(isFraction) {
591 value=0;
592 } else {
593 /* parse numeric value */
594 s=(char *)u_skipWhitespace(fields[1][0]);
595
596 /* try large powers of 10 first, may otherwise overflow strtoul() */
597 if(0==uprv_strncmp(s, "10000000000", 11)) {
598 /* large powers of 10 are encoded in a special way, see store.c */
599 uint8_t exp=0;
600
601 end=s;
602 while(*(++end)=='0') {
603 ++exp;
604 }
605 value=1;
606 newProps.exponent=exp;
607 } else {
608 /* normal number parsing */
609 value=(uint32_t)uprv_strtoul(s, &end, 10);
610 }
611 if(end<=s || (*end!='.' && u_skipWhitespace(end)!=fields[1][1]) || value>=0x80000000) {
612 fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]);
613 exit(U_PARSE_ERROR);
614 }
615 }
616
617 /*
618 * Unicode 4.0.1 removes the third column that used to list the numeric type.
619 * Assume that either the data is the same as in UnicodeData.txt,
620 * or else that the numeric type is "numeric".
621 * This should work because we only expect to add numeric values for
622 * Han characters; for those, UnicodeData.txt lists only ranges without
623 * specific properties for single characters.
624 */
625
626 /* set the new numeric type and value */
627 newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
628 newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
629 /* the exponent may have been set above */
630 value=makeProps(&newProps);
631
632 for(; start<limit; ++start) {
633 oldProps32=getProps(start);
634 oldType=(int32_t)GET_NUMERIC_TYPE(oldProps32);
635
636 if(isFraction) {
637 if(oldType!=0) {
638 /* this code point was already listed with its numeric value in UnicodeData.txt */
639 continue;
640 } else {
641 fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
642 exit(U_PARSE_ERROR);
643 }
644 }
645
646 /*
647 * For simplicity, and because we only expect to set numeric values for Han characters,
648 * for now we only allow to set these values for Lo characters.
649 */
650 if(oldType==0 && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
651 fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
652 exit(U_PARSE_ERROR);
653 }
654
655 /* verify that we do not change an existing value (fractions were excluded above) */
656 if(oldType!=0) {
657 /* the code point already has a value stored */
658 if((oldProps32&0xff00)!=(value&0xff00)) {
659 fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
660 exit(U_PARSE_ERROR);
661 }
662 /* same value, continue */
663 } else {
664 /* the code point is getting a new numeric value */
665 if(beVerbose) {
666 printf("adding U+%04x numeric type %d value 0x%04x from %s\n", (int)start, U_NT_NUMERIC, (int)value, fields[0][0]);
667 }
668
669 addProps(start, value|GET_CATEGORY(oldProps32));
670 }
671 }
672 }
673
674 /* data serialization ------------------------------------------------------- */
675
676 U_CFUNC int32_t
writeAdditionalData(FILE * f,uint8_t * p,int32_t capacity,int32_t indexes[UPROPS_INDEX_COUNT])677 writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
678 int32_t length;
679 UErrorCode errorCode;
680
681 errorCode=U_ZERO_ERROR;
682 length=utrie_serialize(trie, p, capacity, NULL, TRUE, &errorCode);
683 if(U_FAILURE(errorCode)) {
684 fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
685 exit(errorCode);
686 }
687 if(p!=NULL) {
688 if(beVerbose) {
689 printf("size in bytes of additional props trie:%5u\n", (int)length);
690 }
691 if(f!=NULL) {
692 UTrie trie2={ NULL };
693 utrie_unserialize(&trie2, p, length, &errorCode);
694 if(U_FAILURE(errorCode)) {
695 fprintf(
696 stderr,
697 "genprops error: failed to utrie_unserialize(trie for additional properties) - %s\n",
698 u_errorName(errorCode));
699 exit(errorCode);
700 }
701 usrc_writeUTrieArrays(f,
702 "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
703 &trie2,
704 "\n};\n\n");
705 usrc_writeUTrieStruct(f,
706 "static const UTrie propsVectorsTrie={\n",
707 &trie2, "propsVectorsTrie_index", NULL, NULL,
708 "};\n\n");
709 }
710
711 p+=length;
712 capacity-=length;
713
714 /* set indexes */
715 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
716 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
717 indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
718 indexes[UPROPS_RESERVED_INDEX]=
719 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
720
721 indexes[UPROPS_MAX_VALUES_INDEX]=
722 (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
723 (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
724 (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
725 ((int32_t)USCRIPT_CODE_LIMIT-1);
726 indexes[UPROPS_MAX_VALUES_2_INDEX]=
727 (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
728 (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
729 (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
730 ((int32_t)U_DT_COUNT-1);
731 }
732
733 if(p!=NULL && (pvCount*4)<=capacity) {
734 if(f!=NULL) {
735 usrc_writeArray(f,
736 "static const uint32_t propsVectors[%ld]={\n",
737 pv, 32, pvCount,
738 "};\n\n");
739 fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
740 fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);
741 } else {
742 uprv_memcpy(p, pv, pvCount*4);
743 }
744 if(beVerbose) {
745 printf("number of additional props vectors: %5u\n", (int)pvCount/UPROPS_VECTOR_WORDS);
746 printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
747 }
748 }
749 length+=pvCount*4;
750
751 return length;
752 }
753