1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: props2.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002feb24
14 * created by: Markus W. Scherer
15 *
16 * Parse more Unicode Character Database files and store
17 * additional Unicode character properties in bit set vectors.
18 */
19
20 #include <stdio.h>
21 #include "unicode/utypes.h"
22 #include "unicode/uchar.h"
23 #include "unicode/uscript.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "utrie.h"
27 #include "uprops.h"
28 #include "propsvec.h"
29 #include "uparse.h"
30 #include "writesrc.h"
31 #include "genprops.h"
32
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression wherever it is used (e.g. as the operand of sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
34
35 /* data --------------------------------------------------------------------- */
36
37 static UNewTrie *newTrie;
38 UPropsVectors *pv;
39
40 /* miscellaneous ------------------------------------------------------------ */
41
/*
 * Trim the field [s, limit) in place.
 * Leading whitespace is skipped via u_skipWhitespace(); trailing spaces and
 * tabs are cut off by storing a NUL over the first of them (or at limit
 * itself when there are none).
 * Returns the start of the trimmed, NUL-terminated field.
 */
static char *
trimTerminateField(char *s, char *limit) {
    char *first=(char *)u_skipWhitespace(s);
    char *stop=limit;

    /* walk backwards over trailing blanks, never past the first kept character */
    for(; first<stop; --stop) {
        char last=stop[-1];
        if(last!=' ' && last!='\t') {
            break;
        }
    }
    *stop=0;

    return first;
}
55
56 static void
parseTwoFieldFile(char * filename,char * basename,const char * ucdFile,const char * suffix,UParseLineFn * lineFn,UErrorCode * pErrorCode)57 parseTwoFieldFile(char *filename, char *basename,
58 const char *ucdFile, const char *suffix,
59 UParseLineFn *lineFn,
60 UErrorCode *pErrorCode) {
61 char *fields[2][2];
62
63 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
64 return;
65 }
66
67 writeUCDFilename(basename, ucdFile, suffix);
68
69 u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
70 if(U_FAILURE(*pErrorCode)) {
71 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
72 }
73 }
74
75 static void U_CALLCONV
76 ageLineFn(void *context,
77 char *fields[][2], int32_t fieldCount,
78 UErrorCode *pErrorCode);
79
80 static void
parseMultiFieldFile(char * filename,char * basename,const char * ucdFile,const char * suffix,int32_t fieldCount,UParseLineFn * lineFn,UErrorCode * pErrorCode)81 parseMultiFieldFile(char *filename, char *basename,
82 const char *ucdFile, const char *suffix,
83 int32_t fieldCount,
84 UParseLineFn *lineFn,
85 UErrorCode *pErrorCode) {
86 char *fields[20][2];
87
88 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
89 return;
90 }
91
92 writeUCDFilename(basename, ucdFile, suffix);
93
94 u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode);
95 if(U_FAILURE(*pErrorCode)) {
96 fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
97 }
98 }
99
100 static void U_CALLCONV
101 numericLineFn(void *context,
102 char *fields[][2], int32_t fieldCount,
103 UErrorCode *pErrorCode);
104
105 /* parse files with single enumerated properties ---------------------------- */
106
107 struct SingleEnum {
108 const char *ucdFile, *propName;
109 UProperty prop;
110 int32_t vecWord, vecShift;
111 uint32_t vecMask;
112 };
113 typedef struct SingleEnum SingleEnum;
114
115 static void
116 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
117 const SingleEnum *sen,
118 UErrorCode *pErrorCode);
119
120 static const SingleEnum scriptSingleEnum={
121 "Scripts", "script",
122 UCHAR_SCRIPT,
123 0, 0, UPROPS_SCRIPT_MASK
124 };
125
126 static const SingleEnum blockSingleEnum={
127 "Blocks", "block",
128 UCHAR_BLOCK,
129 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
130 };
131
132 static const SingleEnum graphemeClusterBreakSingleEnum={
133 "GraphemeBreakProperty", "Grapheme_Cluster_Break",
134 UCHAR_GRAPHEME_CLUSTER_BREAK,
135 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK
136 };
137
138 static const SingleEnum wordBreakSingleEnum={
139 "WordBreakProperty", "Word_Break",
140 UCHAR_WORD_BREAK,
141 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK
142 };
143
144 static const SingleEnum sentenceBreakSingleEnum={
145 "SentenceBreakProperty", "Sentence_Break",
146 UCHAR_SENTENCE_BREAK,
147 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK
148 };
149
150 static const SingleEnum lineBreakSingleEnum={
151 "LineBreak", "line break",
152 UCHAR_LINE_BREAK,
153 UPROPS_LB_VWORD, UPROPS_LB_SHIFT, UPROPS_LB_MASK
154 };
155
156 static const SingleEnum eawSingleEnum={
157 "EastAsianWidth", "east asian width",
158 UCHAR_EAST_ASIAN_WIDTH,
159 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
160 };
161
162 static void U_CALLCONV
singleEnumLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)163 singleEnumLineFn(void *context,
164 char *fields[][2], int32_t fieldCount,
165 UErrorCode *pErrorCode) {
166 const SingleEnum *sen;
167 char *s;
168 uint32_t start, end, uv;
169 int32_t value;
170
171 sen=(const SingleEnum *)context;
172
173 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
174 if(U_FAILURE(*pErrorCode)) {
175 fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
176 exit(*pErrorCode);
177 }
178
179 /* parse property alias */
180 s=trimTerminateField(fields[1][0], fields[1][1]);
181 value=u_getPropertyValueEnum(sen->prop, s);
182 if(value<0) {
183 if(sen->prop==UCHAR_BLOCK) {
184 if(isToken("Greek", s)) {
185 value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
186 } else if(isToken("Combining Marks for Symbols", s)) {
187 value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
188 } else if(isToken("Private Use", s)) {
189 value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
190 }
191 }
192 }
193 if(value<0) {
194 fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n",
195 sen->propName, sen->ucdFile, s);
196 exit(U_PARSE_ERROR);
197 }
198
199 uv=(uint32_t)(value<<sen->vecShift);
200 if((uv&sen->vecMask)!=uv) {
201 fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n",
202 sen->propName, (int)uv, s);
203 exit(U_INTERNAL_PROGRAM_ERROR);
204 }
205
206 if(start==0 && end==0x10ffff) {
207 /* Also set bits for initialValue and errorValue. */
208 end=UPVEC_MAX_CP;
209 }
210 upvec_setValue(pv, start, end, sen->vecWord, uv, sen->vecMask, pErrorCode);
211 if(U_FAILURE(*pErrorCode)) {
212 fprintf(stderr, "genprops error: unable to set %s code: %s\n",
213 sen->propName, u_errorName(*pErrorCode));
214 exit(*pErrorCode);
215 }
216 }
217
218 static void
parseSingleEnumFile(char * filename,char * basename,const char * suffix,const SingleEnum * sen,UErrorCode * pErrorCode)219 parseSingleEnumFile(char *filename, char *basename, const char *suffix,
220 const SingleEnum *sen,
221 UErrorCode *pErrorCode) {
222 char *fields[2][2];
223
224 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
225 return;
226 }
227
228 writeUCDFilename(basename, sen->ucdFile, suffix);
229
230 u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
231 if(U_FAILURE(*pErrorCode)) {
232 fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
233 }
234 }
235
236 /* parse files with multiple binary properties ------------------------------ */
237
/* One binary property: its name in the UCD file and the vector bit that stores it. */
typedef struct Binary {
    const char *propName;   /* property name as matched against field 1 of a line */
    int32_t vecWord;        /* index of the properties vector word holding the bit */
    int32_t vecShift;       /* bit number inside that word */
} Binary;

/* All binary properties that are read from one UCD file. */
typedef struct Binaries {
    const char *ucdFile;    /* UCD file name without ".txt" */
    const Binary *binaries; /* table of recognized properties */
    int32_t binariesCount;  /* number of entries in binaries[] */
} Binaries;
250
251 static const Binary
252 propListNames[]={
253 { "White_Space", 1, UPROPS_WHITE_SPACE },
254 { "Dash", 1, UPROPS_DASH },
255 { "Hyphen", 1, UPROPS_HYPHEN },
256 { "Quotation_Mark", 1, UPROPS_QUOTATION_MARK },
257 { "Terminal_Punctuation", 1, UPROPS_TERMINAL_PUNCTUATION },
258 { "Hex_Digit", 1, UPROPS_HEX_DIGIT },
259 { "ASCII_Hex_Digit", 1, UPROPS_ASCII_HEX_DIGIT },
260 { "Ideographic", 1, UPROPS_IDEOGRAPHIC },
261 { "Diacritic", 1, UPROPS_DIACRITIC },
262 { "Extender", 1, UPROPS_EXTENDER },
263 { "Noncharacter_Code_Point", 1, UPROPS_NONCHARACTER_CODE_POINT },
264 { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK },
265 { "IDS_Binary_Operator", 1, UPROPS_IDS_BINARY_OPERATOR },
266 { "IDS_Trinary_Operator", 1, UPROPS_IDS_TRINARY_OPERATOR },
267 { "Radical", 1, UPROPS_RADICAL },
268 { "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH },
269 { "Deprecated", 1, UPROPS_DEPRECATED },
270 { "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
271
272 /* new properties in Unicode 4.0.1 */
273 { "STerm", 1, UPROPS_S_TERM },
274 { "Variation_Selector", 1, UPROPS_VARIATION_SELECTOR },
275
276 /* new properties in Unicode 4.1 */
277 { "Pattern_Syntax", 1, UPROPS_PATTERN_SYNTAX },
278 { "Pattern_White_Space", 1, UPROPS_PATTERN_WHITE_SPACE }
279 };
280
281 static const Binaries
282 propListBinaries={
283 "PropList", propListNames, LENGTHOF(propListNames)
284 };
285
286 static const Binary
287 derCorePropsNames[]={
288 { "XID_Start", 1, UPROPS_XID_START },
289 { "XID_Continue", 1, UPROPS_XID_CONTINUE },
290
291 /* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
292 { "Math", 1, UPROPS_MATH },
293 { "Alphabetic", 1, UPROPS_ALPHABETIC },
294 { "Grapheme_Extend", 1, UPROPS_GRAPHEME_EXTEND },
295 { "Default_Ignorable_Code_Point", 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
296
297 /* new properties bits in ICU 2.6/format version 3.2 */
298 { "ID_Start", 1, UPROPS_ID_START },
299 { "ID_Continue", 1, UPROPS_ID_CONTINUE },
300 { "Grapheme_Base", 1, UPROPS_GRAPHEME_BASE },
301
302 /*
303 * Unicode 5/ICU 3.6 moves Grapheme_Link from PropList.txt
304 * to DerivedCoreProperties.txt and deprecates it.
305 */
306 { "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK }
307 };
308
309 static const Binaries
310 derCorePropsBinaries={
311 "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
312 };
313
/*
 * Distinct names of unrecognized binary properties seen while parsing the
 * current file (reset by parseBinariesFile()); used only for verbose output.
 */
static char ignoredProps[100][64];
static int32_t ignoredPropsCount;

/*
 * Remember the property name in [s, limit) as ignored, unless it is
 * already in the list.
 * The field is trimmed and NUL-terminated in place.
 * A name that does not fit (list full, or name too long for one slot) is
 * reported on stderr and not stored, instead of overrunning ignoredProps.
 */
static void
addIgnoredProp(char *s, char *limit) {
    int32_t i;

    s=trimTerminateField(s, limit);
    for(i=0; i<ignoredPropsCount; ++i) {
        if(0==uprv_strcmp(ignoredProps[i], s)) {
            return;
        }
    }

    /* bounds checks: one free slot, and room for the name plus its NUL */
    if( ignoredPropsCount>=(int32_t)(sizeof(ignoredProps)/sizeof(ignoredProps[0])) ||
        uprv_strlen(s)>=sizeof(ignoredProps[0])
    ) {
        fprintf(stderr, "genprops: unable to remember ignored property %s (list full or name too long)\n", s);
        return;
    }
    uprv_strcpy(ignoredProps[ignoredPropsCount++], s);
}
329
330 static void U_CALLCONV
binariesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)331 binariesLineFn(void *context,
332 char *fields[][2], int32_t fieldCount,
333 UErrorCode *pErrorCode) {
334 const Binaries *bin;
335 char *s;
336 uint32_t start, end, uv;
337 int32_t i;
338
339 bin=(const Binaries *)context;
340
341 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
342 if(U_FAILURE(*pErrorCode)) {
343 fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
344 exit(*pErrorCode);
345 }
346
347 /* parse binary property name */
348 s=(char *)u_skipWhitespace(fields[1][0]);
349 for(i=0;; ++i) {
350 if(i==bin->binariesCount) {
351 /* ignore unrecognized properties */
352 if(beVerbose) {
353 addIgnoredProp(s, fields[1][1]);
354 }
355 return;
356 }
357 if(isToken(bin->binaries[i].propName, s)) {
358 break;
359 }
360 }
361
362 if(bin->binaries[i].vecShift>=32) {
363 fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
364 (int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName);
365 exit(U_INTERNAL_PROGRAM_ERROR);
366 }
367 uv=U_MASK(bin->binaries[i].vecShift);
368
369 if(start==0 && end==0x10ffff) {
370 /* Also set bits for initialValue and errorValue. */
371 end=UPVEC_MAX_CP;
372 }
373 upvec_setValue(pv, start, end, bin->binaries[i].vecWord, uv, uv, pErrorCode);
374 if(U_FAILURE(*pErrorCode)) {
375 fprintf(stderr, "genprops error: unable to set %s code: %s\n",
376 bin->binaries[i].propName, u_errorName(*pErrorCode));
377 exit(*pErrorCode);
378 }
379 }
380
381 static void
parseBinariesFile(char * filename,char * basename,const char * suffix,const Binaries * bin,UErrorCode * pErrorCode)382 parseBinariesFile(char *filename, char *basename, const char *suffix,
383 const Binaries *bin,
384 UErrorCode *pErrorCode) {
385 char *fields[2][2];
386 int32_t i;
387
388 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
389 return;
390 }
391
392 writeUCDFilename(basename, bin->ucdFile, suffix);
393
394 ignoredPropsCount=0;
395
396 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
397 if(U_FAILURE(*pErrorCode)) {
398 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
399 }
400
401 if(beVerbose) {
402 for(i=0; i<ignoredPropsCount; ++i) {
403 printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
404 }
405 }
406 }
407
408 /* -------------------------------------------------------------------------- */
409
410 U_CFUNC void
initAdditionalProperties()411 initAdditionalProperties() {
412 UErrorCode errorCode=U_ZERO_ERROR;
413 pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
414 if(U_FAILURE(errorCode)) {
415 fprintf(stderr, "error: upvec_open() failed - %s\n", u_errorName(errorCode));
416 exit(errorCode);
417 }
418 }
419
420 U_CFUNC void
exitAdditionalProperties()421 exitAdditionalProperties() {
422 utrie_close(newTrie);
423 upvec_close(pv);
424 }
425
426 U_CFUNC void
generateAdditionalProperties(char * filename,const char * suffix,UErrorCode * pErrorCode)427 generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
428 char *basename;
429
430 basename=filename+uprv_strlen(filename);
431
432 /* process various UCD .txt files */
433
434 /* add Han numeric types & values */
435 parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
436
437 parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
438
439 /*
440 * UTR 24 says:
441 * Section 2:
442 * "Common - For characters that may be used
443 * within multiple scripts,
444 * or any unassigned code points."
445 *
446 * Section 4:
447 * "The value COMMON is the default value,
448 * given to all code points that are not
449 * explicitly mentioned in the data file."
450 *
451 * COMMON==USCRIPT_COMMON==0 - nothing to do
452 */
453 parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
454
455 parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
456
457 parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
458
459 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode);
460
461 parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode);
462
463 parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode);
464
465 parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode);
466
467 /*
468 * LineBreak-4.0.0.txt:
469 * - All code points, assigned and unassigned, that are not listed
470 * explicitly are given the value "XX".
471 *
472 * XX==U_LB_UNKNOWN==0 - nothing to do
473 */
474 parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
475
476 /*
477 * Preset East Asian Width defaults:
478 *
479 * http://www.unicode.org/reports/tr11/#Unassigned
480 * 7.1 Unassigned and Private Use characters
481 *
482 * All unassigned characters are by default classified as non-East Asian neutral,
483 * except for the range U+20000 to U+2FFFD,
484 * since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W).
485 * All Private use characters are by default classified as ambiguous,
486 * since their definition depends on context.
487 *
488 * N for all ==0 - nothing to do
489 * A for Private Use
490 * W for plane 2
491 */
492 *pErrorCode=U_ZERO_ERROR;
493 upvec_setValue(pv, 0xe000, 0xf8ff, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
494 upvec_setValue(pv, 0xf0000, 0xffffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
495 upvec_setValue(pv, 0x100000, 0x10fffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
496 upvec_setValue(pv, 0x20000, 0x2fffd, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
497 if(U_FAILURE(*pErrorCode)) {
498 fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
499 exit(*pErrorCode);
500 }
501
502 /* parse EastAsianWidth.txt */
503 parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);
504
505 {
506 UPVecToUTrieContext toUTrie={ NULL, 50000 /* capacity */, 0, TRUE /* latin1Linear */ };
507 upvec_compact(pv, upvec_compactToUTrieHandler, &toUTrie, pErrorCode);
508 if(U_FAILURE(*pErrorCode)) {
509 fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n",
510 u_errorName(*pErrorCode));
511 exit(*pErrorCode);
512 }
513 newTrie=toUTrie.newTrie;
514 }
515 }
516
517 /* DerivedAge.txt ----------------------------------------------------------- */
518
519 static void U_CALLCONV
ageLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)520 ageLineFn(void *context,
521 char *fields[][2], int32_t fieldCount,
522 UErrorCode *pErrorCode) {
523 char *s, *numberLimit;
524 uint32_t value, start, end, version;
525
526 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
527 if(U_FAILURE(*pErrorCode)) {
528 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
529 exit(*pErrorCode);
530 }
531
532 /* ignore "unassigned" (the default is already set to 0.0) */
533 s=(char *)u_skipWhitespace(fields[1][0]);
534 if(0==uprv_strncmp(s, "unassigned", 10)) {
535 return;
536 }
537
538 /* parse version number */
539 value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
540 if(s==numberLimit || value==0 || value>15 || (*numberLimit!='.' && *numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) {
541 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
542 *pErrorCode=U_PARSE_ERROR;
543 exit(U_PARSE_ERROR);
544 }
545 version=value<<4;
546
547 /* parse minor version number */
548 if(*numberLimit=='.') {
549 s=(char *)u_skipWhitespace(numberLimit+1);
550 value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
551 if(s==numberLimit || value>15 || (*numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) {
552 fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
553 *pErrorCode=U_PARSE_ERROR;
554 exit(U_PARSE_ERROR);
555 }
556 version|=value;
557 }
558
559 if(start==0 && end==0x10ffff) {
560 /* Also set bits for initialValue and errorValue. */
561 end=UPVEC_MAX_CP;
562 }
563 upvec_setValue(pv, start, end, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode);
564 if(U_FAILURE(*pErrorCode)) {
565 fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
566 exit(*pErrorCode);
567 }
568 }
569
570 /* DerivedNumericValues.txt ------------------------------------------------- */
571
572 static void U_CALLCONV
numericLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)573 numericLineFn(void *context,
574 char *fields[][2], int32_t fieldCount,
575 UErrorCode *pErrorCode) {
576 Props newProps={ 0 };
577 char *s, *numberLimit;
578 uint32_t start, end, value, oldProps32;
579 char c;
580 UBool isFraction;
581
582 /* get the code point range */
583 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
584 if(U_FAILURE(*pErrorCode)) {
585 fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]);
586 exit(*pErrorCode);
587 }
588
589 /*
590 * Ignore the
591 * # @missing: 0000..10FFFF; NaN
592 * line from Unicode 5.1's DerivedNumericValues.txt:
593 * The following code cannot parse "NaN", and we don't want to overwrite
594 * the numeric values for all characters after reading most
595 * from UnicodeData.txt already.
596 */
597 if(start==0 && end==0x10ffff) {
598 return;
599 }
600
601 /* check if the numeric value is a fraction (this code does not handle any) */
602 isFraction=FALSE;
603 s=uprv_strchr(fields[1][0], '.');
604 if(s!=NULL) {
605 numberLimit=s+1;
606 while('0'<=(c=*numberLimit++) && c<='9') {
607 if(c!='0') {
608 isFraction=TRUE;
609 break;
610 }
611 }
612 }
613
614 if(isFraction) {
615 value=0;
616 } else {
617 /* parse numeric value */
618 s=(char *)u_skipWhitespace(fields[1][0]);
619
620 /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
621 if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
622 /* large integers are encoded in a special way, see store.c */
623 uint8_t exp=0;
624
625 value=s[0]-'0';
626 numberLimit=s;
627 while(*(++numberLimit)=='0') {
628 ++exp;
629 }
630 newProps.exponent=exp;
631 } else {
632 /* normal number parsing */
633 value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
634 }
635 if(numberLimit<=s || (*numberLimit!='.' && u_skipWhitespace(numberLimit)!=fields[1][1]) || value>=0x80000000) {
636 fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]);
637 exit(U_PARSE_ERROR);
638 }
639 }
640
641 /*
642 * Unicode 4.0.1 removes the third column that used to list the numeric type.
643 * Assume that either the data is the same as in UnicodeData.txt,
644 * or else that the numeric type is "numeric".
645 * This should work because we only expect to add numeric values for
646 * Han characters; for those, UnicodeData.txt lists only ranges without
647 * specific properties for single characters.
648 */
649
650 /* set the new numeric value */
651 newProps.code=start;
652 newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
653 /* the exponent may have been set above */
654
655 for(; start<=end; ++start) {
656 uint32_t newProps32;
657 int32_t oldNtv;
658 oldProps32=getProps(start);
659 oldNtv=(int32_t)GET_NUMERIC_TYPE_VALUE(oldProps32);
660
661 if(isFraction) {
662 if(UPROPS_NTV_FRACTION_START<=oldNtv && oldNtv<UPROPS_NTV_LARGE_START) {
663 /* this code point was already listed with its numeric value in UnicodeData.txt */
664 continue;
665 } else {
666 fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
667 exit(U_PARSE_ERROR);
668 }
669 }
670
671 /*
672 * For simplicity, and because we only expect to set numeric values for Han characters,
673 * for now we only allow to set these values for Lo characters.
674 */
675 if(oldNtv==UPROPS_NTV_NONE && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
676 fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
677 exit(U_PARSE_ERROR);
678 }
679
680 /* verify that we do not change an existing value (fractions were excluded above) */
681 if(oldNtv!=UPROPS_NTV_NONE) {
682 /* the code point already has a value stored */
683 newProps.numericType=UPROPS_NTV_GET_TYPE(oldNtv);
684 newProps32=makeProps(&newProps);
685 if(oldNtv!=GET_NUMERIC_TYPE_VALUE(newProps32)) {
686 fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
687 exit(U_PARSE_ERROR);
688 }
689 /* same value, continue */
690 } else {
691 /* the code point is getting a new numeric value */
692 newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
693 newProps32=makeProps(&newProps);
694 if(beVerbose) {
695 printf("adding U+%04x numeric type %d encoded-numeric-type-value 0x%03x from %s\n",
696 (int)start, U_NT_NUMERIC, (int)GET_NUMERIC_TYPE_VALUE(newProps32), fields[0][0]);
697 }
698
699 addProps(start, newProps32|GET_CATEGORY(oldProps32));
700 }
701 }
702 }
703
704 /* data serialization ------------------------------------------------------- */
705
706 U_CFUNC int32_t
writeAdditionalData(FILE * f,uint8_t * p,int32_t capacity,int32_t indexes[UPROPS_INDEX_COUNT])707 writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
708 const uint32_t *pvArray;
709 int32_t pvRows, pvCount;
710 int32_t length;
711 UErrorCode errorCode;
712
713 pvArray=upvec_getArray(pv, &pvRows, NULL);
714 pvCount=pvRows*UPROPS_VECTOR_WORDS;
715
716 errorCode=U_ZERO_ERROR;
717 length=utrie_serialize(newTrie, p, capacity, NULL, TRUE, &errorCode);
718 if(U_FAILURE(errorCode)) {
719 fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
720 exit(errorCode);
721 }
722 if(p!=NULL) {
723 if(beVerbose) {
724 printf("size in bytes of additional props trie:%5u\n", (int)length);
725 }
726 if(f!=NULL) {
727 UTrie trie={ NULL };
728 UTrie2 *trie2;
729
730 utrie_unserialize(&trie, p, length, &errorCode);
731 if(U_FAILURE(errorCode)) {
732 fprintf(
733 stderr,
734 "genprops error: failed to utrie_unserialize(trie for additional properties) - %s\n",
735 u_errorName(errorCode));
736 exit(errorCode);
737 }
738
739 /* use UTrie2 */
740 trie2=utrie2_fromUTrie(&trie, trie.initialValue, &errorCode);
741 if(U_FAILURE(errorCode)) {
742 fprintf(
743 stderr,
744 "genprops error: utrie2_fromUTrie() failed - %s\n",
745 u_errorName(errorCode));
746 exit(errorCode);
747 }
748 {
749 /* delete lead surrogate code unit values */
750 UChar lead;
751 trie2=utrie2_cloneAsThawed(trie2, &errorCode);
752 for(lead=0xd800; lead<0xdc00; ++lead) {
753 utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
754 }
755 utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
756 if(U_FAILURE(errorCode)) {
757 fprintf(
758 stderr,
759 "genbidi error: deleting lead surrogate code unit values failed - %s\n",
760 u_errorName(errorCode));
761 exit(errorCode);
762 }
763 }
764
765 usrc_writeUTrie2Arrays(f,
766 "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
767 trie2,
768 "\n};\n\n");
769 usrc_writeUTrie2Struct(f,
770 "static const UTrie2 propsVectorsTrie={\n",
771 trie2, "propsVectorsTrie_index", NULL,
772 "};\n\n");
773
774 utrie2_close(trie2);
775 }
776
777 p+=length;
778 capacity-=length;
779
780 /* set indexes */
781 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
782 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
783 indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
784 indexes[UPROPS_RESERVED_INDEX]=
785 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
786
787 indexes[UPROPS_MAX_VALUES_INDEX]=
788 (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
789 (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
790 (((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
791 indexes[UPROPS_MAX_VALUES_2_INDEX]=
792 (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
793 (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
794 (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
795 (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
796 ((int32_t)U_DT_COUNT-1);
797 }
798
799 if(p!=NULL && (pvCount*4)<=capacity) {
800 if(f!=NULL) {
801 usrc_writeArray(f,
802 "static const uint32_t propsVectors[%ld]={\n",
803 pvArray, 32, pvCount,
804 "};\n\n");
805 fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
806 fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);
807 } else {
808 uprv_memcpy(p, pvArray, pvCount*4);
809 }
810 if(beVerbose) {
811 printf("number of additional props vectors: %5u\n", (int)pvRows);
812 printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
813 }
814 }
815 length+=pvCount*4;
816
817 return length;
818 }
819