1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1999-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: corepropsbuilder.cpp (was store.c & props2.cpp)
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 1999dec11
16 * created by: Markus W. Scherer
17 *
18 * Store Unicode character properties efficiently for
19 * random access.
20 */
21
22 #include <stdio.h>
23 #include "unicode/utypes.h"
24 #include "unicode/uchar.h"
25 #include "unicode/udata.h"
26 #include "unicode/uniset.h"
27 #include "unicode/unistr.h"
28 #include "unicode/usetiter.h"
29 #include "unicode/uscript.h"
30 #include "cmemory.h"
31 #include "cstring.h"
32 #include "genprops.h"
33 #include "propsvec.h"
34 #include "uassert.h"
35 #include "unewdata.h"
36 #include "uprops.h"
37 #include "utrie2.h"
38 #include "writesrc.h"
39
40 /* Unicode character properties file format ------------------------------------
41
42 The file format prepared and written here contains several data
43 structures that store indexes or data.
44
45 Before the data contents described below, there are the headers required by
46 the udata API for loading ICU data. Especially, a UDataInfo structure
47 precedes the actual data. It contains platform properties values and the
48 file format version.
49
50 The following is a description of format version 7.8 .
51
52 Data contents:
53
The contents are a parsed, binary form of several Unicode character
database files, most prominently UnicodeData.txt.
56
Any Unicode code point from 0 to 0x10ffff can be looked up to get
the properties, if any, for that code point. This means that the inputs
to the lookup are 21-bit unsigned integers, with not all of the
21-bit range used.
61
62 It is assumed that client code keeps a uint32_t pointer
63 to the beginning of the data:
64
65 const uint32_t *p32;
66
67 Formally, the file contains the following structures:
68
69 const int32_t indexes[16] with values i0..i15:
70
71 i0 indicates the length of the main trie.
72 i0..i3 all have the same value in format versions 4.0 and higher;
73 the related props32[] and exceptions[] and uchars[] were used in format version 3
74
75 i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
76 i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
77 i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
78
79 i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
80 i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
81 i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
82
83 i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data
84 i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data
85 i8 reservedIndex8; -- for now: i7, i8 and i9 have the same values
86 i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header)
87
88 i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
89 i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
90 i12..i15 reservedIndexes; -- reserved values; 0 for now
91
92 PT serialized properties trie, see utrie2.h (byte size: 4*(i0-16))
93
94 P, E, and U are not used (empty) in format versions 4 and above
95
96 P const uint32_t props32[i1-i0];
97 E const uint32_t exceptions[i2-i1];
98 U const UChar uchars[2*(i3-i2)];
99
100 AT serialized trie for additional properties (byte size: 4*(i4-i3))
101 PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
102
103 SCX const uint16_t scriptExtensions[2*(i7-i6)];
104
105 SCX contains Script_Extensions lists and (Script code, Script_Extensions index) pairs.
106 A Script_Extensions list is a sequence of UScriptCode values in ascending order,
107 with the last code having bit 15 set for termination.
108 A (Script code, Script_Extensions index) pair is the main UScriptCode (Script value)
109 followed by the index of the Script_Extensions list.
110 If the propsVectors[] column 0 value indicates that there are Script_Extensions,
111 then the script-code-or-index bit fields are an index to either a list or a pair in SCX,
112 rather than the Script itself. The high bits in the UPROPS_SCRIPT_X_MASK fields
113 indicate whether the main Script value is Common or Inherited (and the index is to a list)
114 vs. another value (and the index is to a pair).
115 (See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.)
116
117 Trie lookup and properties:
118
119 In order to condense the data for the 21-bit code space, several properties of
120 the Unicode code assignment are exploited:
121 - The code space is sparse.
- There are runs of tens of thousands of consecutive code points with the same properties.
123 - Characters and scripts are allocated in groups of 16 code points.
124 - Inside blocks for scripts the properties are often repetitive.
125 - The 21-bit space is not fully used for Unicode.
126
The lookup of properties for a given code point is done with a trie lookup,
using the UTrie2 implementation (UTrie before format version 7).
The trie lookup result is a 16-bit properties word.

With a given Unicode code point

UChar32 c;

and 0<=c<0x110000, the lookup is done like this:

uint16_t props=UTRIE2_GET16(trie, c);
139
140 Each 16-bit properties word contains:
141
142 0.. 4 general category
143 5 reserved
144 6..15 numeric type and value (ntv)
145
146 Encoding of numeric type and value in the 10-bit ntv field:
147 ntv type value
148 0 U_NT_NONE 0
149 1..10 U_NT_DECIMAL 0..9
150 11..20 U_NT_DIGIT 0..9
151 21..0x3ff U_NT_NUMERIC see below
152
153 For U_NT_NUMERIC:
154 ntv value
155 21..0xaf integer 0..154
156 0xb0..0x1df fraction ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16
157 0x1e0..0x2ff large int ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
158 (only one significant decimal digit)
159 0x300..0x323 base-60 (sexagesimal) integer (new in format version 7.1)
160 ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
161 0x324..0x34b fraction-20 (new in format version 7.3)
162 frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
163 numerator: num = 2*(frac20&3)+1
164 denominator: den = 20<<(frac20>>2)
165 0x34c..0x35b fraction-32 (new in format version 7.6)
166 frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256
167 numerator: num = 2*(frac32&3)+1
168 denominator: den = 32<<(frac32>>2)
169 0x35c..0x3ff reserved
170
171 --- Additional properties (new in format version 2.1) ---
172
The second trie for additional properties (AT) is also a UTrie2 (UTrie before format version 7) with 16-bit data.
174 The data words consist of 32-bit unit indexes (not row indexes!) into the
175 table of unique properties vectors (PV).
176 Each vector contains a set of properties.
177 The width of a vector (number of uint32_t per row) may change
178 with the formatVersion, it is stored in i5.
179
180 Current properties: see icu/source/common/uprops.h
181
182 --- Changes in format version 3.1 ---
183
184 See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
185
186 --- Changes in format version 3.2 ---
187
188 - The tries use linear Latin-1 ranges.
189 - The additional properties bits store full properties XYZ instead
190 of partial Other_XYZ, so that changes in the derivation formulas
191 need not be tracked in runtime library code.
192 - Joining Type and Line Break are also stored completely, so that uprops.c
193 needs no runtime formulas for enumerated properties either.
194 - Store the case-sensitive flag in the main properties word.
195 - i10 also contains U_LB_COUNT and U_EA_COUNT.
196 - i11 contains maxValues2 for vector word 2.
197
198 --- Changes in format version 4 ---
199
200 The format changes between version 3 and 4 because the properties related to
201 case mappings and bidi/shaping are pulled out into separate files
202 for modularization.
203 In order to reduce the need for code changes, some of the previous data
204 structures are omitted, rather than rearranging everything.
205
206 (The change to format version 4 is for ICU 3.4. The last CVS revision of
207 genprops/store.c for format version 3.2 is 1.48.)
208
209 The main trie's data is significantly simplified:
210 - The trie's 16-bit data word is used directly instead of as an index
211 into props32[].
212 - The trie uses the default trie folding functions instead of custom ones.
213 - Numeric values are stored directly in the trie data word, with special
214 encodings.
215 - No more exception data (the data that needed it was pulled out, or, in the
216 case of numeric values, encoded differently).
217 - No more string data (pulled out - was for case mappings).
218
219 Also, some of the previously used properties vector bits are reserved again.
220
221 The indexes[] values for the omitted structures are still filled in
222 (indicating zero-length arrays) so that the swapper code remains unchanged.
223
224 --- Changes in format version 5 ---
225
226 Format version 5 became necessary because the bit field for script codes
227 overflowed. The changes are incompatible because
228 old code would have seen nonsensically low values for new, higher script codes.
229
230 Rearranged bit fields in the second trie (AT) and widened three (Script, Block,
231 Word_Break) by one bit each.
232
233 Modified bit fields in icu/source/common/uprops.h
234
235 --- Changes in format version 6 ---
236
237 Format version 6 became necessary because Unicode 5.2 adds fractions with
238 denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
239 types and values rather than add another variant to the previous format.
240
241 --- Changes in format version 7 ---
242
243 Unicode 6.0 adds Script_Extensions. For characters with script extensions data,
244 the script code bits are an index into the new Script_Extensions array rather
245 than a script code.
246
247 Change from UTrie to UTrie2.
248
249 --- Changes in format version 7.1 ---
250
251 Unicode 6.2 adds sexagesimal (base-60) numeric values:
252 cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000
253 cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000
254
255 The encoding of numeric values was extended to handle such values.
256
257 --- Changes in format version 7.2 ---
258
259 ICU 57 adds 4 Emoji properties to vector word 2.
260 https://unicode-org.atlassian.net/browse/ICU-11802
261 http://www.unicode.org/reports/tr51/#Emoji_Properties
262
263 --- Changes in format version 7.3 ---
264
265 ICU 58 adds fraction-20 numeric values for new Unicode 9 Malayalam fraction characters.
266
267 --- Changes in format version 7.4 ---
268
269 ICU 60 adds the Prepended_Concatenation_Mark property to vector word 1.
270
271 ICU 60 adds the Emoji_Component property to vector word 2, for emoji 5.
272 https://unicode-org.atlassian.net/browse/ICU-13062
273 http://www.unicode.org/reports/tr51/#Emoji_Properties
274
275 --- Changes in format version 7.5 ---
276
277 ICU 62 adds the Extended_Pictographic property to vector word 2, for emoji 11.
278 http://www.unicode.org/reports/tr51/#Emoji_Properties
279
280 --- Changes in format version 7.6 ---
281
282 ICU 64 adds fraction-32 numeric values for new Unicode 12 Tamil fraction characters.
283
284 --- Changes in format version 7.7 ---
285
286 ICU 66 adds two bits for the UScriptCode or Script_Extensions index in vector word 0.
287 The value is split across bits 21..20 & 7..0.
288
289 --- Changes in format version 7.8 ---
290
291 ICU 70 moves the emoji properties from uprops.icu to (new) uemoji.icu.
292 The 6 bits in vector word 2 that stored emoji properties are unused again.
293
294 ----------------------------------------------------------------------------- */
295
296 U_NAMESPACE_USE
297
298 /* UDataInfo cf. udata.h */
299 static UDataInfo dataInfo={
300 sizeof(UDataInfo),
301 0,
302
303 U_IS_BIG_ENDIAN,
304 U_CHARSET_FAMILY,
305 U_SIZEOF_UCHAR,
306 0,
307
308 { 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
309 { 7, 8, 0, 0 }, /* formatVersion */
310 { 14, 0, 0, 0 } /* dataVersion */
311 };
312
splitScriptCodeOrIndex(uint32_t v)313 inline uint32_t splitScriptCodeOrIndex(uint32_t v) {
314 return
315 ((v << UPROPS_SCRIPT_HIGH_SHIFT) & UPROPS_SCRIPT_HIGH_MASK) |
316 (v & UPROPS_SCRIPT_LOW_MASK);
317 }
318
319 class CorePropsBuilder : public PropsBuilder {
320 public:
321 CorePropsBuilder(UErrorCode &errorCode);
322 virtual ~CorePropsBuilder();
323
324 virtual void setUnicodeVersion(const UVersionInfo version);
325 virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
326 virtual void build(UErrorCode &errorCode);
327 virtual void writeCSourceFile(const char *path, UErrorCode &errorCode);
328 virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);
329
330 private:
331 void setGcAndNumeric(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
332
333 UTrie2 *pTrie;
334 UTrie2 *props2Trie;
335 UPropsVectors *pv;
336 UnicodeString scriptExtensions;
337 };
338
CorePropsBuilder(UErrorCode & errorCode)339 CorePropsBuilder::CorePropsBuilder(UErrorCode &errorCode)
340 : pTrie(NULL), props2Trie(NULL), pv(NULL) {
341 pTrie=utrie2_open(0, 0, &errorCode);
342 if(U_FAILURE(errorCode)) {
343 fprintf(stderr, "genprops error: corepropsbuilder utrie2_open() failed - %s\n",
344 u_errorName(errorCode));
345 }
346 pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
347 if(U_FAILURE(errorCode)) {
348 fprintf(stderr, "genprops error: corepropsbuilder upvec_open() failed - %s\n",
349 u_errorName(errorCode));
350 }
351 }
352
~CorePropsBuilder()353 CorePropsBuilder::~CorePropsBuilder() {
354 utrie2_close(pTrie);
355 utrie2_close(props2Trie);
356 upvec_close(pv);
357 }
358
359 void
setUnicodeVersion(const UVersionInfo version)360 CorePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
361 uprv_memcpy(dataInfo.dataVersion, version, 4);
362 }
363
encodeFractional20(int32_t value,int32_t den)364 static int32_t encodeFractional20(int32_t value, int32_t den) {
365 if(den<20 || 640<den) { return -1; }
366 int32_t frac20;
367 if(value==1 || value==3 || value==5 || value==7) {
368 frac20=value/2;
369 } else {
370 return -1;
371 }
372 // Denominator: 20 times which power of 2: 0..5 into bits 4..2
373 do {
374 if(den==20) {
375 return UPROPS_NTV_FRACTION20_START+frac20;
376 }
377 if(den&1) {
378 return -1; // odd denominator, and we would lose the low bit in den/=2
379 }
380 den/=2;
381 frac20+=4;
382 } while(den>=20);
383 return -1;
384 }
385
encodeFractional32(int32_t value,int32_t den)386 static int32_t encodeFractional32(int32_t value, int32_t den) {
387 if(den<32 || 256<den) { return -1; }
388 int32_t frac32;
389 if(value==1 || value==3 || value==5 || value==7) {
390 frac32=value/2;
391 } else {
392 return -1;
393 }
394 // Denominator: 32 times which power of 2: 0..3 into bits 3..2
395 do {
396 if(den==32) {
397 return UPROPS_NTV_FRACTION32_START+frac32;
398 }
399 if(den&1) {
400 return -1; // odd denominator, and we would lose the low bit in den/=2
401 }
402 den/=2;
403 frac32+=4;
404 } while(den>=32);
405 return -1;
406 }
407
408 // For nt=U_NT_NUMERIC.
409 static int32_t
encodeNumericValue(UChar32 start,const char * s,UErrorCode & errorCode)410 encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
411 const char *original=s;
412 /* get a possible minus sign */
413 UBool isNegative;
414 if(*s=='-') {
415 isNegative=true;
416 ++s;
417 } else {
418 isNegative=false;
419 }
420
421 int32_t value=0, den=0, exp=0, ntv=0;
422 char *numberLimit;
423 /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
424 if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
425 value=s[0]-'0';
426 numberLimit=const_cast<char *>(s);
427 while(*(++numberLimit)=='0') {
428 ++exp;
429 }
430 } else {
431 /* normal number parsing */
432 unsigned long ul=uprv_strtoul(s, &numberLimit, 10);
433 if(s==numberLimit || (*numberLimit!=0 && *numberLimit!='/') || ul>0x7fffffff) {
434 ntv=-1;
435 } else {
436 value=(int32_t)ul;
437 }
438 if(ntv>=0 && *numberLimit=='/') {
439 /* fractional value, get the denominator */
440 s=numberLimit+1;
441 ul=uprv_strtoul(s, &numberLimit, 10);
442 if(s==numberLimit || *numberLimit!=0 || ul==0 || ul>0x7fffffff) {
443 ntv=-1;
444 } else {
445 den=(int32_t)ul;
446 }
447 }
448 }
449 if(isNegative) {
450 value=-(int32_t)value;
451 }
452
453 if(ntv<0) {
454 // pass
455 } else if(den==0 && value>=0) {
456 if(exp==2 && (value*100)<=UPROPS_NTV_MAX_SMALL_INT) {
457 /* small integer parsed like a large one */
458 ntv=UPROPS_NTV_NUMERIC_START+value*100;
459 } else if(exp==0) {
460 if(value<=UPROPS_NTV_MAX_SMALL_INT) {
461 /* small integer */
462 ntv=UPROPS_NTV_NUMERIC_START+value;
463 } else {
464 /* large integer parsed like a small one */
465 /* split the value into mantissa and exponent, base 10 */
466 int32_t mant=value;
467 while((mant%10)==0) {
468 mant/=10;
469 ++exp;
470 }
471 // Note: value<=0x7fffffff guarantees exp<=33
472 if(mant<=9) {
473 ntv=((mant+14)<<5)+(exp-2);
474 } else {
475 // Try sexagesimal (base 60) numbers.
476 mant=value;
477 exp=0;
478 while((mant%60)==0) {
479 mant/=60;
480 ++exp;
481 }
482 if(mant<=9 && exp<=4) {
483 ntv=((mant+0xbf)<<2)+(exp-1);
484 } else {
485 ntv=-1;
486 }
487 }
488 }
489 } else if(2<=exp && exp<=33 && 1<=value && value<=9) {
490 /* large, single-significant-digit integer */
491 ntv=((value+14)<<5)+(exp-2);
492 } else {
493 ntv=-1;
494 }
495 } else if(exp==0 && -1<=value && value<=17 && 1<=den && den<=16) {
496 /* fraction */
497 ntv=((value+12)<<4)+(den-1);
498 } else if(exp==0 && value==-1 && den==0) {
499 /* -1 parsed with den=0, encoded as pseudo-fraction -1/1 */
500 ntv=((value+12)<<4);
501 } else if(exp==0 && (ntv=encodeFractional20(value, den))>=0) {
502 // fits into fractional-20 format
503 } else if(exp==0 && (ntv=encodeFractional32(value, den))>=0) {
504 // fits into fractional-32 format
505 } else {
506 ntv=-1;
507 }
508 if(ntv<0 || *numberLimit!=0) {
509 fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", original);
510 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
511 }
512 return ntv;
513 }
514
515 void
setGcAndNumeric(const UniProps & props,const UnicodeSet & newValues,UErrorCode & errorCode)516 CorePropsBuilder::setGcAndNumeric(const UniProps &props, const UnicodeSet &newValues,
517 UErrorCode &errorCode) {
518 if(U_FAILURE(errorCode)) { return; }
519 UChar32 start=props.start;
520 UChar32 end=props.end;
521
522 int32_t type=props.getIntProp(UCHAR_NUMERIC_TYPE);
523 const char *nvString=props.numericValue;
524 if(type!=U_NT_NONE && nvString==NULL && start==end) {
525 fprintf(stderr, "genprops error: cp line has Numeric_Type but no Numeric_Value\n");
526 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
527 return;
528 }
529
530 if(!newValues.contains(UCHAR_GENERAL_CATEGORY) && !newValues.contains(UCHAR_NUMERIC_VALUE)) {
531 return;
532 }
533
534 int32_t ntv=UPROPS_NTV_NONE; // numeric type & value
535 if(nvString!=NULL && uprv_strcmp(nvString, "NaN")!=0) {
536 int32_t digitValue=props.digitValue;
537 if( type<=U_NT_NONE || U_NT_NUMERIC<type ||
538 ((type==U_NT_DECIMAL || type==U_NT_DIGIT) && digitValue<0)
539 ) {
540 fprintf(stderr, "genprops error: nt=%d but nv=%s\n",
541 (int)type, nvString==NULL ? "NULL" : nvString);
542 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
543 return;
544 }
545
546 switch(type) {
547 case U_NT_NONE:
548 ntv=UPROPS_NTV_NONE;
549 break;
550 case U_NT_DECIMAL:
551 ntv=UPROPS_NTV_DECIMAL_START+digitValue;
552 break;
553 case U_NT_DIGIT:
554 ntv=UPROPS_NTV_DIGIT_START+digitValue;
555 break;
556 case U_NT_NUMERIC:
557 if(digitValue>=0) {
558 ntv=UPROPS_NTV_NUMERIC_START+digitValue;
559 } else {
560 ntv=encodeNumericValue(start, nvString, errorCode);
561 if(U_FAILURE(errorCode)) {
562 return;
563 }
564 }
565 default:
566 break; // unreachable
567 }
568 }
569
570 uint32_t value=
571 (uint32_t)props.getIntProp(UCHAR_GENERAL_CATEGORY) |
572 (ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
573 if(start==end) {
574 utrie2_set32(pTrie, start, value, &errorCode);
575 } else {
576 utrie2_setRange32(pTrie, start, end, value, true, &errorCode);
577 }
578 if(U_FAILURE(errorCode)) {
579 fprintf(stderr, "error: utrie2_setRange32(properties trie %04lX..%04lX) failed - %s\n",
580 (long)start, (long)end, u_errorName(errorCode));
581 }
582 }
583
/** Maps a binary property to its single bit in the properties vectors. */
struct PropToBinary {
    int32_t prop;      // UProperty
    int32_t vecWord;   // index of the vector word that holds the bit
    int32_t vecShift;  // bit number within that word
};
588
589 static const PropToBinary
590 propToBinaries[]={
591 { UCHAR_WHITE_SPACE, 1, UPROPS_WHITE_SPACE },
592 { UCHAR_DASH, 1, UPROPS_DASH },
593 // Note: The Hyphen property is stabilized since Unicode 4.0
594 // and deprecated since Unicode 6.0.
595 { UCHAR_HYPHEN, 1, UPROPS_HYPHEN },
596 { UCHAR_QUOTATION_MARK, 1, UPROPS_QUOTATION_MARK },
597 { UCHAR_TERMINAL_PUNCTUATION, 1, UPROPS_TERMINAL_PUNCTUATION },
598 // Note: The Hex_Digit and ASCII_Hex_Digit properties are probably stable enough
599 // so that they could be hardcoded.
600 { UCHAR_HEX_DIGIT, 1, UPROPS_HEX_DIGIT },
601 { UCHAR_ASCII_HEX_DIGIT, 1, UPROPS_ASCII_HEX_DIGIT },
602 { UCHAR_IDEOGRAPHIC, 1, UPROPS_IDEOGRAPHIC },
603 { UCHAR_DIACRITIC, 1, UPROPS_DIACRITIC },
604 { UCHAR_EXTENDER, 1, UPROPS_EXTENDER },
605 // Note: The Noncharacter_Code_Point property is probably stable enough
606 // so that it could be hardcoded.
607 { UCHAR_NONCHARACTER_CODE_POINT, 1, UPROPS_NONCHARACTER_CODE_POINT },
608 // Note: The Grapheme_Link property is deprecated since Unicode 5.0
609 // because it is a "Duplication of ccc=9" (UAX #44).
610 { UCHAR_GRAPHEME_LINK, 1, UPROPS_GRAPHEME_LINK },
611 { UCHAR_IDS_BINARY_OPERATOR, 1, UPROPS_IDS_BINARY_OPERATOR },
612 { UCHAR_IDS_TRINARY_OPERATOR, 1, UPROPS_IDS_TRINARY_OPERATOR },
613 { UCHAR_RADICAL, 1, UPROPS_RADICAL },
614 { UCHAR_UNIFIED_IDEOGRAPH, 1, UPROPS_UNIFIED_IDEOGRAPH },
615 { UCHAR_DEPRECATED, 1, UPROPS_DEPRECATED },
616 { UCHAR_LOGICAL_ORDER_EXCEPTION, 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
617 { UCHAR_S_TERM, 1, UPROPS_S_TERM },
618 { UCHAR_VARIATION_SELECTOR, 1, UPROPS_VARIATION_SELECTOR },
619 // Note: Pattern_Syntax & Pattern_White_Space are available via
620 // the internal PatternProps class and need not be stored here any more.
621 { UCHAR_PATTERN_SYNTAX, 1, UPROPS_PATTERN_SYNTAX },
622 { UCHAR_PATTERN_WHITE_SPACE, 1, UPROPS_PATTERN_WHITE_SPACE },
623 { UCHAR_XID_START, 1, UPROPS_XID_START },
624 { UCHAR_XID_CONTINUE, 1, UPROPS_XID_CONTINUE },
625 { UCHAR_MATH, 1, UPROPS_MATH },
626 { UCHAR_ALPHABETIC, 1, UPROPS_ALPHABETIC },
627 { UCHAR_GRAPHEME_EXTEND, 1, UPROPS_GRAPHEME_EXTEND },
628 { UCHAR_DEFAULT_IGNORABLE_CODE_POINT, 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
629 { UCHAR_ID_START, 1, UPROPS_ID_START },
630 { UCHAR_ID_CONTINUE, 1, UPROPS_ID_CONTINUE },
631 { UCHAR_GRAPHEME_BASE, 1, UPROPS_GRAPHEME_BASE },
632
633 { UCHAR_PREPENDED_CONCATENATION_MARK, 1, UPROPS_PREPENDED_CONCATENATION_MARK },
634 };
635
/** Maps an enumerated (int) property to its bit field in the properties vectors. */
struct PropToEnum {
    int32_t prop;      // UProperty
    int32_t vecWord;   // index of the vector word that holds the field
    int32_t vecShift;  // left shift that moves a value into the field
    uint32_t vecMask;  // mask of the field, already shifted into place
};
641
642 static const PropToEnum
643 propToEnums[]={
644 { UCHAR_BLOCK, 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK },
645 { UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
646 { UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
647 { UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
648 { UCHAR_WORD_BREAK, 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
649 { UCHAR_SENTENCE_BREAK, 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK },
650 { UCHAR_LINE_BREAK, 2, UPROPS_LB_SHIFT, UPROPS_LB_MASK },
651 };
652
653 void
setProps(const UniProps & props,const UnicodeSet & newValues,UErrorCode & errorCode)654 CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
655 UErrorCode &errorCode) {
656 setGcAndNumeric(props, newValues, errorCode);
657 if(U_FAILURE(errorCode)) { return; }
658
659 UChar32 start=props.start;
660 UChar32 end=props.end;
661 if(start==0 && end==0x10ffff) {
662 // Also set bits for initialValue and errorValue.
663 end=UPVEC_MAX_CP;
664 }
665
666 if(newValues.containsSome(0, UCHAR_BINARY_LIMIT-1)) {
667 for(int32_t i=0; i<LENGTHOF(propToBinaries); ++i) {
668 const PropToBinary &p2b=propToBinaries[i];
669 U_ASSERT(p2b.vecShift<32);
670 if(newValues.contains(p2b.prop)) {
671 uint32_t mask=U_MASK(p2b.vecShift);
672 uint32_t value= props.binProps[p2b.prop] ? mask : 0;
673 upvec_setValue(pv, start, end, p2b.vecWord, value, mask, &errorCode);
674 }
675 }
676 }
677
678 // Set int property values.
679 if(newValues.containsSome(UCHAR_INT_START, UCHAR_INT_LIMIT-1)) {
680 for(int32_t i=0; i<LENGTHOF(propToEnums); ++i) {
681 const PropToEnum &p2e=propToEnums[i];
682 U_ASSERT(p2e.vecShift<32);
683 if(newValues.contains(p2e.prop)) {
684 uint32_t mask=p2e.vecMask;
685 uint32_t value=(uint32_t)(props.getIntProp(p2e.prop)<<p2e.vecShift);
686 U_ASSERT((value&mask)==value);
687 upvec_setValue(pv, start, end, p2e.vecWord, value, mask, &errorCode);
688 }
689 }
690 }
691 if(newValues.contains(UCHAR_AGE)) {
692 if(props.age[0]>15 || props.age[1]>15 || props.age[2]!=0 || props.age[3]!=0) {
693 char buffer[U_MAX_VERSION_STRING_LENGTH];
694 u_versionToString(props.age, buffer);
695 fprintf(stderr, "genprops error: age %s cannot be encoded\n", buffer);
696 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
697 return;
698 }
699 uint32_t version=(props.age[0]<<4)|props.age[1];
700 upvec_setValue(pv, start, end,
701 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK,
702 &errorCode);
703 }
704
705 // Set the script value if the Script_Extensions revert to {Script}.
706 // Otherwise we would have to duplicate the code for doing so.
707 // Script and Script_Extensions share a bit field, so that by setting it to just the script
708 // we remove the Script_Extensions.
709 // (We do not just set the script bit in newValues because that is const.)
710 // For example, for U+3000:
711 // block;3000..303F;age=1.1;...;sc=Zyyy;scx=Bopo Hang Hani Hira Kana Yiii;vo=U
712 // cp;3000;...;gc=Zs;lb=BA;na=IDEOGRAPHIC SPACE;...;SB=SP;scx=<script>;WSpace
713 UBool revertToScript=
714 newValues.contains(UCHAR_SCRIPT_EXTENSIONS) && props.scx.isEmpty() &&
715 !newValues.contains(UCHAR_SCRIPT);
716 if(newValues.contains(UCHAR_SCRIPT) || revertToScript) {
717 int32_t script=props.getIntProp(UCHAR_SCRIPT);
718 uint32_t value=splitScriptCodeOrIndex(script);
719 // Use UPROPS_SCRIPT_X_MASK:
720 // When writing a Script code, remove Script_Extensions bits as well.
721 // If needed, they will get written again.
722 upvec_setValue(pv, start, end, 0, value, UPROPS_SCRIPT_X_MASK, &errorCode);
723 }
724 // Write a new (Script, Script_Extensions) value if there are Script_Extensions
725 // and either Script or Script_Extensions are new on the current line.
726 // (If only Script is new, then it just clobbered the relevant bits.)
727 if( !props.scx.isEmpty() &&
728 (newValues.contains(UCHAR_SCRIPT) || newValues.contains(UCHAR_SCRIPT_EXTENSIONS))
729 ) {
730 UnicodeString codes; // vector of 16-bit UScriptCode values
731 UnicodeSetIterator iter(props.scx);
732 while(iter.next()) { codes.append((UChar)iter.getCodepoint()); }
733
734 // Set bit 15 on the last script code, for termination.
735 int32_t length=codes.length();
736 codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000));
737 // Find this list of codes in the Script_Extensions data so far, or add this list.
738 int32_t index=scriptExtensions.indexOf(codes);
739 if(index<0) {
740 index=scriptExtensions.length();
741 scriptExtensions.append(codes);
742 }
743
744 // Encode the (Script, Script_Extensions index) pair.
745 int32_t script=props.getIntProp(UCHAR_SCRIPT);
746 uint32_t scriptX;
747 if(script==USCRIPT_COMMON) {
748 scriptX=UPROPS_SCRIPT_X_WITH_COMMON;
749 } else if(script==USCRIPT_INHERITED) {
750 scriptX=UPROPS_SCRIPT_X_WITH_INHERITED;
751 } else {
752 // Store an additional pair of 16-bit units for an unusual main Script code
753 // together with the Script_Extensions index.
754 UnicodeString codeIndexPair;
755 codeIndexPair.append((UChar)script).append((UChar)index);
756 index=scriptExtensions.indexOf(codeIndexPair);
757 if(index<0) {
758 index=scriptExtensions.length();
759 scriptExtensions.append(codeIndexPair);
760 }
761 scriptX=UPROPS_SCRIPT_X_WITH_OTHER;
762 }
763 if(index>UPROPS_MAX_SCRIPT) {
764 fprintf(stderr, "genprops: Script_Extensions indexes overflow bit fields\n");
765 errorCode=U_BUFFER_OVERFLOW_ERROR;
766 return;
767 }
768 scriptX|=splitScriptCodeOrIndex(index);
769 upvec_setValue(pv, start, end, 0, scriptX, UPROPS_SCRIPT_X_MASK, &errorCode);
770 }
771 if(U_FAILURE(errorCode)) {
772 fprintf(stderr, "genprops error: unable to set props2 values for %04lX..%04lX: %s\n",
773 (long)start, (long)end, u_errorName(errorCode));
774 }
775 }
776
777 static int32_t indexes[UPROPS_INDEX_COUNT]={
778 0, 0, 0, 0,
779 0, 0, 0, 0,
780 0, 0, 0, 0,
781 0, 0, 0, 0
782 };
783
/* Serialized main properties trie and its length in bytes; filled by build(). */
static uint8_t trieBlock[100000];
static int32_t trieSize;

/* Serialized additional-properties trie and its length in bytes; filled by build(). */
static uint8_t props2TrieBlock[100000];
static int32_t props2TrieSize;

/* Size of the data in bytes (4 times the data top index); set by build(). */
static int32_t totalSize;
790
791 void
build(UErrorCode & errorCode)792 CorePropsBuilder::build(UErrorCode &errorCode) {
793 if(U_FAILURE(errorCode)) { return; }
794
795 utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
796 if(U_FAILURE(errorCode)) {
797 fprintf(stderr,
798 "genprops/core error: utrie2_freeze(main trie) failed: %s\n",
799 u_errorName(errorCode));
800 return;
801 }
802 trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
803 if(U_FAILURE(errorCode)) {
804 fprintf(stderr,
805 "genprops/core error: utrie2_serialize(main trie) failed: %s (length %ld)\n",
806 u_errorName(errorCode), (long)trieSize);
807 return;
808 }
809
810 props2Trie=upvec_compactToUTrie2WithRowIndexes(pv, &errorCode);
811 if(U_FAILURE(errorCode)) {
812 fprintf(stderr, "genprops/core error: unable to build trie for additional properties: %s\n",
813 u_errorName(errorCode));
814 return;
815 }
816
817 props2TrieSize=utrie2_serialize(props2Trie,
818 props2TrieBlock, (int32_t)sizeof(props2TrieBlock),
819 &errorCode);
820 if(U_FAILURE(errorCode)) {
821 fprintf(stderr,
822 "genprops/core error: utrie2_freeze(additional properties)+utrie2_serialize() "
823 "failed: %s\n",
824 u_errorName(errorCode));
825 return;
826 }
827
828 int32_t pvRows;
829 upvec_getArray(pv, &pvRows, NULL);
830 int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
831
832 /* round up scriptExtensions to multiple of 4 bytes */
833 if(scriptExtensions.length()&1) {
834 scriptExtensions.append((UChar)0);
835 }
836
837 /* set indexes */
838 int32_t offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
839 offset+=trieSize>>2;
840 indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */
841 indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */
842 indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */
843 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
844
845 offset+=props2TrieSize/4;
846 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=offset;
847 indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
848 offset+=pvCount;
849 indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=offset;
850 offset+=scriptExtensions.length()/2;
851 indexes[UPROPS_RESERVED_INDEX_7]=offset;
852 indexes[UPROPS_RESERVED_INDEX_8]=offset;
853 indexes[UPROPS_DATA_TOP_INDEX]=offset;
854 totalSize=4*offset;
855
856 indexes[UPROPS_MAX_VALUES_INDEX]=
857 (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
858 (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
859 (int32_t)splitScriptCodeOrIndex(USCRIPT_CODE_LIMIT-1);
860 indexes[UPROPS_MAX_VALUES_2_INDEX]=
861 (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
862 (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
863 (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
864 (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
865 ((int32_t)U_DT_COUNT-1);
866
867 if(!beQuiet) {
868 puts("* uprops.icu stats *");
869 printf("trie size in bytes: %5u\n", (int)trieSize);
870 printf("size in bytes of additional props trie:%5u\n", (int)props2TrieSize);
871 printf("number of additional props vectors: %5u\n", (int)pvRows);
872 printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
873 printf("number of 16-bit scriptExtensions: %5u\n", (int)scriptExtensions.length());
874 printf("data size: %6ld\n", (long)totalSize);
875 }
876 }
877
878 void
writeCSourceFile(const char * path,UErrorCode & errorCode)879 CorePropsBuilder::writeCSourceFile(const char *path, UErrorCode &errorCode) {
880 if(U_FAILURE(errorCode)) { return; }
881
882 int32_t pvRows;
883 const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
884 int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
885
886 FILE *f=usrc_create(path, "uchar_props_data.h", 2016,
887 "icu/tools/unicode/c/genprops/corepropsbuilder.cpp");
888 if(f==NULL) {
889 errorCode=U_FILE_ACCESS_ERROR;
890 return;
891 }
892 fputs("#ifdef INCLUDED_FROM_UCHAR_C\n\n", f);
893 usrc_writeArray(f,
894 "static const UVersionInfo dataVersion={",
895 dataInfo.dataVersion, 8, 4,
896 "",
897 "};\n\n");
898 usrc_writeUTrie2Arrays(f,
899 "static const uint16_t propsTrie_index[%ld]={\n", NULL,
900 pTrie,
901 "\n};\n\n");
902 usrc_writeUTrie2Struct(f,
903 "static const UTrie2 propsTrie={\n",
904 pTrie, "propsTrie_index", NULL,
905 "};\n\n");
906
907 usrc_writeUTrie2Arrays(f,
908 "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
909 props2Trie,
910 "\n};\n\n");
911 usrc_writeUTrie2Struct(f,
912 "static const UTrie2 propsVectorsTrie={\n",
913 props2Trie, "propsVectorsTrie_index", NULL,
914 "};\n\n");
915
916 usrc_writeArray(f,
917 "static const uint32_t propsVectors[%ld]={\n",
918 pvArray, 32, pvCount,
919 "",
920 "};\n\n");
921 fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
922 fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)UPROPS_VECTOR_WORDS);
923
924 usrc_writeArray(f,
925 "static const uint16_t scriptExtensions[%ld]={\n",
926 scriptExtensions.getBuffer(), 16, scriptExtensions.length(),
927 "",
928 "};\n\n");
929
930 usrc_writeArray(f,
931 "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
932 indexes, 32, UPROPS_INDEX_COUNT,
933 "",
934 "};\n\n");
935 fputs("#endif // INCLUDED_FROM_UCHAR_C\n", f);
936 fclose(f);
937 }
938
939 void
writeBinaryData(const char * path,UBool withCopyright,UErrorCode & errorCode)940 CorePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
941 if(U_FAILURE(errorCode)) { return; }
942
943 int32_t pvRows;
944 const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
945 int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
946
947 UNewDataMemory *pData=udata_create(path, "icu", "uprops", &dataInfo,
948 withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
949 if(U_FAILURE(errorCode)) {
950 fprintf(stderr, "genprops: udata_create(%s, uprops.icu) failed - %s\n",
951 path, u_errorName(errorCode));
952 return;
953 }
954
955 udata_writeBlock(pData, indexes, sizeof(indexes));
956 udata_writeBlock(pData, trieBlock, trieSize);
957 udata_writeBlock(pData, props2TrieBlock, props2TrieSize);
958 udata_writeBlock(pData, pvArray, pvCount*4);
959 udata_writeBlock(pData, scriptExtensions.getBuffer(), scriptExtensions.length()*2);
960
961 long dataLength=udata_finish(pData, &errorCode);
962 if(U_FAILURE(errorCode)) {
963 fprintf(stderr, "genprops: error %s writing the output file\n", u_errorName(errorCode));
964 return;
965 }
966
967 if(dataLength!=(long)totalSize) {
968 fprintf(stderr,
969 "udata_finish(uprops.icu) reports %ld bytes written but should be %ld\n",
970 dataLength, (long)totalSize);
971 errorCode=U_INTERNAL_PROGRAM_ERROR;
972 }
973 }
974
975 PropsBuilder *
createCorePropsBuilder(UErrorCode & errorCode)976 createCorePropsBuilder(UErrorCode &errorCode) {
977 if(U_FAILURE(errorCode)) { return NULL; }
978 PropsBuilder *pb=new CorePropsBuilder(errorCode);
979 if(pb==NULL) {
980 errorCode=U_MEMORY_ALLOCATION_ERROR;
981 }
982 return pb;
983 }
984
985 /*
986 * Hey, Emacs, please set the following:
987 *
988 * Local Variables:
989 * indent-tabs-mode: nil
990 * End:
991 *
992 */
993