1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1999-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: corepropsbuilder.cpp (was store.c & props2.cpp)
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 1999dec11
16 * created by: Markus W. Scherer
17 *
18 * Store Unicode character properties efficiently for
19 * random access.
20 */
21
22 #include <stdio.h>
23 #include "unicode/utypes.h"
24 #include "unicode/uchar.h"
25 #include "unicode/udata.h"
26 #include "unicode/uniset.h"
27 #include "unicode/unistr.h"
28 #include "unicode/usetiter.h"
29 #include "unicode/uscript.h"
30 #include "cmemory.h"
31 #include "cstring.h"
32 #include "genprops.h"
33 #include "propsvec.h"
34 #include "uassert.h"
35 #include "unewdata.h"
36 #include "uprops.h"
37 #include "utrie2.h"
38 #include "writesrc.h"
39
40 /* Unicode character properties file format ------------------------------------
41
42 The file format prepared and written here contains several data
43 structures that store indexes or data.
44
45 Before the data contents described below, there are the headers required by
46 the udata API for loading ICU data. Especially, a UDataInfo structure
47 precedes the actual data. It contains platform properties values and the
48 file format version.
49
50 The following is a description of format version 7.7 .
51
52 Data contents:
53
The content is a parsed, binary form of several Unicode character
database files, most prominently UnicodeData.txt.
56
57 Any Unicode code point from 0 to 0x10ffff can be looked up to get
58 the properties, if any, for that code point. This means that the input
59 to the lookup are 21-bit unsigned integers, with not all of the
60 21-bit range used.
61
62 It is assumed that client code keeps a uint32_t pointer
63 to the beginning of the data:
64
65 const uint32_t *p32;
66
67 Formally, the file contains the following structures:
68
69 const int32_t indexes[16] with values i0..i15:
70
71 i0 indicates the length of the main trie.
72 i0..i3 all have the same value in format versions 4.0 and higher;
73 the related props32[] and exceptions[] and uchars[] were used in format version 3
74
75 i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
76 i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
77 i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
78
79 i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
80 i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
81 i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
82
83 i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data
84 i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data
85 i8 reservedIndex8; -- for now: i7, i8 and i9 have the same values
86 i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header)
87
88 i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
89 i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
90 i12..i15 reservedIndexes; -- reserved values; 0 for now
91
92 PT serialized properties trie, see utrie2.h (byte size: 4*(i0-16))
93
94 P, E, and U are not used (empty) in format versions 4 and above
95
96 P const uint32_t props32[i1-i0];
97 E const uint32_t exceptions[i2-i1];
98 U const UChar uchars[2*(i3-i2)];
99
100 AT serialized trie for additional properties (byte size: 4*(i4-i3))
101 PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
102
103 SCX const uint16_t scriptExtensions[2*(i7-i6)];
104
105 SCX contains Script_Extensions lists and (Script code, Script_Extensions index) pairs.
106 A Script_Extensions list is a sequence of UScriptCode values in ascending order,
107 with the last code having bit 15 set for termination.
108 A (Script code, Script_Extensions index) pair is the main UScriptCode (Script value)
109 followed by the index of the Script_Extensions list.
110 If the propsVectors[] column 0 value indicates that there are Script_Extensions,
111 then the script-code-or-index bit fields are an index to either a list or a pair in SCX,
112 rather than the Script itself. The high bits in the UPROPS_SCRIPT_X_MASK fields
113 indicate whether the main Script value is Common or Inherited (and the index is to a list)
114 vs. another value (and the index is to a pair).
115 (See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.)
116
117 Trie lookup and properties:
118
119 In order to condense the data for the 21-bit code space, several properties of
120 the Unicode code assignment are exploited:
121 - The code space is sparse.
- There are several ranges of tens of thousands of consecutive code points with the same properties.
123 - Characters and scripts are allocated in groups of 16 code points.
124 - Inside blocks for scripts the properties are often repetitive.
125 - The 21-bit space is not fully used for Unicode.
126
The lookup of properties for a given code point is done with a trie lookup,
using the UTrie2 implementation (UTrie before format version 7).
The trie lookup result is a 16-bit properties word.

With a given Unicode code point

    UChar32 c;

and 0<=c<0x110000, the lookup is done like this:

    uint16_t props;
    props=UTRIE2_GET16(trie, c);
139
140 Each 16-bit properties word contains:
141
142 0.. 4 general category
143 5 reserved
144 6..15 numeric type and value (ntv)
145
146 Encoding of numeric type and value in the 10-bit ntv field:
147 ntv type value
148 0 U_NT_NONE 0
149 1..10 U_NT_DECIMAL 0..9
150 11..20 U_NT_DIGIT 0..9
151 21..0x3ff U_NT_NUMERIC see below
152
153 For U_NT_NUMERIC:
154 ntv value
155 21..0xaf integer 0..154
156 0xb0..0x1df fraction ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16
157 0x1e0..0x2ff large int ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
158 (only one significant decimal digit)
159 0x300..0x323 base-60 (sexagesimal) integer (new in format version 7.1)
160 ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
161 0x324..0x34b fraction-20 (new in format version 7.3)
162 frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
163 numerator: num = 2*(frac20&3)+1
164 denominator: den = 20<<(frac20>>2)
165 0x34c..0x35b fraction-32 (new in format version 7.6)
166 frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256
167 numerator: num = 2*(frac32&3)+1
168 denominator: den = 32<<(frac32>>2)
169 0x35c..0x3ff reserved
170
171 --- Additional properties (new in format version 2.1) ---
172
The second trie for additional properties (AT) is also a UTrie2 (a UTrie before format version 7) with 16-bit data.
174 The data words consist of 32-bit unit indexes (not row indexes!) into the
175 table of unique properties vectors (PV).
176 Each vector contains a set of properties.
177 The width of a vector (number of uint32_t per row) may change
178 with the formatVersion, it is stored in i5.
179
180 Current properties: see icu/source/common/uprops.h
181
182 --- Changes in format version 3.1 ---
183
184 See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
185
186 --- Changes in format version 3.2 ---
187
188 - The tries use linear Latin-1 ranges.
189 - The additional properties bits store full properties XYZ instead
190 of partial Other_XYZ, so that changes in the derivation formulas
191 need not be tracked in runtime library code.
192 - Joining Type and Line Break are also stored completely, so that uprops.c
193 needs no runtime formulas for enumerated properties either.
194 - Store the case-sensitive flag in the main properties word.
195 - i10 also contains U_LB_COUNT and U_EA_COUNT.
196 - i11 contains maxValues2 for vector word 2.
197
198 --- Changes in format version 4 ---
199
200 The format changes between version 3 and 4 because the properties related to
201 case mappings and bidi/shaping are pulled out into separate files
202 for modularization.
203 In order to reduce the need for code changes, some of the previous data
204 structures are omitted, rather than rearranging everything.
205
206 (The change to format version 4 is for ICU 3.4. The last CVS revision of
207 genprops/store.c for format version 3.2 is 1.48.)
208
209 The main trie's data is significantly simplified:
210 - The trie's 16-bit data word is used directly instead of as an index
211 into props32[].
212 - The trie uses the default trie folding functions instead of custom ones.
213 - Numeric values are stored directly in the trie data word, with special
214 encodings.
215 - No more exception data (the data that needed it was pulled out, or, in the
216 case of numeric values, encoded differently).
217 - No more string data (pulled out - was for case mappings).
218
219 Also, some of the previously used properties vector bits are reserved again.
220
221 The indexes[] values for the omitted structures are still filled in
222 (indicating zero-length arrays) so that the swapper code remains unchanged.
223
224 --- Changes in format version 5 ---
225
226 Format version 5 became necessary because the bit field for script codes
227 overflowed. The changes are incompatible because
228 old code would have seen nonsensically low values for new, higher script codes.
229
230 Rearranged bit fields in the second trie (AT) and widened three (Script, Block,
231 Word_Break) by one bit each.
232
233 Modified bit fields in icu/source/common/uprops.h
234
235 --- Changes in format version 6 ---
236
237 Format version 6 became necessary because Unicode 5.2 adds fractions with
238 denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
239 types and values rather than add another variant to the previous format.
240
241 --- Changes in format version 7 ---
242
243 Unicode 6.0 adds Script_Extensions. For characters with script extensions data,
244 the script code bits are an index into the new Script_Extensions array rather
245 than a script code.
246
247 Change from UTrie to UTrie2.
248
249 --- Changes in format version 7.1 ---
250
251 Unicode 6.2 adds sexagesimal (base-60) numeric values:
252 cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000
253 cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000
254
255 The encoding of numeric values was extended to handle such values.
256
257 --- Changes in format version 7.2 ---
258
259 ICU 57 adds 4 Emoji properties to vector word 2.
260 http://bugs.icu-project.org/trac/ticket/11802
261 http://www.unicode.org/reports/tr51/#Emoji_Properties
262
263 --- Changes in format version 7.3 ---
264
265 ICU 58 adds fraction-20 numeric values for new Unicode 9 Malayalam fraction characters.
266
267 --- Changes in format version 7.4 ---
268
269 ICU 60 adds the Prepended_Concatenation_Mark property to vector word 1.
270
271 ICU 60 adds the Emoji_Component property to vector word 2, for emoji 5.
272 http://bugs.icu-project.org/trac/ticket/13062
273 http://www.unicode.org/reports/tr51/#Emoji_Properties
274
275 --- Changes in format version 7.5 ---
276
277 ICU 62 adds the Extended_Pictographic property to vector word 2, for emoji 11.
278 http://www.unicode.org/reports/tr51/#Emoji_Properties
279
280 --- Changes in format version 7.6 ---
281
282 ICU 64 adds fraction-32 numeric values for new Unicode 12 Tamil fraction characters.
283
284 --- Changes in format version 7.7 ---
285
286 ICU 66 adds two bits for the UScriptCode or Script_Extensions index in vector word 0.
287 The value is split across bits 21..20 & 7..0.
288
289 ----------------------------------------------------------------------------- */
290
291 U_NAMESPACE_USE
292
293 /* UDataInfo cf. udata.h */
294 static UDataInfo dataInfo={
295 sizeof(UDataInfo),
296 0,
297
298 U_IS_BIG_ENDIAN,
299 U_CHARSET_FAMILY,
300 U_SIZEOF_UCHAR,
301 0,
302
303 { 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
304 { 7, 7, 0, 0 }, /* formatVersion */
305 { 10, 0, 0, 0 } /* dataVersion */
306 };
307
splitScriptCodeOrIndex(uint32_t v)308 inline uint32_t splitScriptCodeOrIndex(uint32_t v) {
309 return
310 ((v << UPROPS_SCRIPT_HIGH_SHIFT) & UPROPS_SCRIPT_HIGH_MASK) |
311 (v & UPROPS_SCRIPT_LOW_MASK);
312 }
313
314 class CorePropsBuilder : public PropsBuilder {
315 public:
316 CorePropsBuilder(UErrorCode &errorCode);
317 virtual ~CorePropsBuilder();
318
319 virtual void setUnicodeVersion(const UVersionInfo version);
320 virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
321 virtual void build(UErrorCode &errorCode);
322 virtual void writeCSourceFile(const char *path, UErrorCode &errorCode);
323 virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);
324
325 private:
326 void setGcAndNumeric(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
327
328 UTrie2 *pTrie;
329 UTrie2 *props2Trie;
330 UPropsVectors *pv;
331 UnicodeString scriptExtensions;
332 };
333
CorePropsBuilder(UErrorCode & errorCode)334 CorePropsBuilder::CorePropsBuilder(UErrorCode &errorCode)
335 : pTrie(NULL), props2Trie(NULL), pv(NULL) {
336 pTrie=utrie2_open(0, 0, &errorCode);
337 if(U_FAILURE(errorCode)) {
338 fprintf(stderr, "genprops error: corepropsbuilder utrie2_open() failed - %s\n",
339 u_errorName(errorCode));
340 }
341 pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
342 if(U_FAILURE(errorCode)) {
343 fprintf(stderr, "genprops error: corepropsbuilder upvec_open() failed - %s\n",
344 u_errorName(errorCode));
345 }
346 }
347
~CorePropsBuilder()348 CorePropsBuilder::~CorePropsBuilder() {
349 utrie2_close(pTrie);
350 utrie2_close(props2Trie);
351 upvec_close(pv);
352 }
353
354 void
setUnicodeVersion(const UVersionInfo version)355 CorePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
356 uprv_memcpy(dataInfo.dataVersion, version, 4);
357 }
358
encodeFractional20(int32_t value,int32_t den)359 static int32_t encodeFractional20(int32_t value, int32_t den) {
360 if(den<20 || 640<den) { return -1; }
361 int32_t frac20;
362 if(value==1 || value==3 || value==5 || value==7) {
363 frac20=value/2;
364 } else {
365 return -1;
366 }
367 // Denominator: 20 times which power of 2: 0..5 into bits 4..2
368 do {
369 if(den==20) {
370 return UPROPS_NTV_FRACTION20_START+frac20;
371 }
372 if(den&1) {
373 return -1; // odd denominator, and we would lose the low bit in den/=2
374 }
375 den/=2;
376 frac20+=4;
377 } while(den>=20);
378 return -1;
379 }
380
encodeFractional32(int32_t value,int32_t den)381 static int32_t encodeFractional32(int32_t value, int32_t den) {
382 if(den<32 || 256<den) { return -1; }
383 int32_t frac32;
384 if(value==1 || value==3 || value==5 || value==7) {
385 frac32=value/2;
386 } else {
387 return -1;
388 }
389 // Denominator: 32 times which power of 2: 0..3 into bits 3..2
390 do {
391 if(den==32) {
392 return UPROPS_NTV_FRACTION32_START+frac32;
393 }
394 if(den&1) {
395 return -1; // odd denominator, and we would lose the low bit in den/=2
396 }
397 den/=2;
398 frac32+=4;
399 } while(den>=32);
400 return -1;
401 }
402
403 // For nt=U_NT_NUMERIC.
404 static int32_t
encodeNumericValue(UChar32 start,const char * s,UErrorCode & errorCode)405 encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
406 const char *original=s;
407 /* get a possible minus sign */
408 UBool isNegative;
409 if(*s=='-') {
410 isNegative=TRUE;
411 ++s;
412 } else {
413 isNegative=FALSE;
414 }
415
416 int32_t value=0, den=0, exp=0, ntv=0;
417 char *numberLimit;
418 /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
419 if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
420 value=s[0]-'0';
421 numberLimit=const_cast<char *>(s);
422 while(*(++numberLimit)=='0') {
423 ++exp;
424 }
425 } else {
426 /* normal number parsing */
427 unsigned long ul=uprv_strtoul(s, &numberLimit, 10);
428 if(s==numberLimit || (*numberLimit!=0 && *numberLimit!='/') || ul>0x7fffffff) {
429 ntv=-1;
430 } else {
431 value=(int32_t)ul;
432 }
433 if(ntv>=0 && *numberLimit=='/') {
434 /* fractional value, get the denominator */
435 s=numberLimit+1;
436 ul=uprv_strtoul(s, &numberLimit, 10);
437 if(s==numberLimit || *numberLimit!=0 || ul==0 || ul>0x7fffffff) {
438 ntv=-1;
439 } else {
440 den=(int32_t)ul;
441 }
442 }
443 }
444 if(isNegative) {
445 value=-(int32_t)value;
446 }
447
448 if(ntv<0) {
449 // pass
450 } else if(den==0 && value>=0) {
451 if(exp==2 && (value*100)<=UPROPS_NTV_MAX_SMALL_INT) {
452 /* small integer parsed like a large one */
453 ntv=UPROPS_NTV_NUMERIC_START+value*100;
454 } else if(exp==0) {
455 if(value<=UPROPS_NTV_MAX_SMALL_INT) {
456 /* small integer */
457 ntv=UPROPS_NTV_NUMERIC_START+value;
458 } else {
459 /* large integer parsed like a small one */
460 /* split the value into mantissa and exponent, base 10 */
461 int32_t mant=value;
462 while((mant%10)==0) {
463 mant/=10;
464 ++exp;
465 }
466 // Note: value<=0x7fffffff guarantees exp<=33
467 if(mant<=9) {
468 ntv=((mant+14)<<5)+(exp-2);
469 } else {
470 // Try sexagesimal (base 60) numbers.
471 mant=value;
472 exp=0;
473 while((mant%60)==0) {
474 mant/=60;
475 ++exp;
476 }
477 if(mant<=9 && exp<=4) {
478 ntv=((mant+0xbf)<<2)+(exp-1);
479 } else {
480 ntv=-1;
481 }
482 }
483 }
484 } else if(2<=exp && exp<=33 && 1<=value && value<=9) {
485 /* large, single-significant-digit integer */
486 ntv=((value+14)<<5)+(exp-2);
487 } else {
488 ntv=-1;
489 }
490 } else if(exp==0 && -1<=value && value<=17 && 1<=den && den<=16) {
491 /* fraction */
492 ntv=((value+12)<<4)+(den-1);
493 } else if(exp==0 && value==-1 && den==0) {
494 /* -1 parsed with den=0, encoded as pseudo-fraction -1/1 */
495 ntv=((value+12)<<4);
496 } else if(exp==0 && (ntv=encodeFractional20(value, den))>=0) {
497 // fits into fractional-20 format
498 } else if(exp==0 && (ntv=encodeFractional32(value, den))>=0) {
499 // fits into fractional-32 format
500 } else {
501 ntv=-1;
502 }
503 if(ntv<0 || *numberLimit!=0) {
504 fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", original);
505 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
506 }
507 return ntv;
508 }
509
510 void
setGcAndNumeric(const UniProps & props,const UnicodeSet & newValues,UErrorCode & errorCode)511 CorePropsBuilder::setGcAndNumeric(const UniProps &props, const UnicodeSet &newValues,
512 UErrorCode &errorCode) {
513 if(U_FAILURE(errorCode)) { return; }
514 UChar32 start=props.start;
515 UChar32 end=props.end;
516
517 int32_t type=props.getIntProp(UCHAR_NUMERIC_TYPE);
518 const char *nvString=props.numericValue;
519 if(type!=U_NT_NONE && nvString==NULL && start==end) {
520 fprintf(stderr, "genprops error: cp line has Numeric_Type but no Numeric_Value\n");
521 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
522 return;
523 }
524
525 if(!newValues.contains(UCHAR_GENERAL_CATEGORY) && !newValues.contains(UCHAR_NUMERIC_VALUE)) {
526 return;
527 }
528
529 int32_t ntv=UPROPS_NTV_NONE; // numeric type & value
530 if(nvString!=NULL && uprv_strcmp(nvString, "NaN")!=0) {
531 int32_t digitValue=props.digitValue;
532 if( type<=U_NT_NONE || U_NT_NUMERIC<type ||
533 ((type==U_NT_DECIMAL || type==U_NT_DIGIT) && digitValue<0)
534 ) {
535 fprintf(stderr, "genprops error: nt=%d but nv=%s\n",
536 (int)type, nvString==NULL ? "NULL" : nvString);
537 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
538 return;
539 }
540
541 switch(type) {
542 case U_NT_NONE:
543 ntv=UPROPS_NTV_NONE;
544 break;
545 case U_NT_DECIMAL:
546 ntv=UPROPS_NTV_DECIMAL_START+digitValue;
547 break;
548 case U_NT_DIGIT:
549 ntv=UPROPS_NTV_DIGIT_START+digitValue;
550 break;
551 case U_NT_NUMERIC:
552 if(digitValue>=0) {
553 ntv=UPROPS_NTV_NUMERIC_START+digitValue;
554 } else {
555 ntv=encodeNumericValue(start, nvString, errorCode);
556 if(U_FAILURE(errorCode)) {
557 return;
558 }
559 }
560 default:
561 break; // unreachable
562 }
563 }
564
565 uint32_t value=
566 (uint32_t)props.getIntProp(UCHAR_GENERAL_CATEGORY) |
567 (ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
568 if(start==end) {
569 utrie2_set32(pTrie, start, value, &errorCode);
570 } else {
571 utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
572 }
573 if(U_FAILURE(errorCode)) {
574 fprintf(stderr, "error: utrie2_setRange32(properties trie %04lX..%04lX) failed - %s\n",
575 (long)start, (long)end, u_errorName(errorCode));
576 }
577 }
578
579 struct PropToBinary {
580 int32_t prop; // UProperty
581 int32_t vecWord, vecShift;
582 };
583
584 static const PropToBinary
585 propToBinaries[]={
586 { UCHAR_WHITE_SPACE, 1, UPROPS_WHITE_SPACE },
587 { UCHAR_DASH, 1, UPROPS_DASH },
588 // Note: The Hyphen property is stabilized since Unicode 4.0
589 // and deprecated since Unicode 6.0.
590 { UCHAR_HYPHEN, 1, UPROPS_HYPHEN },
591 { UCHAR_QUOTATION_MARK, 1, UPROPS_QUOTATION_MARK },
592 { UCHAR_TERMINAL_PUNCTUATION, 1, UPROPS_TERMINAL_PUNCTUATION },
593 // Note: The Hex_Digit and ASCII_Hex_Digit properties are probably stable enough
594 // so that they could be hardcoded.
595 { UCHAR_HEX_DIGIT, 1, UPROPS_HEX_DIGIT },
596 { UCHAR_ASCII_HEX_DIGIT, 1, UPROPS_ASCII_HEX_DIGIT },
597 { UCHAR_IDEOGRAPHIC, 1, UPROPS_IDEOGRAPHIC },
598 { UCHAR_DIACRITIC, 1, UPROPS_DIACRITIC },
599 { UCHAR_EXTENDER, 1, UPROPS_EXTENDER },
600 // Note: The Noncharacter_Code_Point property is probably stable enough
601 // so that it could be hardcoded.
602 { UCHAR_NONCHARACTER_CODE_POINT, 1, UPROPS_NONCHARACTER_CODE_POINT },
603 // Note: The Grapheme_Link property is deprecated since Unicode 5.0
604 // because it is a "Duplication of ccc=9" (UAX #44).
605 { UCHAR_GRAPHEME_LINK, 1, UPROPS_GRAPHEME_LINK },
606 { UCHAR_IDS_BINARY_OPERATOR, 1, UPROPS_IDS_BINARY_OPERATOR },
607 { UCHAR_IDS_TRINARY_OPERATOR, 1, UPROPS_IDS_TRINARY_OPERATOR },
608 { UCHAR_RADICAL, 1, UPROPS_RADICAL },
609 { UCHAR_UNIFIED_IDEOGRAPH, 1, UPROPS_UNIFIED_IDEOGRAPH },
610 { UCHAR_DEPRECATED, 1, UPROPS_DEPRECATED },
611 { UCHAR_LOGICAL_ORDER_EXCEPTION, 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
612 { UCHAR_S_TERM, 1, UPROPS_S_TERM },
613 { UCHAR_VARIATION_SELECTOR, 1, UPROPS_VARIATION_SELECTOR },
614 // Note: Pattern_Syntax & Pattern_White_Space are available via
615 // the internal PatternProps class and need not be stored here any more.
616 { UCHAR_PATTERN_SYNTAX, 1, UPROPS_PATTERN_SYNTAX },
617 { UCHAR_PATTERN_WHITE_SPACE, 1, UPROPS_PATTERN_WHITE_SPACE },
618 { UCHAR_XID_START, 1, UPROPS_XID_START },
619 { UCHAR_XID_CONTINUE, 1, UPROPS_XID_CONTINUE },
620 { UCHAR_MATH, 1, UPROPS_MATH },
621 { UCHAR_ALPHABETIC, 1, UPROPS_ALPHABETIC },
622 { UCHAR_GRAPHEME_EXTEND, 1, UPROPS_GRAPHEME_EXTEND },
623 { UCHAR_DEFAULT_IGNORABLE_CODE_POINT, 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
624 { UCHAR_ID_START, 1, UPROPS_ID_START },
625 { UCHAR_ID_CONTINUE, 1, UPROPS_ID_CONTINUE },
626 { UCHAR_GRAPHEME_BASE, 1, UPROPS_GRAPHEME_BASE },
627
628 { UCHAR_EMOJI, 2, UPROPS_2_EMOJI },
629 { UCHAR_EMOJI_PRESENTATION, 2, UPROPS_2_EMOJI_PRESENTATION },
630 { UCHAR_EMOJI_MODIFIER, 2, UPROPS_2_EMOJI_MODIFIER },
631 { UCHAR_EMOJI_MODIFIER_BASE, 2, UPROPS_2_EMOJI_MODIFIER_BASE },
632 { UCHAR_EMOJI_COMPONENT, 2, UPROPS_2_EMOJI_COMPONENT },
633 { UCHAR_PREPENDED_CONCATENATION_MARK, 1, UPROPS_PREPENDED_CONCATENATION_MARK },
634 { UCHAR_EXTENDED_PICTOGRAPHIC, 2, UPROPS_2_EXTENDED_PICTOGRAPHIC },
635 };
636
637 struct PropToEnum {
638 int32_t prop; // UProperty
639 int32_t vecWord, vecShift;
640 uint32_t vecMask;
641 };
642
643 static const PropToEnum
644 propToEnums[]={
645 { UCHAR_BLOCK, 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK },
646 { UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
647 { UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
648 { UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
649 { UCHAR_WORD_BREAK, 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
650 { UCHAR_SENTENCE_BREAK, 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK },
651 { UCHAR_LINE_BREAK, 2, UPROPS_LB_SHIFT, UPROPS_LB_MASK },
652 };
653
654 void
setProps(const UniProps & props,const UnicodeSet & newValues,UErrorCode & errorCode)655 CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
656 UErrorCode &errorCode) {
657 setGcAndNumeric(props, newValues, errorCode);
658 if(U_FAILURE(errorCode)) { return; }
659
660 UChar32 start=props.start;
661 UChar32 end=props.end;
662 if(start==0 && end==0x10ffff) {
663 // Also set bits for initialValue and errorValue.
664 end=UPVEC_MAX_CP;
665 }
666
667 if(newValues.containsSome(0, UCHAR_BINARY_LIMIT-1)) {
668 for(int32_t i=0; i<LENGTHOF(propToBinaries); ++i) {
669 const PropToBinary &p2b=propToBinaries[i];
670 U_ASSERT(p2b.vecShift<32);
671 if(newValues.contains(p2b.prop)) {
672 uint32_t mask=U_MASK(p2b.vecShift);
673 uint32_t value= props.binProps[p2b.prop] ? mask : 0;
674 upvec_setValue(pv, start, end, p2b.vecWord, value, mask, &errorCode);
675 }
676 }
677 }
678
679 // Set int property values.
680 if(newValues.containsSome(UCHAR_INT_START, UCHAR_INT_LIMIT-1)) {
681 for(int32_t i=0; i<LENGTHOF(propToEnums); ++i) {
682 const PropToEnum &p2e=propToEnums[i];
683 U_ASSERT(p2e.vecShift<32);
684 if(newValues.contains(p2e.prop)) {
685 uint32_t mask=p2e.vecMask;
686 uint32_t value=(uint32_t)(props.getIntProp(p2e.prop)<<p2e.vecShift);
687 U_ASSERT((value&mask)==value);
688 upvec_setValue(pv, start, end, p2e.vecWord, value, mask, &errorCode);
689 }
690 }
691 }
692 if(newValues.contains(UCHAR_AGE)) {
693 if(props.age[0]>15 || props.age[1]>15 || props.age[2]!=0 || props.age[3]!=0) {
694 char buffer[U_MAX_VERSION_STRING_LENGTH];
695 u_versionToString(props.age, buffer);
696 fprintf(stderr, "genprops error: age %s cannot be encoded\n", buffer);
697 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
698 return;
699 }
700 uint32_t version=(props.age[0]<<4)|props.age[1];
701 upvec_setValue(pv, start, end,
702 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK,
703 &errorCode);
704 }
705
706 // Set the script value if the Script_Extensions revert to {Script}.
707 // Otherwise we would have to duplicate the code for doing so.
708 // Script and Script_Extensions share a bit field, so that by setting it to just the script
709 // we remove the Script_Extensions.
710 // (We do not just set the script bit in newValues because that is const.)
711 // For example, for U+3000:
712 // block;3000..303F;age=1.1;...;sc=Zyyy;scx=Bopo Hang Hani Hira Kana Yiii;vo=U
713 // cp;3000;...;gc=Zs;lb=BA;na=IDEOGRAPHIC SPACE;...;SB=SP;scx=<script>;WSpace
714 UBool revertToScript=
715 newValues.contains(UCHAR_SCRIPT_EXTENSIONS) && props.scx.isEmpty() &&
716 !newValues.contains(UCHAR_SCRIPT);
717 if(newValues.contains(UCHAR_SCRIPT) || revertToScript) {
718 int32_t script=props.getIntProp(UCHAR_SCRIPT);
719 uint32_t value=splitScriptCodeOrIndex(script);
720 // Use UPROPS_SCRIPT_X_MASK:
721 // When writing a Script code, remove Script_Extensions bits as well.
722 // If needed, they will get written again.
723 upvec_setValue(pv, start, end, 0, value, UPROPS_SCRIPT_X_MASK, &errorCode);
724 }
725 // Write a new (Script, Script_Extensions) value if there are Script_Extensions
726 // and either Script or Script_Extensions are new on the current line.
727 // (If only Script is new, then it just clobbered the relevant bits.)
728 if( !props.scx.isEmpty() &&
729 (newValues.contains(UCHAR_SCRIPT) || newValues.contains(UCHAR_SCRIPT_EXTENSIONS))
730 ) {
731 UnicodeString codes; // vector of 16-bit UScriptCode values
732 UnicodeSetIterator iter(props.scx);
733 while(iter.next()) { codes.append((UChar)iter.getCodepoint()); }
734
735 // Set bit 15 on the last script code, for termination.
736 int32_t length=codes.length();
737 codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000));
738 // Find this list of codes in the Script_Extensions data so far, or add this list.
739 int32_t index=scriptExtensions.indexOf(codes);
740 if(index<0) {
741 index=scriptExtensions.length();
742 scriptExtensions.append(codes);
743 }
744
745 // Encode the (Script, Script_Extensions index) pair.
746 int32_t script=props.getIntProp(UCHAR_SCRIPT);
747 uint32_t scriptX;
748 if(script==USCRIPT_COMMON) {
749 scriptX=UPROPS_SCRIPT_X_WITH_COMMON;
750 } else if(script==USCRIPT_INHERITED) {
751 scriptX=UPROPS_SCRIPT_X_WITH_INHERITED;
752 } else {
753 // Store an additional pair of 16-bit units for an unusual main Script code
754 // together with the Script_Extensions index.
755 UnicodeString codeIndexPair;
756 codeIndexPair.append((UChar)script).append((UChar)index);
757 index=scriptExtensions.indexOf(codeIndexPair);
758 if(index<0) {
759 index=scriptExtensions.length();
760 scriptExtensions.append(codeIndexPair);
761 }
762 scriptX=UPROPS_SCRIPT_X_WITH_OTHER;
763 }
764 if(index>UPROPS_MAX_SCRIPT) {
765 fprintf(stderr, "genprops: Script_Extensions indexes overflow bit fields\n");
766 errorCode=U_BUFFER_OVERFLOW_ERROR;
767 return;
768 }
769 scriptX|=splitScriptCodeOrIndex(index);
770 upvec_setValue(pv, start, end, 0, scriptX, UPROPS_SCRIPT_X_MASK, &errorCode);
771 }
772 if(U_FAILURE(errorCode)) {
773 fprintf(stderr, "genprops error: unable to set props2 values for %04lX..%04lX: %s\n",
774 (long)start, (long)end, u_errorName(errorCode));
775 }
776 }
777
778 static int32_t indexes[UPROPS_INDEX_COUNT]={
779 0, 0, 0, 0,
780 0, 0, 0, 0,
781 0, 0, 0, 0,
782 0, 0, 0, 0
783 };
784
785 static uint8_t trieBlock[100000];
786 static int32_t trieSize;
787 static uint8_t props2TrieBlock[100000];
788 static int32_t props2TrieSize;
789
790 static int32_t totalSize;
791
792 void
build(UErrorCode & errorCode)793 CorePropsBuilder::build(UErrorCode &errorCode) {
794 if(U_FAILURE(errorCode)) { return; }
795
796 utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
797 if(U_FAILURE(errorCode)) {
798 fprintf(stderr,
799 "genprops/core error: utrie2_freeze(main trie) failed: %s\n",
800 u_errorName(errorCode));
801 return;
802 }
803 trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
804 if(U_FAILURE(errorCode)) {
805 fprintf(stderr,
806 "genprops/core error: utrie2_serialize(main trie) failed: %s (length %ld)\n",
807 u_errorName(errorCode), (long)trieSize);
808 return;
809 }
810
811 props2Trie=upvec_compactToUTrie2WithRowIndexes(pv, &errorCode);
812 if(U_FAILURE(errorCode)) {
813 fprintf(stderr, "genprops/core error: unable to build trie for additional properties: %s\n",
814 u_errorName(errorCode));
815 return;
816 }
817
818 props2TrieSize=utrie2_serialize(props2Trie,
819 props2TrieBlock, (int32_t)sizeof(props2TrieBlock),
820 &errorCode);
821 if(U_FAILURE(errorCode)) {
822 fprintf(stderr,
823 "genprops/core error: utrie2_freeze(additional properties)+utrie2_serialize() "
824 "failed: %s\n",
825 u_errorName(errorCode));
826 return;
827 }
828
829 int32_t pvRows;
830 const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
831 int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
832
833 /* round up scriptExtensions to multiple of 4 bytes */
834 if(scriptExtensions.length()&1) {
835 scriptExtensions.append((UChar)0);
836 }
837
838 /* set indexes */
839 int32_t offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
840 offset+=trieSize>>2;
841 indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */
842 indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */
843 indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */
844 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
845
846 offset+=props2TrieSize/4;
847 indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=offset;
848 indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
849 offset+=pvCount;
850 indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=offset;
851 offset+=scriptExtensions.length()/2;
852 indexes[UPROPS_RESERVED_INDEX_7]=offset;
853 indexes[UPROPS_RESERVED_INDEX_8]=offset;
854 indexes[UPROPS_DATA_TOP_INDEX]=offset;
855 totalSize=4*offset;
856
857 indexes[UPROPS_MAX_VALUES_INDEX]=
858 (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
859 (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
860 (int32_t)splitScriptCodeOrIndex(USCRIPT_CODE_LIMIT-1);
861 indexes[UPROPS_MAX_VALUES_2_INDEX]=
862 (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
863 (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
864 (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
865 (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
866 ((int32_t)U_DT_COUNT-1);
867
868 if(!beQuiet) {
869 puts("* uprops.icu stats *");
870 printf("trie size in bytes: %5u\n", (int)trieSize);
871 printf("size in bytes of additional props trie:%5u\n", (int)props2TrieSize);
872 printf("number of additional props vectors: %5u\n", (int)pvRows);
873 printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
874 printf("number of 16-bit scriptExtensions: %5u\n", (int)scriptExtensions.length());
875 printf("data size: %6ld\n", (long)totalSize);
876 }
877 }
878
879 void
writeCSourceFile(const char * path,UErrorCode & errorCode)880 CorePropsBuilder::writeCSourceFile(const char *path, UErrorCode &errorCode) {
881 if(U_FAILURE(errorCode)) { return; }
882
883 int32_t pvRows;
884 const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
885 int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
886
887 FILE *f=usrc_create(path, "uchar_props_data.h", 2016,
888 "icu/tools/unicode/c/genprops/corepropsbuilder.cpp");
889 if(f==NULL) {
890 errorCode=U_FILE_ACCESS_ERROR;
891 return;
892 }
893 fputs("#ifdef INCLUDED_FROM_UCHAR_C\n\n", f);
894 usrc_writeArray(f,
895 "static const UVersionInfo dataVersion={",
896 dataInfo.dataVersion, 8, 4,
897 "};\n\n");
898 usrc_writeUTrie2Arrays(f,
899 "static const uint16_t propsTrie_index[%ld]={\n", NULL,
900 pTrie,
901 "\n};\n\n");
902 usrc_writeUTrie2Struct(f,
903 "static const UTrie2 propsTrie={\n",
904 pTrie, "propsTrie_index", NULL,
905 "};\n\n");
906
907 usrc_writeUTrie2Arrays(f,
908 "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
909 props2Trie,
910 "\n};\n\n");
911 usrc_writeUTrie2Struct(f,
912 "static const UTrie2 propsVectorsTrie={\n",
913 props2Trie, "propsVectorsTrie_index", NULL,
914 "};\n\n");
915
916 usrc_writeArray(f,
917 "static const uint32_t propsVectors[%ld]={\n",
918 pvArray, 32, pvCount,
919 "};\n\n");
920 fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
921 fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)UPROPS_VECTOR_WORDS);
922
923 usrc_writeArray(f,
924 "static const uint16_t scriptExtensions[%ld]={\n",
925 scriptExtensions.getBuffer(), 16, scriptExtensions.length(),
926 "};\n\n");
927
928 usrc_writeArray(f,
929 "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
930 indexes, 32, UPROPS_INDEX_COUNT,
931 "};\n\n");
932 fputs("#endif // INCLUDED_FROM_UCHAR_C\n", f);
933 fclose(f);
934 }
935
936 void
writeBinaryData(const char * path,UBool withCopyright,UErrorCode & errorCode)937 CorePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
938 if(U_FAILURE(errorCode)) { return; }
939
940 int32_t pvRows;
941 const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
942 int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
943
944 UNewDataMemory *pData=udata_create(path, "icu", "uprops", &dataInfo,
945 withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
946 if(U_FAILURE(errorCode)) {
947 fprintf(stderr, "genprops: udata_create(%s, uprops.icu) failed - %s\n",
948 path, u_errorName(errorCode));
949 return;
950 }
951
952 udata_writeBlock(pData, indexes, sizeof(indexes));
953 udata_writeBlock(pData, trieBlock, trieSize);
954 udata_writeBlock(pData, props2TrieBlock, props2TrieSize);
955 udata_writeBlock(pData, pvArray, pvCount*4);
956 udata_writeBlock(pData, scriptExtensions.getBuffer(), scriptExtensions.length()*2);
957
958 long dataLength=udata_finish(pData, &errorCode);
959 if(U_FAILURE(errorCode)) {
960 fprintf(stderr, "genprops: error %s writing the output file\n", u_errorName(errorCode));
961 return;
962 }
963
964 if(dataLength!=(long)totalSize) {
965 fprintf(stderr,
966 "udata_finish(uprops.icu) reports %ld bytes written but should be %ld\n",
967 dataLength, (long)totalSize);
968 errorCode=U_INTERNAL_PROGRAM_ERROR;
969 }
970 }
971
972 PropsBuilder *
createCorePropsBuilder(UErrorCode & errorCode)973 createCorePropsBuilder(UErrorCode &errorCode) {
974 if(U_FAILURE(errorCode)) { return NULL; }
975 PropsBuilder *pb=new CorePropsBuilder(errorCode);
976 if(pb==NULL) {
977 errorCode=U_MEMORY_ALLOCATION_ERROR;
978 }
979 return pb;
980 }
981
982 /*
983 * Hey, Emacs, please set the following:
984 *
985 * Local Variables:
986 * indent-tabs-mode: nil
987 * End:
988 *
989 */
990