• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  corepropsbuilder.cpp (was store.c & props2.cpp)
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 1999dec11
16 *   created by: Markus W. Scherer
17 *
18 *   Store Unicode character properties efficiently for
19 *   random access.
20 */
21 
22 #include <stdio.h>
23 #include "unicode/utypes.h"
24 #include "unicode/uchar.h"
25 #include "unicode/udata.h"
26 #include "unicode/uniset.h"
27 #include "unicode/unistr.h"
28 #include "unicode/usetiter.h"
29 #include "unicode/uscript.h"
30 #include "cmemory.h"
31 #include "cstring.h"
32 #include "genprops.h"
33 #include "propsvec.h"
34 #include "uassert.h"
35 #include "unewdata.h"
36 #include "uprops.h"
37 #include "utrie2.h"
38 #include "writesrc.h"
39 
40 /* Unicode character properties file format ------------------------------------
41 
42 The file format prepared and written here contains several data
43 structures that store indexes or data.
44 
45 Before the data contents described below, there are the headers required by
46 the udata API for loading ICU data. Especially, a UDataInfo structure
47 precedes the actual data. It contains platform properties values and the
48 file format version.
49 
50 The following is a description of format version 7.7 .
51 
52 Data contents:
53 
The content is a parsed, binary form of several Unicode character
database files, most prominently UnicodeData.txt.
56 
57 Any Unicode code point from 0 to 0x10ffff can be looked up to get
58 the properties, if any, for that code point. This means that the input
59 to the lookup are 21-bit unsigned integers, with not all of the
60 21-bit range used.
61 
62 It is assumed that client code keeps a uint32_t pointer
63 to the beginning of the data:
64 
65     const uint32_t *p32;
66 
67 Formally, the file contains the following structures:
68 
69     const int32_t indexes[16] with values i0..i15:
70 
71   i0 indicates the length of the main trie.
72   i0..i3 all have the same value in format versions 4.0 and higher;
73          the related props32[] and exceptions[] and uchars[] were used in format version 3
74 
75     i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
76     i1 exceptionsIndex;  -- 32-bit unit index to the table of 32-bit exception words
77     i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
78 
79     i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
80     i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
81     i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
82 
83     i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data
84     i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data
85     i8 reservedIndex8; -- for now: i7, i8 and i9 have the same values
86     i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header)
87 
88     i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
89     i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
90     i12..i15 reservedIndexes; -- reserved values; 0 for now
91 
92     PT serialized properties trie, see utrie2.h (byte size: 4*(i0-16))
93 
94   P, E, and U are not used (empty) in format versions 4 and above
95 
96     P  const uint32_t props32[i1-i0];
97     E  const uint32_t exceptions[i2-i1];
98     U  const UChar uchars[2*(i3-i2)];
99 
100     AT serialized trie for additional properties (byte size: 4*(i4-i3))
101     PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
102 
103     SCX const uint16_t scriptExtensions[2*(i7-i6)];
104 
105       SCX contains Script_Extensions lists and (Script code, Script_Extensions index) pairs.
106       A Script_Extensions list is a sequence of UScriptCode values in ascending order,
107       with the last code having bit 15 set for termination.
108       A (Script code, Script_Extensions index) pair is the main UScriptCode (Script value)
109       followed by the index of the Script_Extensions list.
110       If the propsVectors[] column 0 value indicates that there are Script_Extensions,
111       then the script-code-or-index bit fields are an index to either a list or a pair in SCX,
112       rather than the Script itself. The high bits in the UPROPS_SCRIPT_X_MASK fields
113       indicate whether the main Script value is Common or Inherited (and the index is to a list)
114       vs. another value (and the index is to a pair).
115       (See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.)
116 
117 Trie lookup and properties:
118 
119 In order to condense the data for the 21-bit code space, several properties of
120 the Unicode code assignment are exploited:
121 - The code space is sparse.
122 - There are several 10k of consecutive codes with the same properties.
123 - Characters and scripts are allocated in groups of 16 code points.
124 - Inside blocks for scripts the properties are often repetitive.
125 - The 21-bit space is not fully used for Unicode.
126 
The lookup of properties for a given code point is done with a trie lookup,
using the UTrie2 implementation (UTrie before format version 7).
129 The trie lookup result is a 16-bit properties word.
130 
131 With a given Unicode code point
132 
133     UChar32 c;
134 
135 and 0<=c<0x110000, the lookup is done like this:
136 
137     uint16_t props;
138     UTRIE_GET16(trie, c, props);
139 
140 Each 16-bit properties word contains:
141 
142  0.. 4  general category
143      5  reserved
144  6..15  numeric type and value (ntv)
145 
146 Encoding of numeric type and value in the 10-bit ntv field:
147     ntv             type            value
148     0               U_NT_NONE       0
149     1..10           U_NT_DECIMAL    0..9
150     11..20          U_NT_DIGIT      0..9
151     21..0x3ff       U_NT_NUMERIC    see below
152 
153     For U_NT_NUMERIC:
154     ntv             value
155     21..0xaf        integer     0..154
156     0xb0..0x1df     fraction    ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16
157     0x1e0..0x2ff    large int   ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
158                     (only one significant decimal digit)
159     0x300..0x323    base-60 (sexagesimal) integer (new in format version 7.1)
160                                 ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
161     0x324..0x34b    fraction-20 (new in format version 7.3)
162                                 frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
163                                 numerator: num = 2*(frac20&3)+1
164                                 denominator: den = 20<<(frac20>>2)
165     0x34c..0x35b    fraction-32 (new in format version 7.6)
166                                 frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256
167                                 numerator: num = 2*(frac32&3)+1
168                                 denominator: den = 32<<(frac32>>2)
169     0x35c..0x3ff    reserved
170 
171 --- Additional properties (new in format version 2.1) ---
172 
173 The second trie for additional properties (AT) is also a UTrie with 16-bit data.
174 The data words consist of 32-bit unit indexes (not row indexes!) into the
175 table of unique properties vectors (PV).
176 Each vector contains a set of properties.
177 The width of a vector (number of uint32_t per row) may change
178 with the formatVersion, it is stored in i5.
179 
180 Current properties: see icu/source/common/uprops.h
181 
182 --- Changes in format version 3.1 ---
183 
184 See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
185 
186 --- Changes in format version 3.2 ---
187 
188 - The tries use linear Latin-1 ranges.
189 - The additional properties bits store full properties XYZ instead
190   of partial Other_XYZ, so that changes in the derivation formulas
191   need not be tracked in runtime library code.
192 - Joining Type and Line Break are also stored completely, so that uprops.c
193   needs no runtime formulas for enumerated properties either.
194 - Store the case-sensitive flag in the main properties word.
195 - i10 also contains U_LB_COUNT and U_EA_COUNT.
196 - i11 contains maxValues2 for vector word 2.
197 
198 --- Changes in format version 4 ---
199 
200 The format changes between version 3 and 4 because the properties related to
201 case mappings and bidi/shaping are pulled out into separate files
202 for modularization.
203 In order to reduce the need for code changes, some of the previous data
204 structures are omitted, rather than rearranging everything.
205 
206 (The change to format version 4 is for ICU 3.4. The last CVS revision of
207 genprops/store.c for format version 3.2 is 1.48.)
208 
209 The main trie's data is significantly simplified:
210 - The trie's 16-bit data word is used directly instead of as an index
211   into props32[].
212 - The trie uses the default trie folding functions instead of custom ones.
213 - Numeric values are stored directly in the trie data word, with special
214   encodings.
215 - No more exception data (the data that needed it was pulled out, or, in the
216   case of numeric values, encoded differently).
217 - No more string data (pulled out - was for case mappings).
218 
219 Also, some of the previously used properties vector bits are reserved again.
220 
221 The indexes[] values for the omitted structures are still filled in
222 (indicating zero-length arrays) so that the swapper code remains unchanged.
223 
224 --- Changes in format version 5 ---
225 
226 Format version 5 became necessary because the bit field for script codes
227 overflowed. The changes are incompatible because
228 old code would have seen nonsensically low values for new, higher script codes.
229 
230 Rearranged bit fields in the second trie (AT) and widened three (Script, Block,
231 Word_Break) by one bit each.
232 
233 Modified bit fields in icu/source/common/uprops.h
234 
235 --- Changes in format version 6 ---
236 
237 Format version 6 became necessary because Unicode 5.2 adds fractions with
238 denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
239 types and values rather than add another variant to the previous format.
240 
241 --- Changes in format version 7 ---
242 
243 Unicode 6.0 adds Script_Extensions. For characters with script extensions data,
244 the script code bits are an index into the new Script_Extensions array rather
245 than a script code.
246 
247 Change from UTrie to UTrie2.
248 
249 --- Changes in format version 7.1 ---
250 
251 Unicode 6.2 adds sexagesimal (base-60) numeric values:
252     cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000
253     cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000
254 
255 The encoding of numeric values was extended to handle such values.
256 
257 --- Changes in format version 7.2 ---
258 
259 ICU 57 adds 4 Emoji properties to vector word 2.
260 http://bugs.icu-project.org/trac/ticket/11802
261 http://www.unicode.org/reports/tr51/#Emoji_Properties
262 
263 --- Changes in format version 7.3 ---
264 
265 ICU 58 adds fraction-20 numeric values for new Unicode 9 Malayalam fraction characters.
266 
267 --- Changes in format version 7.4 ---
268 
269 ICU 60 adds the Prepended_Concatenation_Mark property to vector word 1.
270 
271 ICU 60 adds the Emoji_Component property to vector word 2, for emoji 5.
272 http://bugs.icu-project.org/trac/ticket/13062
273 http://www.unicode.org/reports/tr51/#Emoji_Properties
274 
275 --- Changes in format version 7.5 ---
276 
277 ICU 62 adds the Extended_Pictographic property to vector word 2, for emoji 11.
278 http://www.unicode.org/reports/tr51/#Emoji_Properties
279 
280 --- Changes in format version 7.6 ---
281 
282 ICU 64 adds fraction-32 numeric values for new Unicode 12 Tamil fraction characters.
283 
284 --- Changes in format version 7.7 ---
285 
286 ICU 66 adds two bits for the UScriptCode or Script_Extensions index in vector word 0.
287 The value is split across bits 21..20 & 7..0.
288 
289 ----------------------------------------------------------------------------- */
290 
291 U_NAMESPACE_USE
292 
/*
 * UDataInfo cf. udata.h
 *
 * Header descriptor for the data produced by this builder.
 * The data format is "UPro" and the format version is 7.7, matching the
 * file format description at the top of this file.
 * The dataVersion given here is only an initial value:
 * CorePropsBuilder::setUnicodeVersion() overwrites its four bytes.
 * NOTE(review): the meaning of the uncommented leading fields is defined by
 * the UDataInfo struct in udata.h, which is not visible here.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    { 0x55, 0x50, 0x72, 0x6f },                 /* dataFormat="UPro" */
    { 7, 7, 0, 0 },                             /* formatVersion */
    { 10, 0, 0, 0 }                             /* dataVersion */
};
307 
splitScriptCodeOrIndex(uint32_t v)308 inline uint32_t splitScriptCodeOrIndex(uint32_t v) {
309     return
310         ((v << UPROPS_SCRIPT_HIGH_SHIFT) & UPROPS_SCRIPT_HIGH_MASK) |
311         (v & UPROPS_SCRIPT_LOW_MASK);
312 }
313 
/**
 * PropsBuilder subclass that collects the core character properties:
 * general category and numeric type/value go into one trie,
 * further properties go into properties vectors,
 * and Script_Extensions lists are collected in a string of 16-bit units.
 */
class CorePropsBuilder : public PropsBuilder {
public:
    /** Opens the main trie and the properties vectors; reports failures via errorCode. */
    CorePropsBuilder(UErrorCode &errorCode);
    virtual ~CorePropsBuilder();

    /** Copies the given version into the dataVersion of the file-level dataInfo. */
    virtual void setUnicodeVersion(const UVersionInfo version);
    /** Records the properties of one code point or range; newValues lists the properties that changed. */
    virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
    virtual void build(UErrorCode &errorCode);
    virtual void writeCSourceFile(const char *path, UErrorCode &errorCode);
    virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);

private:
    /** Stores general category and encoded numeric type & value in pTrie. */
    void setGcAndNumeric(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);

    UTrie2 *pTrie;                   // main properties trie, opened in the constructor
    UTrie2 *props2Trie;              // starts out NULL; closed in the destructor
    UPropsVectors *pv;               // properties vectors with UPROPS_VECTOR_WORDS columns
    UnicodeString scriptExtensions;  // Script_Extensions lists and (script, index) pairs
};
333 
CorePropsBuilder(UErrorCode & errorCode)334 CorePropsBuilder::CorePropsBuilder(UErrorCode &errorCode)
335         : pTrie(NULL), props2Trie(NULL), pv(NULL) {
336     pTrie=utrie2_open(0, 0, &errorCode);
337     if(U_FAILURE(errorCode)) {
338         fprintf(stderr, "genprops error: corepropsbuilder utrie2_open() failed - %s\n",
339                 u_errorName(errorCode));
340     }
341     pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
342     if(U_FAILURE(errorCode)) {
343         fprintf(stderr, "genprops error: corepropsbuilder upvec_open() failed - %s\n",
344                 u_errorName(errorCode));
345     }
346 }
347 
/** Closes both tries and the properties vectors held by this builder. */
CorePropsBuilder::~CorePropsBuilder() {
    utrie2_close(pTrie);
    // props2Trie is initialized to NULL in the constructor;
    // where it gets assigned is not visible in this part of the file.
    utrie2_close(props2Trie);
    upvec_close(pv);
}
353 
354 void
setUnicodeVersion(const UVersionInfo version)355 CorePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
356     uprv_memcpy(dataInfo.dataVersion, version, 4);
357 }
358 
encodeFractional20(int32_t value,int32_t den)359 static int32_t encodeFractional20(int32_t value, int32_t den) {
360     if(den<20 || 640<den) { return -1; }
361     int32_t frac20;
362     if(value==1 || value==3 || value==5 || value==7) {
363         frac20=value/2;
364     } else {
365         return -1;
366     }
367     // Denominator: 20 times which power of 2: 0..5 into bits 4..2
368     do {
369         if(den==20) {
370             return UPROPS_NTV_FRACTION20_START+frac20;
371         }
372         if(den&1) {
373             return -1;  // odd denominator, and we would lose the low bit in den/=2
374         }
375         den/=2;
376         frac20+=4;
377     } while(den>=20);
378     return -1;
379 }
380 
encodeFractional32(int32_t value,int32_t den)381 static int32_t encodeFractional32(int32_t value, int32_t den) {
382     if(den<32 || 256<den) { return -1; }
383     int32_t frac32;
384     if(value==1 || value==3 || value==5 || value==7) {
385         frac32=value/2;
386     } else {
387         return -1;
388     }
389     // Denominator: 32 times which power of 2: 0..3 into bits 3..2
390     do {
391         if(den==32) {
392             return UPROPS_NTV_FRACTION32_START+frac32;
393         }
394         if(den&1) {
395             return -1;  // odd denominator, and we would lose the low bit in den/=2
396         }
397         den/=2;
398         frac32+=4;
399     } while(den>=32);
400     return -1;
401 }
402 
// For nt=U_NT_NUMERIC.
//
// Parses the Numeric_Value string s and encodes it as an ntv value
// (see the "Encoding of numeric type and value" table in the file format description).
// Accepted syntax: optional '-', then decimal digits, optionally followed by
// '/' and a decimal denominator.
//
// start is not used in this function.
// On failure, prints an error message, sets errorCode=U_ILLEGAL_ARGUMENT_ERROR
// and returns the (possibly negative) ntv computed so far.
static int32_t
encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
    const char *original=s;  // kept for the error message
    /* get a possible minus sign */
    UBool isNegative;
    if(*s=='-') {
        isNegative=TRUE;
        ++s;
    } else {
        isNegative=FALSE;
    }

    // value=numerator or integer, den=denominator (0 if none),
    // exp=number of trailing zeros found by the fast path below,
    // ntv=result; -1 marks "cannot be encoded".
    int32_t value=0, den=0, exp=0, ntv=0;
    char *numberLimit;
    /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
    if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
        value=s[0]-'0';
        numberLimit=const_cast<char *>(s);
        // Count the zeros after the first digit; numberLimit stops at the first non-'0'.
        while(*(++numberLimit)=='0') {
            ++exp;
        }
    } else {
        /* normal number parsing */
        unsigned long ul=uprv_strtoul(s, &numberLimit, 10);
        if(s==numberLimit || (*numberLimit!=0 && *numberLimit!='/') || ul>0x7fffffff) {
            ntv=-1;
        } else {
            value=(int32_t)ul;
        }
        if(ntv>=0 && *numberLimit=='/') {
            /* fractional value, get the denominator */
            s=numberLimit+1;
            ul=uprv_strtoul(s, &numberLimit, 10);
            if(s==numberLimit || *numberLimit!=0 || ul==0 || ul>0x7fffffff) {
                ntv=-1;
            } else {
                den=(int32_t)ul;
            }
        }
    }
    if(isNegative) {
        value=-(int32_t)value;
    }

    // Pick the encoding. The branches are tried in order;
    // the fraction-20/32 branches assign ntv inside their conditions.
    if(ntv<0) {
        // pass
    } else if(den==0 && value>=0) {
        // Non-negative integer.
        if(exp==2 && (value*100)<=UPROPS_NTV_MAX_SMALL_INT) {
            /* small integer parsed like a large one */
            ntv=UPROPS_NTV_NUMERIC_START+value*100;
        } else if(exp==0) {
            if(value<=UPROPS_NTV_MAX_SMALL_INT) {
                /* small integer */
                ntv=UPROPS_NTV_NUMERIC_START+value;
            } else {
                /* large integer parsed like a small one */
                /* split the value into mantissa and exponent, base 10 */
                int32_t mant=value;
                while((mant%10)==0) {
                    mant/=10;
                    ++exp;
                }
                // Note: value<=0x7fffffff guarantees exp<=33
                if(mant<=9) {
                    ntv=((mant+14)<<5)+(exp-2);
                } else {
                    // Try sexagesimal (base 60) numbers.
                    mant=value;
                    exp=0;
                    while((mant%60)==0) {
                        mant/=60;
                        ++exp;
                    }
                    if(mant<=9 && exp<=4) {
                        ntv=((mant+0xbf)<<2)+(exp-1);
                    } else {
                        ntv=-1;
                    }
                }
            }
        } else if(2<=exp && exp<=33 && 1<=value && value<=9) {
            /* large, single-significant-digit integer */
            ntv=((value+14)<<5)+(exp-2);
        } else {
            ntv=-1;
        }
    } else if(exp==0 && -1<=value && value<=17 && 1<=den && den<=16) {
        /* fraction */
        ntv=((value+12)<<4)+(den-1);
    } else if(exp==0 && value==-1 && den==0) {
        /* -1 parsed with den=0, encoded as pseudo-fraction -1/1 */
        ntv=((value+12)<<4);
    } else if(exp==0 && (ntv=encodeFractional20(value, den))>=0) {
        // fits into fractional-20 format
    } else if(exp==0 && (ntv=encodeFractional32(value, den))>=0) {
        // fits into fractional-32 format
    } else {
        ntv=-1;
    }
    // Also reject input with characters left after the parsed number,
    // for example after the run of zeros in the fast path.
    if(ntv<0 || *numberLimit!=0) {
        fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", original);
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    }
    return ntv;
}
509 
510 void
setGcAndNumeric(const UniProps & props,const UnicodeSet & newValues,UErrorCode & errorCode)511 CorePropsBuilder::setGcAndNumeric(const UniProps &props, const UnicodeSet &newValues,
512                                   UErrorCode &errorCode) {
513     if(U_FAILURE(errorCode)) { return; }
514     UChar32 start=props.start;
515     UChar32 end=props.end;
516 
517     int32_t type=props.getIntProp(UCHAR_NUMERIC_TYPE);
518     const char *nvString=props.numericValue;
519     if(type!=U_NT_NONE && nvString==NULL && start==end) {
520         fprintf(stderr, "genprops error: cp line has Numeric_Type but no Numeric_Value\n");
521         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
522         return;
523     }
524 
525     if(!newValues.contains(UCHAR_GENERAL_CATEGORY) && !newValues.contains(UCHAR_NUMERIC_VALUE)) {
526         return;
527     }
528 
529     int32_t ntv=UPROPS_NTV_NONE;  // numeric type & value
530     if(nvString!=NULL && uprv_strcmp(nvString, "NaN")!=0) {
531         int32_t digitValue=props.digitValue;
532         if( type<=U_NT_NONE || U_NT_NUMERIC<type ||
533             ((type==U_NT_DECIMAL || type==U_NT_DIGIT) && digitValue<0)
534         ) {
535             fprintf(stderr, "genprops error: nt=%d but nv=%s\n",
536                     (int)type, nvString==NULL ? "NULL" : nvString);
537             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
538             return;
539         }
540 
541         switch(type) {
542         case U_NT_NONE:
543             ntv=UPROPS_NTV_NONE;
544             break;
545         case U_NT_DECIMAL:
546             ntv=UPROPS_NTV_DECIMAL_START+digitValue;
547             break;
548         case U_NT_DIGIT:
549             ntv=UPROPS_NTV_DIGIT_START+digitValue;
550             break;
551         case U_NT_NUMERIC:
552             if(digitValue>=0) {
553                 ntv=UPROPS_NTV_NUMERIC_START+digitValue;
554             } else {
555                 ntv=encodeNumericValue(start, nvString, errorCode);
556                 if(U_FAILURE(errorCode)) {
557                     return;
558                 }
559             }
560         default:
561             break;  // unreachable
562         }
563     }
564 
565     uint32_t value=
566         (uint32_t)props.getIntProp(UCHAR_GENERAL_CATEGORY) |
567         (ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
568     if(start==end) {
569         utrie2_set32(pTrie, start, value, &errorCode);
570     } else {
571         utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
572     }
573     if(U_FAILURE(errorCode)) {
574         fprintf(stderr, "error: utrie2_setRange32(properties trie %04lX..%04lX) failed - %s\n",
575                 (long)start, (long)end, u_errorName(errorCode));
576     }
577 }
578 
/**
 * Table entry that maps a binary property to its storage location:
 * one bit in one word of the properties vectors.
 */
struct PropToBinary {
    int32_t prop;      // UProperty
    int32_t vecWord;   // index of the properties vector word
    int32_t vecShift;  // bit number of the property's flag within that word
};
583 
// Binary properties that are stored as single bits in the properties vectors.
// Each entry: { property, vector word index, bit number in that word }.
// setProps() iterates over this table and sets or clears the bit
// for each listed property that is new on the current input line.
static const PropToBinary
propToBinaries[]={
    { UCHAR_WHITE_SPACE,                    1, UPROPS_WHITE_SPACE },
    { UCHAR_DASH,                           1, UPROPS_DASH },
    // Note: The Hyphen property is stabilized since Unicode 4.0
    // and deprecated since Unicode 6.0.
    { UCHAR_HYPHEN,                         1, UPROPS_HYPHEN },
    { UCHAR_QUOTATION_MARK,                 1, UPROPS_QUOTATION_MARK },
    { UCHAR_TERMINAL_PUNCTUATION,           1, UPROPS_TERMINAL_PUNCTUATION },
    // Note: The Hex_Digit and ASCII_Hex_Digit properties are probably stable enough
    // so that they could be hardcoded.
    { UCHAR_HEX_DIGIT,                      1, UPROPS_HEX_DIGIT },
    { UCHAR_ASCII_HEX_DIGIT,                1, UPROPS_ASCII_HEX_DIGIT },
    { UCHAR_IDEOGRAPHIC,                    1, UPROPS_IDEOGRAPHIC },
    { UCHAR_DIACRITIC,                      1, UPROPS_DIACRITIC },
    { UCHAR_EXTENDER,                       1, UPROPS_EXTENDER },
    // Note: The Noncharacter_Code_Point property is probably stable enough
    // so that it could be hardcoded.
    { UCHAR_NONCHARACTER_CODE_POINT,        1, UPROPS_NONCHARACTER_CODE_POINT },
    // Note: The Grapheme_Link property is deprecated since Unicode 5.0
    // because it is a "Duplication of ccc=9" (UAX #44).
    { UCHAR_GRAPHEME_LINK,                  1, UPROPS_GRAPHEME_LINK },
    { UCHAR_IDS_BINARY_OPERATOR,            1, UPROPS_IDS_BINARY_OPERATOR },
    { UCHAR_IDS_TRINARY_OPERATOR,           1, UPROPS_IDS_TRINARY_OPERATOR },
    { UCHAR_RADICAL,                        1, UPROPS_RADICAL },
    { UCHAR_UNIFIED_IDEOGRAPH,              1, UPROPS_UNIFIED_IDEOGRAPH },
    { UCHAR_DEPRECATED,                     1, UPROPS_DEPRECATED },
    { UCHAR_LOGICAL_ORDER_EXCEPTION,        1, UPROPS_LOGICAL_ORDER_EXCEPTION },
    { UCHAR_S_TERM,                         1, UPROPS_S_TERM },
    { UCHAR_VARIATION_SELECTOR,             1, UPROPS_VARIATION_SELECTOR },
    // Note: Pattern_Syntax & Pattern_White_Space are available via
    // the internal PatternProps class and need not be stored here any more.
    { UCHAR_PATTERN_SYNTAX,                 1, UPROPS_PATTERN_SYNTAX },
    { UCHAR_PATTERN_WHITE_SPACE,            1, UPROPS_PATTERN_WHITE_SPACE },
    { UCHAR_XID_START,                      1, UPROPS_XID_START },
    { UCHAR_XID_CONTINUE,                   1, UPROPS_XID_CONTINUE },
    { UCHAR_MATH,                           1, UPROPS_MATH },
    { UCHAR_ALPHABETIC,                     1, UPROPS_ALPHABETIC },
    { UCHAR_GRAPHEME_EXTEND,                1, UPROPS_GRAPHEME_EXTEND },
    { UCHAR_DEFAULT_IGNORABLE_CODE_POINT,   1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
    { UCHAR_ID_START,                       1, UPROPS_ID_START },
    { UCHAR_ID_CONTINUE,                    1, UPROPS_ID_CONTINUE },
    { UCHAR_GRAPHEME_BASE,                  1, UPROPS_GRAPHEME_BASE },

    // Emoji-related properties live in vector word 2,
    // except Prepended_Concatenation_Mark which is stored in word 1.
    { UCHAR_EMOJI,                          2, UPROPS_2_EMOJI },
    { UCHAR_EMOJI_PRESENTATION,             2, UPROPS_2_EMOJI_PRESENTATION },
    { UCHAR_EMOJI_MODIFIER,                 2, UPROPS_2_EMOJI_MODIFIER },
    { UCHAR_EMOJI_MODIFIER_BASE,            2, UPROPS_2_EMOJI_MODIFIER_BASE },
    { UCHAR_EMOJI_COMPONENT,                2, UPROPS_2_EMOJI_COMPONENT },
    { UCHAR_PREPENDED_CONCATENATION_MARK,   1, UPROPS_PREPENDED_CONCATENATION_MARK },
    { UCHAR_EXTENDED_PICTOGRAPHIC,          2, UPROPS_2_EXTENDED_PICTOGRAPHIC },
};
636 
/**
 * Table entry that maps an enumerated (int) property to its storage location:
 * a bit field in one word of the properties vectors.
 */
struct PropToEnum {
    int32_t prop;      // UProperty
    int32_t vecWord;   // index of the properties vector word
    int32_t vecShift;  // left shift that moves the value into its bit field
    uint32_t vecMask;  // mask of the bit field within the word
};
642 
// Enumerated properties that are stored as bit fields in the properties vectors.
// Each entry: { property, vector word index, shift, mask }.
// setProps() shifts the property value left by the shift and
// writes it into the word under the mask.
static const PropToEnum
propToEnums[]={
    { UCHAR_BLOCK,                      0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK },
    { UCHAR_EAST_ASIAN_WIDTH,           0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
    // Decomposition_Type is stored without a shift (shift 0).
    { UCHAR_DECOMPOSITION_TYPE,         2, 0, UPROPS_DT_MASK },
    { UCHAR_GRAPHEME_CLUSTER_BREAK,     2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
    { UCHAR_WORD_BREAK,                 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
    { UCHAR_SENTENCE_BREAK,             2, UPROPS_SB_SHIFT, UPROPS_SB_MASK },
    { UCHAR_LINE_BREAK,                 2, UPROPS_LB_SHIFT, UPROPS_LB_MASK },
};
653 
654 void
setProps(const UniProps & props,const UnicodeSet & newValues,UErrorCode & errorCode)655 CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
656                            UErrorCode &errorCode) {
657     setGcAndNumeric(props, newValues, errorCode);
658     if(U_FAILURE(errorCode)) { return; }
659 
660     UChar32 start=props.start;
661     UChar32 end=props.end;
662     if(start==0 && end==0x10ffff) {
663         // Also set bits for initialValue and errorValue.
664         end=UPVEC_MAX_CP;
665     }
666 
667     if(newValues.containsSome(0, UCHAR_BINARY_LIMIT-1)) {
668         for(int32_t i=0; i<LENGTHOF(propToBinaries); ++i) {
669             const PropToBinary &p2b=propToBinaries[i];
670             U_ASSERT(p2b.vecShift<32);
671             if(newValues.contains(p2b.prop)) {
672                 uint32_t mask=U_MASK(p2b.vecShift);
673                 uint32_t value= props.binProps[p2b.prop] ? mask : 0;
674                 upvec_setValue(pv, start, end, p2b.vecWord, value, mask, &errorCode);
675             }
676         }
677     }
678 
679     // Set int property values.
680     if(newValues.containsSome(UCHAR_INT_START, UCHAR_INT_LIMIT-1)) {
681         for(int32_t i=0; i<LENGTHOF(propToEnums); ++i) {
682             const PropToEnum &p2e=propToEnums[i];
683             U_ASSERT(p2e.vecShift<32);
684             if(newValues.contains(p2e.prop)) {
685                 uint32_t mask=p2e.vecMask;
686                 uint32_t value=(uint32_t)(props.getIntProp(p2e.prop)<<p2e.vecShift);
687                 U_ASSERT((value&mask)==value);
688                 upvec_setValue(pv, start, end, p2e.vecWord, value, mask, &errorCode);
689             }
690         }
691     }
692     if(newValues.contains(UCHAR_AGE)) {
693         if(props.age[0]>15 || props.age[1]>15 || props.age[2]!=0 || props.age[3]!=0) {
694             char buffer[U_MAX_VERSION_STRING_LENGTH];
695             u_versionToString(props.age, buffer);
696             fprintf(stderr, "genprops error: age %s cannot be encoded\n", buffer);
697             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
698             return;
699         }
700         uint32_t version=(props.age[0]<<4)|props.age[1];
701         upvec_setValue(pv, start, end,
702                        0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK,
703                        &errorCode);
704     }
705 
706     // Set the script value if the Script_Extensions revert to {Script}.
707     // Otherwise we would have to duplicate the code for doing so.
708     // Script and Script_Extensions share a bit field, so that by setting it to just the script
709     // we remove the Script_Extensions.
710     // (We do not just set the script bit in newValues because that is const.)
711     // For example, for U+3000:
712     // block;3000..303F;age=1.1;...;sc=Zyyy;scx=Bopo Hang Hani Hira Kana Yiii;vo=U
713     // cp;3000;...;gc=Zs;lb=BA;na=IDEOGRAPHIC SPACE;...;SB=SP;scx=<script>;WSpace
714     UBool revertToScript=
715         newValues.contains(UCHAR_SCRIPT_EXTENSIONS) && props.scx.isEmpty() &&
716         !newValues.contains(UCHAR_SCRIPT);
717     if(newValues.contains(UCHAR_SCRIPT) || revertToScript) {
718         int32_t script=props.getIntProp(UCHAR_SCRIPT);
719         uint32_t value=splitScriptCodeOrIndex(script);
720         // Use UPROPS_SCRIPT_X_MASK:
721         // When writing a Script code, remove Script_Extensions bits as well.
722         // If needed, they will get written again.
723         upvec_setValue(pv, start, end, 0, value, UPROPS_SCRIPT_X_MASK, &errorCode);
724     }
725     // Write a new (Script, Script_Extensions) value if there are Script_Extensions
726     // and either Script or Script_Extensions are new on the current line.
727     // (If only Script is new, then it just clobbered the relevant bits.)
728     if( !props.scx.isEmpty() &&
729         (newValues.contains(UCHAR_SCRIPT) || newValues.contains(UCHAR_SCRIPT_EXTENSIONS))
730     ) {
731         UnicodeString codes;  // vector of 16-bit UScriptCode values
732         UnicodeSetIterator iter(props.scx);
733         while(iter.next()) { codes.append((UChar)iter.getCodepoint()); }
734 
735         // Set bit 15 on the last script code, for termination.
736         int32_t length=codes.length();
737         codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000));
738         // Find this list of codes in the Script_Extensions data so far, or add this list.
739         int32_t index=scriptExtensions.indexOf(codes);
740         if(index<0) {
741             index=scriptExtensions.length();
742             scriptExtensions.append(codes);
743         }
744 
745         // Encode the (Script, Script_Extensions index) pair.
746         int32_t script=props.getIntProp(UCHAR_SCRIPT);
747         uint32_t scriptX;
748         if(script==USCRIPT_COMMON) {
749             scriptX=UPROPS_SCRIPT_X_WITH_COMMON;
750         } else if(script==USCRIPT_INHERITED) {
751             scriptX=UPROPS_SCRIPT_X_WITH_INHERITED;
752         } else {
753             // Store an additional pair of 16-bit units for an unusual main Script code
754             // together with the Script_Extensions index.
755             UnicodeString codeIndexPair;
756             codeIndexPair.append((UChar)script).append((UChar)index);
757             index=scriptExtensions.indexOf(codeIndexPair);
758             if(index<0) {
759                 index=scriptExtensions.length();
760                 scriptExtensions.append(codeIndexPair);
761             }
762             scriptX=UPROPS_SCRIPT_X_WITH_OTHER;
763         }
764         if(index>UPROPS_MAX_SCRIPT) {
765             fprintf(stderr, "genprops: Script_Extensions indexes overflow bit fields\n");
766             errorCode=U_BUFFER_OVERFLOW_ERROR;
767             return;
768         }
769         scriptX|=splitScriptCodeOrIndex(index);
770         upvec_setValue(pv, start, end, 0, scriptX, UPROPS_SCRIPT_X_MASK, &errorCode);
771     }
772     if(U_FAILURE(errorCode)) {
773         fprintf(stderr, "genprops error: unable to set props2 values for %04lX..%04lX: %s\n",
774                 (long)start, (long)end, u_errorName(errorCode));
775     }
776 }
777 
778 static int32_t indexes[UPROPS_INDEX_COUNT]={
779     0, 0, 0, 0,
780     0, 0, 0, 0,
781     0, 0, 0, 0,
782     0, 0, 0, 0
783 };
784 
785 static uint8_t trieBlock[100000];
786 static int32_t trieSize;
787 static uint8_t props2TrieBlock[100000];
788 static int32_t props2TrieSize;
789 
790 static int32_t totalSize;
791 
792 void
build(UErrorCode & errorCode)793 CorePropsBuilder::build(UErrorCode &errorCode) {
794     if(U_FAILURE(errorCode)) { return; }
795 
796     utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
797     if(U_FAILURE(errorCode)) {
798         fprintf(stderr,
799                 "genprops/core error: utrie2_freeze(main trie) failed: %s\n",
800                 u_errorName(errorCode));
801         return;
802     }
803     trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
804     if(U_FAILURE(errorCode)) {
805         fprintf(stderr,
806                 "genprops/core error: utrie2_serialize(main trie) failed: %s (length %ld)\n",
807                 u_errorName(errorCode), (long)trieSize);
808         return;
809     }
810 
811     props2Trie=upvec_compactToUTrie2WithRowIndexes(pv, &errorCode);
812     if(U_FAILURE(errorCode)) {
813         fprintf(stderr, "genprops/core error: unable to build trie for additional properties: %s\n",
814                 u_errorName(errorCode));
815         return;
816     }
817 
818     props2TrieSize=utrie2_serialize(props2Trie,
819                                     props2TrieBlock, (int32_t)sizeof(props2TrieBlock),
820                                     &errorCode);
821     if(U_FAILURE(errorCode)) {
822         fprintf(stderr,
823                 "genprops/core error: utrie2_freeze(additional properties)+utrie2_serialize() "
824                 "failed: %s\n",
825                 u_errorName(errorCode));
826         return;
827     }
828 
829     int32_t pvRows;
830     const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
831     int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
832 
833     /* round up scriptExtensions to multiple of 4 bytes */
834     if(scriptExtensions.length()&1) {
835         scriptExtensions.append((UChar)0);
836     }
837 
838     /* set indexes */
839     int32_t offset=sizeof(indexes)/4;       /* uint32_t offset to the properties trie */
840     offset+=trieSize>>2;
841     indexes[UPROPS_PROPS32_INDEX]=          /* set indexes to the same offsets for empty */
842     indexes[UPROPS_EXCEPTIONS_INDEX]=       /* structures from the old format version 3 */
843     indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=   /* so that less runtime code has to be changed */
844     indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
845 
846     offset+=props2TrieSize/4;
847     indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=offset;
848     indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
849     offset+=pvCount;
850     indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=offset;
851     offset+=scriptExtensions.length()/2;
852     indexes[UPROPS_RESERVED_INDEX_7]=offset;
853     indexes[UPROPS_RESERVED_INDEX_8]=offset;
854     indexes[UPROPS_DATA_TOP_INDEX]=offset;
855     totalSize=4*offset;
856 
857     indexes[UPROPS_MAX_VALUES_INDEX]=
858         (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
859         (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
860         (int32_t)splitScriptCodeOrIndex(USCRIPT_CODE_LIMIT-1);
861     indexes[UPROPS_MAX_VALUES_2_INDEX]=
862         (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
863         (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
864         (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
865         (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
866         ((int32_t)U_DT_COUNT-1);
867 
868     if(!beQuiet) {
869         puts("* uprops.icu stats *");
870         printf("trie size in bytes:                    %5u\n", (int)trieSize);
871         printf("size in bytes of additional props trie:%5u\n", (int)props2TrieSize);
872         printf("number of additional props vectors:    %5u\n", (int)pvRows);
873         printf("number of 32-bit words per vector:     %5u\n", UPROPS_VECTOR_WORDS);
874         printf("number of 16-bit scriptExtensions:     %5u\n", (int)scriptExtensions.length());
875         printf("data size:                            %6ld\n", (long)totalSize);
876     }
877 }
878 
879 void
writeCSourceFile(const char * path,UErrorCode & errorCode)880 CorePropsBuilder::writeCSourceFile(const char *path, UErrorCode &errorCode) {
881     if(U_FAILURE(errorCode)) { return; }
882 
883     int32_t pvRows;
884     const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
885     int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
886 
887     FILE *f=usrc_create(path, "uchar_props_data.h", 2016,
888                         "icu/tools/unicode/c/genprops/corepropsbuilder.cpp");
889     if(f==NULL) {
890         errorCode=U_FILE_ACCESS_ERROR;
891         return;
892     }
893     fputs("#ifdef INCLUDED_FROM_UCHAR_C\n\n", f);
894     usrc_writeArray(f,
895         "static const UVersionInfo dataVersion={",
896         dataInfo.dataVersion, 8, 4,
897         "};\n\n");
898     usrc_writeUTrie2Arrays(f,
899         "static const uint16_t propsTrie_index[%ld]={\n", NULL,
900         pTrie,
901         "\n};\n\n");
902     usrc_writeUTrie2Struct(f,
903         "static const UTrie2 propsTrie={\n",
904         pTrie, "propsTrie_index", NULL,
905         "};\n\n");
906 
907     usrc_writeUTrie2Arrays(f,
908         "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
909         props2Trie,
910         "\n};\n\n");
911     usrc_writeUTrie2Struct(f,
912         "static const UTrie2 propsVectorsTrie={\n",
913         props2Trie, "propsVectorsTrie_index", NULL,
914         "};\n\n");
915 
916     usrc_writeArray(f,
917         "static const uint32_t propsVectors[%ld]={\n",
918         pvArray, 32, pvCount,
919         "};\n\n");
920     fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
921     fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)UPROPS_VECTOR_WORDS);
922 
923     usrc_writeArray(f,
924         "static const uint16_t scriptExtensions[%ld]={\n",
925         scriptExtensions.getBuffer(), 16, scriptExtensions.length(),
926         "};\n\n");
927 
928     usrc_writeArray(f,
929         "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
930         indexes, 32, UPROPS_INDEX_COUNT,
931         "};\n\n");
932     fputs("#endif  // INCLUDED_FROM_UCHAR_C\n", f);
933     fclose(f);
934 }
935 
936 void
writeBinaryData(const char * path,UBool withCopyright,UErrorCode & errorCode)937 CorePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
938     if(U_FAILURE(errorCode)) { return; }
939 
940     int32_t pvRows;
941     const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
942     int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
943 
944     UNewDataMemory *pData=udata_create(path, "icu", "uprops", &dataInfo,
945                                        withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
946     if(U_FAILURE(errorCode)) {
947         fprintf(stderr, "genprops: udata_create(%s, uprops.icu) failed - %s\n",
948                 path, u_errorName(errorCode));
949         return;
950     }
951 
952     udata_writeBlock(pData, indexes, sizeof(indexes));
953     udata_writeBlock(pData, trieBlock, trieSize);
954     udata_writeBlock(pData, props2TrieBlock, props2TrieSize);
955     udata_writeBlock(pData, pvArray, pvCount*4);
956     udata_writeBlock(pData, scriptExtensions.getBuffer(), scriptExtensions.length()*2);
957 
958     long dataLength=udata_finish(pData, &errorCode);
959     if(U_FAILURE(errorCode)) {
960         fprintf(stderr, "genprops: error %s writing the output file\n", u_errorName(errorCode));
961         return;
962     }
963 
964     if(dataLength!=(long)totalSize) {
965         fprintf(stderr,
966                 "udata_finish(uprops.icu) reports %ld bytes written but should be %ld\n",
967                 dataLength, (long)totalSize);
968         errorCode=U_INTERNAL_PROGRAM_ERROR;
969     }
970 }
971 
972 PropsBuilder *
createCorePropsBuilder(UErrorCode & errorCode)973 createCorePropsBuilder(UErrorCode &errorCode) {
974     if(U_FAILURE(errorCode)) { return NULL; }
975     PropsBuilder *pb=new CorePropsBuilder(errorCode);
976     if(pb==NULL) {
977         errorCode=U_MEMORY_ALLOCATION_ERROR;
978     }
979     return pb;
980 }
981 
982 /*
983  * Hey, Emacs, please set the following:
984  *
985  * Local Variables:
986  * indent-tabs-mode: nil
987  * End:
988  *
989  */
990