• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  corepropsbuilder.cpp (was store.c & props2.cpp)
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 1999dec11
16 *   created by: Markus W. Scherer
17 *
18 *   Store Unicode character properties efficiently for
19 *   random access.
20 */
21 
22 #include <stdio.h>
23 #include "unicode/utypes.h"
24 #include "unicode/uchar.h"
25 #include "unicode/udata.h"
26 #include "unicode/uniset.h"
27 #include "unicode/unistr.h"
28 #include "unicode/usetiter.h"
29 #include "unicode/uscript.h"
30 #include "cmemory.h"
31 #include "cstring.h"
32 #include "genprops.h"
33 #include "propsvec.h"
34 #include "uassert.h"
35 #include "unewdata.h"
36 #include "uprops.h"
37 #include "utrie2.h"
38 #include "writesrc.h"
39 
40 /* Unicode character properties file format ------------------------------------
41 
42 The file format prepared and written here contains several data
43 structures that store indexes or data.
44 
45 Before the data contents described below, there are the headers required by
46 the udata API for loading ICU data. Especially, a UDataInfo structure
47 precedes the actual data. It contains platform properties values and the
48 file format version.
49 
50 The following is a description of format version 7.8 .
51 
52 Data contents:
53 
54 The contents is a parsed, binary form of several Unicode character
55 database files, most prominently UnicodeData.txt.
56 
57 Any Unicode code point from 0 to 0x10ffff can be looked up to get
58 the properties, if any, for that code point. This means that the input
59 to the lookup are 21-bit unsigned integers, with not all of the
60 21-bit range used.
61 
62 It is assumed that client code keeps a uint32_t pointer
63 to the beginning of the data:
64 
65     const uint32_t *p32;
66 
67 Formally, the file contains the following structures:
68 
69     const int32_t indexes[16] with values i0..i15:
70 
71   i0 indicates the length of the main trie.
72   i0..i3 all have the same value in format versions 4.0 and higher;
73          the related props32[] and exceptions[] and uchars[] were used in format version 3
74 
75     i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
76     i1 exceptionsIndex;  -- 32-bit unit index to the table of 32-bit exception words
77     i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
78 
79     i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
80     i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
81     i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
82 
83     i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data
84     i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data
85     i8 reservedIndex8; -- for now: i7, i8 and i9 have the same values
86     i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header)
87 
88     i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
89     i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
90     i12..i15 reservedIndexes; -- reserved values; 0 for now
91 
92     PT serialized properties trie, see utrie2.h (byte size: 4*(i0-16))
93 
94   P, E, and U are not used (empty) in format versions 4 and above
95 
96     P  const uint32_t props32[i1-i0];
97     E  const uint32_t exceptions[i2-i1];
98     U  const UChar uchars[2*(i3-i2)];
99 
100     AT serialized trie for additional properties (byte size: 4*(i4-i3))
101     PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
102 
103     SCX const uint16_t scriptExtensions[2*(i7-i6)];
104 
105       SCX contains Script_Extensions lists and (Script code, Script_Extensions index) pairs.
106       A Script_Extensions list is a sequence of UScriptCode values in ascending order,
107       with the last code having bit 15 set for termination.
108       A (Script code, Script_Extensions index) pair is the main UScriptCode (Script value)
109       followed by the index of the Script_Extensions list.
110       If the propsVectors[] column 0 value indicates that there are Script_Extensions,
111       then the script-code-or-index bit fields are an index to either a list or a pair in SCX,
112       rather than the Script itself. The high bits in the UPROPS_SCRIPT_X_MASK fields
113       indicate whether the main Script value is Common or Inherited (and the index is to a list)
114       vs. another value (and the index is to a pair).
115       (See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.)
116 
117 Trie lookup and properties:
118 
119 In order to condense the data for the 21-bit code space, several properties of
120 the Unicode code assignment are exploited:
121 - The code space is sparse.
122 - There are several 10k of consecutive codes with the same properties.
123 - Characters and scripts are allocated in groups of 16 code points.
124 - Inside blocks for scripts the properties are often repetitive.
125 - The 21-bit space is not fully used for Unicode.
126 
The lookup of properties for a given code point is done with a trie lookup,
using the UTrie2 implementation (UTrie before format version 7).
129 The trie lookup result is a 16-bit properties word.
130 
131 With a given Unicode code point
132 
133     UChar32 c;
134 
135 and 0<=c<0x110000, the lookup is done like this:
136 
    uint16_t props;
    props=UTRIE2_GET16(trie, c);
139 
140 Each 16-bit properties word contains:
141 
142  0.. 4  general category
143      5  reserved
144  6..15  numeric type and value (ntv)
145 
146 Encoding of numeric type and value in the 10-bit ntv field:
147     ntv             type            value
148     0               U_NT_NONE       0
149     1..10           U_NT_DECIMAL    0..9
150     11..20          U_NT_DIGIT      0..9
151     21..0x3ff       U_NT_NUMERIC    see below
152 
153     For U_NT_NUMERIC:
154     ntv             value
155     21..0xaf        integer     0..154
156     0xb0..0x1df     fraction    ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16
157     0x1e0..0x2ff    large int   ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
158                     (only one significant decimal digit)
159     0x300..0x323    base-60 (sexagesimal) integer (new in format version 7.1)
160                                 ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
161     0x324..0x34b    fraction-20 (new in format version 7.3)
162                                 frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
163                                 numerator: num = 2*(frac20&3)+1
164                                 denominator: den = 20<<(frac20>>2)
165     0x34c..0x35b    fraction-32 (new in format version 7.6)
166                                 frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256
167                                 numerator: num = 2*(frac32&3)+1
168                                 denominator: den = 32<<(frac32>>2)
169     0x35c..0x3ff    reserved
170 
171 --- Additional properties (new in format version 2.1) ---
172 
The second trie for additional properties (AT) is also a UTrie2 with 16-bit data.
174 The data words consist of 32-bit unit indexes (not row indexes!) into the
175 table of unique properties vectors (PV).
176 Each vector contains a set of properties.
177 The width of a vector (number of uint32_t per row) may change
178 with the formatVersion, it is stored in i5.
179 
180 Current properties: see icu/source/common/uprops.h
181 
182 --- Changes in format version 3.1 ---
183 
184 See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
185 
186 --- Changes in format version 3.2 ---
187 
188 - The tries use linear Latin-1 ranges.
189 - The additional properties bits store full properties XYZ instead
190   of partial Other_XYZ, so that changes in the derivation formulas
191   need not be tracked in runtime library code.
192 - Joining Type and Line Break are also stored completely, so that uprops.c
193   needs no runtime formulas for enumerated properties either.
194 - Store the case-sensitive flag in the main properties word.
195 - i10 also contains U_LB_COUNT and U_EA_COUNT.
196 - i11 contains maxValues2 for vector word 2.
197 
198 --- Changes in format version 4 ---
199 
200 The format changes between version 3 and 4 because the properties related to
201 case mappings and bidi/shaping are pulled out into separate files
202 for modularization.
203 In order to reduce the need for code changes, some of the previous data
204 structures are omitted, rather than rearranging everything.
205 
206 (The change to format version 4 is for ICU 3.4. The last CVS revision of
207 genprops/store.c for format version 3.2 is 1.48.)
208 
209 The main trie's data is significantly simplified:
210 - The trie's 16-bit data word is used directly instead of as an index
211   into props32[].
212 - The trie uses the default trie folding functions instead of custom ones.
213 - Numeric values are stored directly in the trie data word, with special
214   encodings.
215 - No more exception data (the data that needed it was pulled out, or, in the
216   case of numeric values, encoded differently).
217 - No more string data (pulled out - was for case mappings).
218 
219 Also, some of the previously used properties vector bits are reserved again.
220 
221 The indexes[] values for the omitted structures are still filled in
222 (indicating zero-length arrays) so that the swapper code remains unchanged.
223 
224 --- Changes in format version 5 ---
225 
226 Format version 5 became necessary because the bit field for script codes
227 overflowed. The changes are incompatible because
228 old code would have seen nonsensically low values for new, higher script codes.
229 
230 Rearranged bit fields in the second trie (AT) and widened three (Script, Block,
231 Word_Break) by one bit each.
232 
233 Modified bit fields in icu/source/common/uprops.h
234 
235 --- Changes in format version 6 ---
236 
237 Format version 6 became necessary because Unicode 5.2 adds fractions with
238 denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
239 types and values rather than add another variant to the previous format.
240 
241 --- Changes in format version 7 ---
242 
243 Unicode 6.0 adds Script_Extensions. For characters with script extensions data,
244 the script code bits are an index into the new Script_Extensions array rather
245 than a script code.
246 
247 Change from UTrie to UTrie2.
248 
249 --- Changes in format version 7.1 ---
250 
251 Unicode 6.2 adds sexagesimal (base-60) numeric values:
252     cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000
253     cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000
254 
255 The encoding of numeric values was extended to handle such values.
256 
257 --- Changes in format version 7.2 ---
258 
259 ICU 57 adds 4 Emoji properties to vector word 2.
260 https://unicode-org.atlassian.net/browse/ICU-11802
261 http://www.unicode.org/reports/tr51/#Emoji_Properties
262 
263 --- Changes in format version 7.3 ---
264 
265 ICU 58 adds fraction-20 numeric values for new Unicode 9 Malayalam fraction characters.
266 
267 --- Changes in format version 7.4 ---
268 
269 ICU 60 adds the Prepended_Concatenation_Mark property to vector word 1.
270 
271 ICU 60 adds the Emoji_Component property to vector word 2, for emoji 5.
272 https://unicode-org.atlassian.net/browse/ICU-13062
273 http://www.unicode.org/reports/tr51/#Emoji_Properties
274 
275 --- Changes in format version 7.5 ---
276 
277 ICU 62 adds the Extended_Pictographic property to vector word 2, for emoji 11.
278 http://www.unicode.org/reports/tr51/#Emoji_Properties
279 
280 --- Changes in format version 7.6 ---
281 
282 ICU 64 adds fraction-32 numeric values for new Unicode 12 Tamil fraction characters.
283 
284 --- Changes in format version 7.7 ---
285 
286 ICU 66 adds two bits for the UScriptCode or Script_Extensions index in vector word 0.
287 The value is split across bits 21..20 & 7..0.
288 
289 --- Changes in format version 7.8 ---
290 
291 ICU 70 moves the emoji properties from uprops.icu to (new) uemoji.icu.
292 The 6 bits in vector word 2 that stored emoji properties are unused again.
293 
294 ----------------------------------------------------------------------------- */
295 
296 U_NAMESPACE_USE
297 
298 /* UDataInfo cf. udata.h */
299 static UDataInfo dataInfo={
300     sizeof(UDataInfo),
301     0,
302 
303     U_IS_BIG_ENDIAN,
304     U_CHARSET_FAMILY,
305     U_SIZEOF_UCHAR,
306     0,
307 
308     { 0x55, 0x50, 0x72, 0x6f },                 /* dataFormat="UPro" */
309     { 7, 8, 0, 0 },                             /* formatVersion */
310     { 14, 0, 0, 0 }                             /* dataVersion */
311 };
312 
splitScriptCodeOrIndex(uint32_t v)313 inline uint32_t splitScriptCodeOrIndex(uint32_t v) {
314     return
315         ((v << UPROPS_SCRIPT_HIGH_SHIFT) & UPROPS_SCRIPT_HIGH_MASK) |
316         (v & UPROPS_SCRIPT_LOW_MASK);
317 }
318 
319 class CorePropsBuilder : public PropsBuilder {
320 public:
321     CorePropsBuilder(UErrorCode &errorCode);
322     virtual ~CorePropsBuilder();
323 
324     virtual void setUnicodeVersion(const UVersionInfo version);
325     virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
326     virtual void build(UErrorCode &errorCode);
327     virtual void writeCSourceFile(const char *path, UErrorCode &errorCode);
328     virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);
329 
330 private:
331     void setGcAndNumeric(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
332 
333     UTrie2 *pTrie;
334     UTrie2 *props2Trie;
335     UPropsVectors *pv;
336     UnicodeString scriptExtensions;
337 };
338 
CorePropsBuilder(UErrorCode & errorCode)339 CorePropsBuilder::CorePropsBuilder(UErrorCode &errorCode)
340         : pTrie(NULL), props2Trie(NULL), pv(NULL) {
341     pTrie=utrie2_open(0, 0, &errorCode);
342     if(U_FAILURE(errorCode)) {
343         fprintf(stderr, "genprops error: corepropsbuilder utrie2_open() failed - %s\n",
344                 u_errorName(errorCode));
345     }
346     pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
347     if(U_FAILURE(errorCode)) {
348         fprintf(stderr, "genprops error: corepropsbuilder upvec_open() failed - %s\n",
349                 u_errorName(errorCode));
350     }
351 }
352 
~CorePropsBuilder()353 CorePropsBuilder::~CorePropsBuilder() {
354     utrie2_close(pTrie);
355     utrie2_close(props2Trie);
356     upvec_close(pv);
357 }
358 
359 void
setUnicodeVersion(const UVersionInfo version)360 CorePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
361     uprv_memcpy(dataInfo.dataVersion, version, 4);
362 }
363 
encodeFractional20(int32_t value,int32_t den)364 static int32_t encodeFractional20(int32_t value, int32_t den) {
365     if(den<20 || 640<den) { return -1; }
366     int32_t frac20;
367     if(value==1 || value==3 || value==5 || value==7) {
368         frac20=value/2;
369     } else {
370         return -1;
371     }
372     // Denominator: 20 times which power of 2: 0..5 into bits 4..2
373     do {
374         if(den==20) {
375             return UPROPS_NTV_FRACTION20_START+frac20;
376         }
377         if(den&1) {
378             return -1;  // odd denominator, and we would lose the low bit in den/=2
379         }
380         den/=2;
381         frac20+=4;
382     } while(den>=20);
383     return -1;
384 }
385 
encodeFractional32(int32_t value,int32_t den)386 static int32_t encodeFractional32(int32_t value, int32_t den) {
387     if(den<32 || 256<den) { return -1; }
388     int32_t frac32;
389     if(value==1 || value==3 || value==5 || value==7) {
390         frac32=value/2;
391     } else {
392         return -1;
393     }
394     // Denominator: 32 times which power of 2: 0..3 into bits 3..2
395     do {
396         if(den==32) {
397             return UPROPS_NTV_FRACTION32_START+frac32;
398         }
399         if(den&1) {
400             return -1;  // odd denominator, and we would lose the low bit in den/=2
401         }
402         den/=2;
403         frac32+=4;
404     } while(den>=32);
405     return -1;
406 }
407 
408 // For nt=U_NT_NUMERIC.
409 static int32_t
encodeNumericValue(UChar32 start,const char * s,UErrorCode & errorCode)410 encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
411     const char *original=s;
412     /* get a possible minus sign */
413     UBool isNegative;
414     if(*s=='-') {
415         isNegative=true;
416         ++s;
417     } else {
418         isNegative=false;
419     }
420 
421     int32_t value=0, den=0, exp=0, ntv=0;
422     char *numberLimit;
423     /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
424     if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
425         value=s[0]-'0';
426         numberLimit=const_cast<char *>(s);
427         while(*(++numberLimit)=='0') {
428             ++exp;
429         }
430     } else {
431         /* normal number parsing */
432         unsigned long ul=uprv_strtoul(s, &numberLimit, 10);
433         if(s==numberLimit || (*numberLimit!=0 && *numberLimit!='/') || ul>0x7fffffff) {
434             ntv=-1;
435         } else {
436             value=(int32_t)ul;
437         }
438         if(ntv>=0 && *numberLimit=='/') {
439             /* fractional value, get the denominator */
440             s=numberLimit+1;
441             ul=uprv_strtoul(s, &numberLimit, 10);
442             if(s==numberLimit || *numberLimit!=0 || ul==0 || ul>0x7fffffff) {
443                 ntv=-1;
444             } else {
445                 den=(int32_t)ul;
446             }
447         }
448     }
449     if(isNegative) {
450         value=-(int32_t)value;
451     }
452 
453     if(ntv<0) {
454         // pass
455     } else if(den==0 && value>=0) {
456         if(exp==2 && (value*100)<=UPROPS_NTV_MAX_SMALL_INT) {
457             /* small integer parsed like a large one */
458             ntv=UPROPS_NTV_NUMERIC_START+value*100;
459         } else if(exp==0) {
460             if(value<=UPROPS_NTV_MAX_SMALL_INT) {
461                 /* small integer */
462                 ntv=UPROPS_NTV_NUMERIC_START+value;
463             } else {
464                 /* large integer parsed like a small one */
465                 /* split the value into mantissa and exponent, base 10 */
466                 int32_t mant=value;
467                 while((mant%10)==0) {
468                     mant/=10;
469                     ++exp;
470                 }
471                 // Note: value<=0x7fffffff guarantees exp<=33
472                 if(mant<=9) {
473                     ntv=((mant+14)<<5)+(exp-2);
474                 } else {
475                     // Try sexagesimal (base 60) numbers.
476                     mant=value;
477                     exp=0;
478                     while((mant%60)==0) {
479                         mant/=60;
480                         ++exp;
481                     }
482                     if(mant<=9 && exp<=4) {
483                         ntv=((mant+0xbf)<<2)+(exp-1);
484                     } else {
485                         ntv=-1;
486                     }
487                 }
488             }
489         } else if(2<=exp && exp<=33 && 1<=value && value<=9) {
490             /* large, single-significant-digit integer */
491             ntv=((value+14)<<5)+(exp-2);
492         } else {
493             ntv=-1;
494         }
495     } else if(exp==0 && -1<=value && value<=17 && 1<=den && den<=16) {
496         /* fraction */
497         ntv=((value+12)<<4)+(den-1);
498     } else if(exp==0 && value==-1 && den==0) {
499         /* -1 parsed with den=0, encoded as pseudo-fraction -1/1 */
500         ntv=((value+12)<<4);
501     } else if(exp==0 && (ntv=encodeFractional20(value, den))>=0) {
502         // fits into fractional-20 format
503     } else if(exp==0 && (ntv=encodeFractional32(value, den))>=0) {
504         // fits into fractional-32 format
505     } else {
506         ntv=-1;
507     }
508     if(ntv<0 || *numberLimit!=0) {
509         fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", original);
510         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
511     }
512     return ntv;
513 }
514 
515 void
setGcAndNumeric(const UniProps & props,const UnicodeSet & newValues,UErrorCode & errorCode)516 CorePropsBuilder::setGcAndNumeric(const UniProps &props, const UnicodeSet &newValues,
517                                   UErrorCode &errorCode) {
518     if(U_FAILURE(errorCode)) { return; }
519     UChar32 start=props.start;
520     UChar32 end=props.end;
521 
522     int32_t type=props.getIntProp(UCHAR_NUMERIC_TYPE);
523     const char *nvString=props.numericValue;
524     if(type!=U_NT_NONE && nvString==NULL && start==end) {
525         fprintf(stderr, "genprops error: cp line has Numeric_Type but no Numeric_Value\n");
526         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
527         return;
528     }
529 
530     if(!newValues.contains(UCHAR_GENERAL_CATEGORY) && !newValues.contains(UCHAR_NUMERIC_VALUE)) {
531         return;
532     }
533 
534     int32_t ntv=UPROPS_NTV_NONE;  // numeric type & value
535     if(nvString!=NULL && uprv_strcmp(nvString, "NaN")!=0) {
536         int32_t digitValue=props.digitValue;
537         if( type<=U_NT_NONE || U_NT_NUMERIC<type ||
538             ((type==U_NT_DECIMAL || type==U_NT_DIGIT) && digitValue<0)
539         ) {
540             fprintf(stderr, "genprops error: nt=%d but nv=%s\n",
541                     (int)type, nvString==NULL ? "NULL" : nvString);
542             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
543             return;
544         }
545 
546         switch(type) {
547         case U_NT_NONE:
548             ntv=UPROPS_NTV_NONE;
549             break;
550         case U_NT_DECIMAL:
551             ntv=UPROPS_NTV_DECIMAL_START+digitValue;
552             break;
553         case U_NT_DIGIT:
554             ntv=UPROPS_NTV_DIGIT_START+digitValue;
555             break;
556         case U_NT_NUMERIC:
557             if(digitValue>=0) {
558                 ntv=UPROPS_NTV_NUMERIC_START+digitValue;
559             } else {
560                 ntv=encodeNumericValue(start, nvString, errorCode);
561                 if(U_FAILURE(errorCode)) {
562                     return;
563                 }
564             }
565         default:
566             break;  // unreachable
567         }
568     }
569 
570     uint32_t value=
571         (uint32_t)props.getIntProp(UCHAR_GENERAL_CATEGORY) |
572         (ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
573     if(start==end) {
574         utrie2_set32(pTrie, start, value, &errorCode);
575     } else {
576         utrie2_setRange32(pTrie, start, end, value, true, &errorCode);
577     }
578     if(U_FAILURE(errorCode)) {
579         fprintf(stderr, "error: utrie2_setRange32(properties trie %04lX..%04lX) failed - %s\n",
580                 (long)start, (long)end, u_errorName(errorCode));
581     }
582 }
583 
// Table row type: where the bit for one binary property is stored.
struct PropToBinary {
    int32_t prop;      // UProperty
    int32_t vecWord;   // properties vector word that holds the bit
    int32_t vecShift;  // bit number inside that word
};
588 
589 static const PropToBinary
590 propToBinaries[]={
591     { UCHAR_WHITE_SPACE,                    1, UPROPS_WHITE_SPACE },
592     { UCHAR_DASH,                           1, UPROPS_DASH },
593     // Note: The Hyphen property is stabilized since Unicode 4.0
594     // and deprecated since Unicode 6.0.
595     { UCHAR_HYPHEN,                         1, UPROPS_HYPHEN },
596     { UCHAR_QUOTATION_MARK,                 1, UPROPS_QUOTATION_MARK },
597     { UCHAR_TERMINAL_PUNCTUATION,           1, UPROPS_TERMINAL_PUNCTUATION },
598     // Note: The Hex_Digit and ASCII_Hex_Digit properties are probably stable enough
599     // so that they could be hardcoded.
600     { UCHAR_HEX_DIGIT,                      1, UPROPS_HEX_DIGIT },
601     { UCHAR_ASCII_HEX_DIGIT,                1, UPROPS_ASCII_HEX_DIGIT },
602     { UCHAR_IDEOGRAPHIC,                    1, UPROPS_IDEOGRAPHIC },
603     { UCHAR_DIACRITIC,                      1, UPROPS_DIACRITIC },
604     { UCHAR_EXTENDER,                       1, UPROPS_EXTENDER },
605     // Note: The Noncharacter_Code_Point property is probably stable enough
606     // so that it could be hardcoded.
607     { UCHAR_NONCHARACTER_CODE_POINT,        1, UPROPS_NONCHARACTER_CODE_POINT },
608     // Note: The Grapheme_Link property is deprecated since Unicode 5.0
609     // because it is a "Duplication of ccc=9" (UAX #44).
610     { UCHAR_GRAPHEME_LINK,                  1, UPROPS_GRAPHEME_LINK },
611     { UCHAR_IDS_BINARY_OPERATOR,            1, UPROPS_IDS_BINARY_OPERATOR },
612     { UCHAR_IDS_TRINARY_OPERATOR,           1, UPROPS_IDS_TRINARY_OPERATOR },
613     { UCHAR_RADICAL,                        1, UPROPS_RADICAL },
614     { UCHAR_UNIFIED_IDEOGRAPH,              1, UPROPS_UNIFIED_IDEOGRAPH },
615     { UCHAR_DEPRECATED,                     1, UPROPS_DEPRECATED },
616     { UCHAR_LOGICAL_ORDER_EXCEPTION,        1, UPROPS_LOGICAL_ORDER_EXCEPTION },
617     { UCHAR_S_TERM,                         1, UPROPS_S_TERM },
618     { UCHAR_VARIATION_SELECTOR,             1, UPROPS_VARIATION_SELECTOR },
619     // Note: Pattern_Syntax & Pattern_White_Space are available via
620     // the internal PatternProps class and need not be stored here any more.
621     { UCHAR_PATTERN_SYNTAX,                 1, UPROPS_PATTERN_SYNTAX },
622     { UCHAR_PATTERN_WHITE_SPACE,            1, UPROPS_PATTERN_WHITE_SPACE },
623     { UCHAR_XID_START,                      1, UPROPS_XID_START },
624     { UCHAR_XID_CONTINUE,                   1, UPROPS_XID_CONTINUE },
625     { UCHAR_MATH,                           1, UPROPS_MATH },
626     { UCHAR_ALPHABETIC,                     1, UPROPS_ALPHABETIC },
627     { UCHAR_GRAPHEME_EXTEND,                1, UPROPS_GRAPHEME_EXTEND },
628     { UCHAR_DEFAULT_IGNORABLE_CODE_POINT,   1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
629     { UCHAR_ID_START,                       1, UPROPS_ID_START },
630     { UCHAR_ID_CONTINUE,                    1, UPROPS_ID_CONTINUE },
631     { UCHAR_GRAPHEME_BASE,                  1, UPROPS_GRAPHEME_BASE },
632 
633     { UCHAR_PREPENDED_CONCATENATION_MARK,   1, UPROPS_PREPENDED_CONCATENATION_MARK },
634 };
635 
// Table row type: where the bit field for one enumerated property is stored.
struct PropToEnum {
    int32_t prop;      // UProperty
    int32_t vecWord;   // properties vector word that holds the field
    int32_t vecShift;  // left shift that moves the value into the field
    uint32_t vecMask;  // mask of the field's bits inside the word
};
641 
642 static const PropToEnum
643 propToEnums[]={
644     { UCHAR_BLOCK,                      0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK },
645     { UCHAR_EAST_ASIAN_WIDTH,           0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
646     { UCHAR_DECOMPOSITION_TYPE,         2, 0, UPROPS_DT_MASK },
647     { UCHAR_GRAPHEME_CLUSTER_BREAK,     2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
648     { UCHAR_WORD_BREAK,                 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
649     { UCHAR_SENTENCE_BREAK,             2, UPROPS_SB_SHIFT, UPROPS_SB_MASK },
650     { UCHAR_LINE_BREAK,                 2, UPROPS_LB_SHIFT, UPROPS_LB_MASK },
651 };
652 
653 void
setProps(const UniProps & props,const UnicodeSet & newValues,UErrorCode & errorCode)654 CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
655                            UErrorCode &errorCode) {
656     setGcAndNumeric(props, newValues, errorCode);
657     if(U_FAILURE(errorCode)) { return; }
658 
659     UChar32 start=props.start;
660     UChar32 end=props.end;
661     if(start==0 && end==0x10ffff) {
662         // Also set bits for initialValue and errorValue.
663         end=UPVEC_MAX_CP;
664     }
665 
666     if(newValues.containsSome(0, UCHAR_BINARY_LIMIT-1)) {
667         for(int32_t i=0; i<LENGTHOF(propToBinaries); ++i) {
668             const PropToBinary &p2b=propToBinaries[i];
669             U_ASSERT(p2b.vecShift<32);
670             if(newValues.contains(p2b.prop)) {
671                 uint32_t mask=U_MASK(p2b.vecShift);
672                 uint32_t value= props.binProps[p2b.prop] ? mask : 0;
673                 upvec_setValue(pv, start, end, p2b.vecWord, value, mask, &errorCode);
674             }
675         }
676     }
677 
678     // Set int property values.
679     if(newValues.containsSome(UCHAR_INT_START, UCHAR_INT_LIMIT-1)) {
680         for(int32_t i=0; i<LENGTHOF(propToEnums); ++i) {
681             const PropToEnum &p2e=propToEnums[i];
682             U_ASSERT(p2e.vecShift<32);
683             if(newValues.contains(p2e.prop)) {
684                 uint32_t mask=p2e.vecMask;
685                 uint32_t value=(uint32_t)(props.getIntProp(p2e.prop)<<p2e.vecShift);
686                 U_ASSERT((value&mask)==value);
687                 upvec_setValue(pv, start, end, p2e.vecWord, value, mask, &errorCode);
688             }
689         }
690     }
691     if(newValues.contains(UCHAR_AGE)) {
692         if(props.age[0]>15 || props.age[1]>15 || props.age[2]!=0 || props.age[3]!=0) {
693             char buffer[U_MAX_VERSION_STRING_LENGTH];
694             u_versionToString(props.age, buffer);
695             fprintf(stderr, "genprops error: age %s cannot be encoded\n", buffer);
696             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
697             return;
698         }
699         uint32_t version=(props.age[0]<<4)|props.age[1];
700         upvec_setValue(pv, start, end,
701                        0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK,
702                        &errorCode);
703     }
704 
705     // Set the script value if the Script_Extensions revert to {Script}.
706     // Otherwise we would have to duplicate the code for doing so.
707     // Script and Script_Extensions share a bit field, so that by setting it to just the script
708     // we remove the Script_Extensions.
709     // (We do not just set the script bit in newValues because that is const.)
710     // For example, for U+3000:
711     // block;3000..303F;age=1.1;...;sc=Zyyy;scx=Bopo Hang Hani Hira Kana Yiii;vo=U
712     // cp;3000;...;gc=Zs;lb=BA;na=IDEOGRAPHIC SPACE;...;SB=SP;scx=<script>;WSpace
713     UBool revertToScript=
714         newValues.contains(UCHAR_SCRIPT_EXTENSIONS) && props.scx.isEmpty() &&
715         !newValues.contains(UCHAR_SCRIPT);
716     if(newValues.contains(UCHAR_SCRIPT) || revertToScript) {
717         int32_t script=props.getIntProp(UCHAR_SCRIPT);
718         uint32_t value=splitScriptCodeOrIndex(script);
719         // Use UPROPS_SCRIPT_X_MASK:
720         // When writing a Script code, remove Script_Extensions bits as well.
721         // If needed, they will get written again.
722         upvec_setValue(pv, start, end, 0, value, UPROPS_SCRIPT_X_MASK, &errorCode);
723     }
724     // Write a new (Script, Script_Extensions) value if there are Script_Extensions
725     // and either Script or Script_Extensions are new on the current line.
726     // (If only Script is new, then it just clobbered the relevant bits.)
727     if( !props.scx.isEmpty() &&
728         (newValues.contains(UCHAR_SCRIPT) || newValues.contains(UCHAR_SCRIPT_EXTENSIONS))
729     ) {
730         UnicodeString codes;  // vector of 16-bit UScriptCode values
731         UnicodeSetIterator iter(props.scx);
732         while(iter.next()) { codes.append((UChar)iter.getCodepoint()); }
733 
734         // Set bit 15 on the last script code, for termination.
735         int32_t length=codes.length();
736         codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000));
737         // Find this list of codes in the Script_Extensions data so far, or add this list.
738         int32_t index=scriptExtensions.indexOf(codes);
739         if(index<0) {
740             index=scriptExtensions.length();
741             scriptExtensions.append(codes);
742         }
743 
744         // Encode the (Script, Script_Extensions index) pair.
745         int32_t script=props.getIntProp(UCHAR_SCRIPT);
746         uint32_t scriptX;
747         if(script==USCRIPT_COMMON) {
748             scriptX=UPROPS_SCRIPT_X_WITH_COMMON;
749         } else if(script==USCRIPT_INHERITED) {
750             scriptX=UPROPS_SCRIPT_X_WITH_INHERITED;
751         } else {
752             // Store an additional pair of 16-bit units for an unusual main Script code
753             // together with the Script_Extensions index.
754             UnicodeString codeIndexPair;
755             codeIndexPair.append((UChar)script).append((UChar)index);
756             index=scriptExtensions.indexOf(codeIndexPair);
757             if(index<0) {
758                 index=scriptExtensions.length();
759                 scriptExtensions.append(codeIndexPair);
760             }
761             scriptX=UPROPS_SCRIPT_X_WITH_OTHER;
762         }
763         if(index>UPROPS_MAX_SCRIPT) {
764             fprintf(stderr, "genprops: Script_Extensions indexes overflow bit fields\n");
765             errorCode=U_BUFFER_OVERFLOW_ERROR;
766             return;
767         }
768         scriptX|=splitScriptCodeOrIndex(index);
769         upvec_setValue(pv, start, end, 0, scriptX, UPROPS_SCRIPT_X_MASK, &errorCode);
770     }
771     if(U_FAILURE(errorCode)) {
772         fprintf(stderr, "genprops error: unable to set props2 values for %04lX..%04lX: %s\n",
773                 (long)start, (long)end, u_errorName(errorCode));
774     }
775 }
776 
777 static int32_t indexes[UPROPS_INDEX_COUNT]={
778     0, 0, 0, 0,
779     0, 0, 0, 0,
780     0, 0, 0, 0,
781     0, 0, 0, 0
782 };
783 
784 static uint8_t trieBlock[100000];
785 static int32_t trieSize;
786 static uint8_t props2TrieBlock[100000];
787 static int32_t props2TrieSize;
788 
789 static int32_t totalSize;
790 
791 void
build(UErrorCode & errorCode)792 CorePropsBuilder::build(UErrorCode &errorCode) {
793     if(U_FAILURE(errorCode)) { return; }
794 
795     utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
796     if(U_FAILURE(errorCode)) {
797         fprintf(stderr,
798                 "genprops/core error: utrie2_freeze(main trie) failed: %s\n",
799                 u_errorName(errorCode));
800         return;
801     }
802     trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
803     if(U_FAILURE(errorCode)) {
804         fprintf(stderr,
805                 "genprops/core error: utrie2_serialize(main trie) failed: %s (length %ld)\n",
806                 u_errorName(errorCode), (long)trieSize);
807         return;
808     }
809 
810     props2Trie=upvec_compactToUTrie2WithRowIndexes(pv, &errorCode);
811     if(U_FAILURE(errorCode)) {
812         fprintf(stderr, "genprops/core error: unable to build trie for additional properties: %s\n",
813                 u_errorName(errorCode));
814         return;
815     }
816 
817     props2TrieSize=utrie2_serialize(props2Trie,
818                                     props2TrieBlock, (int32_t)sizeof(props2TrieBlock),
819                                     &errorCode);
820     if(U_FAILURE(errorCode)) {
821         fprintf(stderr,
822                 "genprops/core error: utrie2_freeze(additional properties)+utrie2_serialize() "
823                 "failed: %s\n",
824                 u_errorName(errorCode));
825         return;
826     }
827 
828     int32_t pvRows;
829     upvec_getArray(pv, &pvRows, NULL);
830     int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
831 
832     /* round up scriptExtensions to multiple of 4 bytes */
833     if(scriptExtensions.length()&1) {
834         scriptExtensions.append((UChar)0);
835     }
836 
837     /* set indexes */
838     int32_t offset=sizeof(indexes)/4;       /* uint32_t offset to the properties trie */
839     offset+=trieSize>>2;
840     indexes[UPROPS_PROPS32_INDEX]=          /* set indexes to the same offsets for empty */
841     indexes[UPROPS_EXCEPTIONS_INDEX]=       /* structures from the old format version 3 */
842     indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=   /* so that less runtime code has to be changed */
843     indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
844 
845     offset+=props2TrieSize/4;
846     indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=offset;
847     indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
848     offset+=pvCount;
849     indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=offset;
850     offset+=scriptExtensions.length()/2;
851     indexes[UPROPS_RESERVED_INDEX_7]=offset;
852     indexes[UPROPS_RESERVED_INDEX_8]=offset;
853     indexes[UPROPS_DATA_TOP_INDEX]=offset;
854     totalSize=4*offset;
855 
856     indexes[UPROPS_MAX_VALUES_INDEX]=
857         (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
858         (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
859         (int32_t)splitScriptCodeOrIndex(USCRIPT_CODE_LIMIT-1);
860     indexes[UPROPS_MAX_VALUES_2_INDEX]=
861         (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
862         (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
863         (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
864         (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
865         ((int32_t)U_DT_COUNT-1);
866 
867     if(!beQuiet) {
868         puts("* uprops.icu stats *");
869         printf("trie size in bytes:                    %5u\n", (int)trieSize);
870         printf("size in bytes of additional props trie:%5u\n", (int)props2TrieSize);
871         printf("number of additional props vectors:    %5u\n", (int)pvRows);
872         printf("number of 32-bit words per vector:     %5u\n", UPROPS_VECTOR_WORDS);
873         printf("number of 16-bit scriptExtensions:     %5u\n", (int)scriptExtensions.length());
874         printf("data size:                            %6ld\n", (long)totalSize);
875     }
876 }
877 
878 void
writeCSourceFile(const char * path,UErrorCode & errorCode)879 CorePropsBuilder::writeCSourceFile(const char *path, UErrorCode &errorCode) {
880     if(U_FAILURE(errorCode)) { return; }
881 
882     int32_t pvRows;
883     const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
884     int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
885 
886     FILE *f=usrc_create(path, "uchar_props_data.h", 2016,
887                         "icu/tools/unicode/c/genprops/corepropsbuilder.cpp");
888     if(f==NULL) {
889         errorCode=U_FILE_ACCESS_ERROR;
890         return;
891     }
892     fputs("#ifdef INCLUDED_FROM_UCHAR_C\n\n", f);
893     usrc_writeArray(f,
894         "static const UVersionInfo dataVersion={",
895         dataInfo.dataVersion, 8, 4,
896         "",
897         "};\n\n");
898     usrc_writeUTrie2Arrays(f,
899         "static const uint16_t propsTrie_index[%ld]={\n", NULL,
900         pTrie,
901         "\n};\n\n");
902     usrc_writeUTrie2Struct(f,
903         "static const UTrie2 propsTrie={\n",
904         pTrie, "propsTrie_index", NULL,
905         "};\n\n");
906 
907     usrc_writeUTrie2Arrays(f,
908         "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
909         props2Trie,
910         "\n};\n\n");
911     usrc_writeUTrie2Struct(f,
912         "static const UTrie2 propsVectorsTrie={\n",
913         props2Trie, "propsVectorsTrie_index", NULL,
914         "};\n\n");
915 
916     usrc_writeArray(f,
917         "static const uint32_t propsVectors[%ld]={\n",
918         pvArray, 32, pvCount,
919         "",
920         "};\n\n");
921     fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
922     fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)UPROPS_VECTOR_WORDS);
923 
924     usrc_writeArray(f,
925         "static const uint16_t scriptExtensions[%ld]={\n",
926         scriptExtensions.getBuffer(), 16, scriptExtensions.length(),
927         "",
928         "};\n\n");
929 
930     usrc_writeArray(f,
931         "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
932         indexes, 32, UPROPS_INDEX_COUNT,
933         "",
934         "};\n\n");
935     fputs("#endif  // INCLUDED_FROM_UCHAR_C\n", f);
936     fclose(f);
937 }
938 
939 void
writeBinaryData(const char * path,UBool withCopyright,UErrorCode & errorCode)940 CorePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
941     if(U_FAILURE(errorCode)) { return; }
942 
943     int32_t pvRows;
944     const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
945     int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
946 
947     UNewDataMemory *pData=udata_create(path, "icu", "uprops", &dataInfo,
948                                        withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
949     if(U_FAILURE(errorCode)) {
950         fprintf(stderr, "genprops: udata_create(%s, uprops.icu) failed - %s\n",
951                 path, u_errorName(errorCode));
952         return;
953     }
954 
955     udata_writeBlock(pData, indexes, sizeof(indexes));
956     udata_writeBlock(pData, trieBlock, trieSize);
957     udata_writeBlock(pData, props2TrieBlock, props2TrieSize);
958     udata_writeBlock(pData, pvArray, pvCount*4);
959     udata_writeBlock(pData, scriptExtensions.getBuffer(), scriptExtensions.length()*2);
960 
961     long dataLength=udata_finish(pData, &errorCode);
962     if(U_FAILURE(errorCode)) {
963         fprintf(stderr, "genprops: error %s writing the output file\n", u_errorName(errorCode));
964         return;
965     }
966 
967     if(dataLength!=(long)totalSize) {
968         fprintf(stderr,
969                 "udata_finish(uprops.icu) reports %ld bytes written but should be %ld\n",
970                 dataLength, (long)totalSize);
971         errorCode=U_INTERNAL_PROGRAM_ERROR;
972     }
973 }
974 
975 PropsBuilder *
createCorePropsBuilder(UErrorCode & errorCode)976 createCorePropsBuilder(UErrorCode &errorCode) {
977     if(U_FAILURE(errorCode)) { return NULL; }
978     PropsBuilder *pb=new CorePropsBuilder(errorCode);
979     if(pb==NULL) {
980         errorCode=U_MEMORY_ALLOCATION_ERROR;
981     }
982     return pb;
983 }
984 
985 /*
986  * Hey, Emacs, please set the following:
987  *
988  * Local Variables:
989  * indent-tabs-mode: nil
990  * End:
991  *
992  */
993