• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2000-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  genuca.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created at the end of XX century
*   created by: Vladimir Weinstein,
*   modified in 2013-2014 by Markus Scherer
*
*   This program reads the Fractional UCA table and generates
*   internal format for UCA table as well as inverse UCA table.
*   It then writes the ucadata.icu binary file containing the data.
*/
23 
24 #define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1
25 
26 #include <stdio.h>
27 #include "unicode/utypes.h"
28 #include "unicode/errorcode.h"
29 #include "unicode/localpointer.h"
30 #include "unicode/ucol.h"
31 #include "unicode/uscript.h"
32 #include "unicode/utf8.h"
33 #include "charstr.h"
34 #include "cmemory.h"
35 #include "collation.h"
36 #include "collationbasedatabuilder.h"
37 #include "collationdata.h"
38 #include "collationdatabuilder.h"
39 #include "collationdatareader.h"
40 #include "collationdatawriter.h"
41 #include "collationinfo.h"
42 #include "collationrootelements.h"
43 #include "collationruleparser.h"
44 #include "collationtailoring.h"
45 #include "cstring.h"
46 #include "normalizer2impl.h"
47 #include "toolutil.h"
48 #include "unewdata.h"
49 #include "uoptions.h"
50 #include "uparse.h"
51 #include "writesrc.h"
52 
53 #if UCONFIG_NO_COLLATION
54 
// Stub entry point used when collation is configured out (UCONFIG_NO_COLLATION):
// ignores its arguments and exits with status 1.
extern "C" int
main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;
    return 1;
}
61 
62 #else
63 
64 U_NAMESPACE_USE
65 
// Which ordering of Han characters, if any, is passed on to the data builder.
enum HanOrderValue {
    HAN_NO_ORDER = -1,      // neither order is handed to the builder
    HAN_IMPLICIT,           // use the ranges from the [Unified_Ideograph] line
    HAN_RADICAL_STROKE      // use the ranges collected from the [radical] lines
};
71 
72 static UBool beVerbose=FALSE, withCopyright=TRUE;
73 
74 static HanOrderValue hanOrder = HAN_NO_ORDER;
75 
76 static UVersionInfo UCAVersion={ 0, 0, 0, 0 };
77 
78 static UDataInfo ucaDataInfo={
79     sizeof(UDataInfo),
80     0,
81 
82     U_IS_BIG_ENDIAN,
83     U_CHARSET_FAMILY,
84     U_SIZEOF_UCHAR,
85     0,
86 
87     { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
88     { 5, 0, 0, 0 },                     // formatVersion
89     { 6, 3, 0, 0 }                      // dataVersion
90 };
91 
/**
 * Returns a pointer to the first character at or after s
 * that is neither a space nor a tab.
 * Stops at the terminating NUL, which is not white space.
 */
static char *skipWhiteSpace(char *s) {
    while(*s == ' ' || *s == '\t') { ++s; }
    return s;
}
96 
/**
 * Converts one hexadecimal digit character (0-9, a-f, A-F) to its value 0..15.
 * Returns -1 for any other character, including NUL.
 */
static int32_t hex2num(char hex) {
    if(hex>='0' && hex <='9') {
        return hex-'0';
    } else if(hex>='a' && hex<='f') {
        return hex-'a'+10;
    } else if(hex>='A' && hex<='F') {
        return hex-'A'+10;
    } else {
        return -1;
    }
}
108 
parseWeight(char * & s,const char * separators,int32_t maxBytes,UErrorCode & errorCode)109 static uint32_t parseWeight(char *&s, const char *separators,
110                             int32_t maxBytes, UErrorCode &errorCode) {
111     if(U_FAILURE(errorCode)) { return 0; }
112     uint32_t weight = 0;
113     int32_t numBytes = 0;
114     for(;;) {
115         // Check one character after another, so that we don't just run over a 00.
116         int32_t nibble1, nibble2;
117         if((nibble1 = hex2num(s[0])) < 0 || (nibble2 = hex2num(s[1])) < 0) {
118             // Stop when we find something other than a pair of hex digits.
119             break;
120         }
121         if(numBytes == maxBytes || (numBytes != 0 && nibble1 == 0 && nibble2 <= 1)) {
122             // Too many bytes, or a 00 or 01 byte which is illegal inside a weight.
123             errorCode = U_INVALID_FORMAT_ERROR;
124             return 0;
125         }
126         weight = (weight << 8) | ((uint32_t)nibble1 << 4) | (uint32_t)nibble2;
127         ++numBytes;
128         s += 2;
129         if(*s != ' ') {
130             break;
131         }
132         ++s;
133     }
134     char c = *s;
135     if(c == 0 || strchr(separators, c) == NULL) {
136         errorCode = U_INVALID_FORMAT_ERROR;
137         return 0;
138     }
139     // numBytes==0 is ok, for example in [,,] or [, 82, 05]
140     // Left-align the weight.
141     while(numBytes < 4) {
142         weight <<= 8;
143         ++numBytes;
144     }
145     return weight;
146 }
147 
148 /**
149  * Parse a CE like [0A 86, 05, 17] or [U+4E00, 10].
150  * Stop with an error, or else with the pointer s after the closing bracket.
151  */
parseCE(const CollationDataBuilder & builder,char * & s,UErrorCode & errorCode)152 static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode &errorCode) {
153     if(U_FAILURE(errorCode)) { return 0; }
154     ++s;  // skip over the '['
155     if(s[0] == 'U' && s[1] == '+') {
156         // Read a code point and look up its CE.
157         // We use this especially for implicit primary weights,
158         // so that we can use different algorithms in the FractionalUCA.txt
159         // generator and the parser.
160         // The generator may not even need to compute any implicit primaries at all.
161         s += 2;
162         char *end;
163         unsigned long longCp = uprv_strtoul(s, &end, 16);
164         if(end == s || longCp > 0x10ffff) {
165             errorCode = U_INVALID_FORMAT_ERROR;
166             return 0;
167         }
168         UChar32 c = (UChar32)longCp;
169         int64_t ce = builder.getSingleCE(c, errorCode);
170         if(U_FAILURE(errorCode)) { return 0; }
171         s = end;
172         if(*s == ']') {  // [U+4E00]
173             ++s;
174             return ce;
175         }
176         if(*s != ',') {
177             errorCode = U_INVALID_FORMAT_ERROR;
178             return 0;
179         }
180         // Parse the following, secondary or tertiary weight.
181         s = skipWhiteSpace(s + 1);
182         uint32_t w = parseWeight(s, ",]", 2, errorCode);
183         if(U_FAILURE(errorCode)) { return 0; }
184         if(*s == ']') {  // [U+4E00, 10]
185             ++s;
186             // Set the tertiary weight to w.
187             return (ce & INT64_C(0xffffffffffff0000)) | (w >> 16);
188         }
189         // Set the secondary weight to w: [U+9F9C, 70, 20]
190         ce = (ce & INT64_C(0xffffffff00000000)) | w;
191         // Parse and set the tertiary weight.
192         s = skipWhiteSpace(s + 1);
193         w = parseWeight(s, "]", 2, errorCode);
194         ++s;
195         return ce | (w >> 16);
196     } else {
197         uint32_t p = parseWeight(s, ",", 4, errorCode);
198         if(U_FAILURE(errorCode)) { return 0; }
199         int64_t ce = (int64_t)p << 32;
200         s = skipWhiteSpace(s + 1);
201         uint32_t w = parseWeight(s, ",", 2, errorCode);
202         if(U_FAILURE(errorCode)) { return 0; }
203         ce |= w;
204         s = skipWhiteSpace(s + 1);
205         w = parseWeight(s, "]", 2, errorCode);
206         ++s;
207         return ce | (w >> 16);
208     }
209 }
210 
211 namespace {
212 
213 // Cached, lazy-init mapping from scripts to sample characters.
214 UChar32 sampleChars[USCRIPT_CODE_LIMIT] = { U_SENTINEL };
215 
216 }
217 
218 // Hardcoded mapping from script sample characters to script codes.
219 // Pro: Available without complete and updated UCD scripts data,
220 //      easy to add non-script codes specific to collation.
221 // Con: Needs manual update for each new script or change in sample character.
222 static const struct {
223     UChar32 sampleChar;
224     int32_t script;
225 } sampleCharsToScripts[] = {
226     { 0x00A0, UCOL_REORDER_CODE_SPACE },
227     { 0x201C, UCOL_REORDER_CODE_PUNCTUATION },
228     { 0x263A, UCOL_REORDER_CODE_SYMBOL },
229     { 0x20AC, UCOL_REORDER_CODE_CURRENCY },
230     { 0x0034, UCOL_REORDER_CODE_DIGIT },
231     { 0x004C, USCRIPT_LATIN },
232     { 0x03A9, USCRIPT_GREEK },
233     { 0x03E2, USCRIPT_COPTIC },
234     { 0x042F, USCRIPT_CYRILLIC },
235     { 0x2C00, USCRIPT_GLAGOLITIC },
236     { 0x1036B, USCRIPT_OLD_PERMIC },
237     { 0x10D3, USCRIPT_GEORGIAN },
238     { 0x0531, USCRIPT_ARMENIAN },
239     { 0x05D0, USCRIPT_HEBREW },
240     { 0x10900, USCRIPT_PHOENICIAN },
241     { 0x0800, USCRIPT_SAMARITAN },
242     { 0x0628, USCRIPT_ARABIC },
243     { 0x0710, USCRIPT_SYRIAC },
244     { 0x0840, USCRIPT_MANDAIC },
245     { 0x078C, USCRIPT_THAANA },
246     { 0x07CA, USCRIPT_NKO },
247     { 0x07D8, USCRIPT_NKO },
248     { 0x2D30, USCRIPT_TIFINAGH },
249     { 0x2D5E, USCRIPT_TIFINAGH },
250     { 0x12A0, USCRIPT_ETHIOPIC },
251     { 0x0905, USCRIPT_DEVANAGARI },
252     { 0x0995, USCRIPT_BENGALI },
253     { 0x0A15, USCRIPT_GURMUKHI },
254     { 0x0A95, USCRIPT_GUJARATI },
255     { 0x0B15, USCRIPT_ORIYA },
256     { 0x0B95, USCRIPT_TAMIL },
257     { 0x0C15, USCRIPT_TELUGU },
258     { 0x0C95, USCRIPT_KANNADA },
259     { 0x0D15, USCRIPT_MALAYALAM },
260     { 0x0D85, USCRIPT_SINHALA },
261     { 0xABC0, USCRIPT_MEITEI_MAYEK },
262     { 0xA800, USCRIPT_SYLOTI_NAGRI },
263     { 0xA882, USCRIPT_SAURASHTRA },
264     { 0x11083, USCRIPT_KAITHI },
265     { 0x11152, USCRIPT_MAHAJANI },
266     { 0x11183, USCRIPT_SHARADA },
267     { 0x11208, USCRIPT_KHOJKI },
268     { 0x112BE, USCRIPT_KHUDAWADI },
269     { 0x1128F, USCRIPT_MULTANI },
270     { 0x11315, USCRIPT_GRANTHA },
271     { 0x11412, USCRIPT_NEWA },
272     { 0x11484, USCRIPT_TIRHUTA },
273     { 0x1158E, USCRIPT_SIDDHAM },
274     { 0x1160E, USCRIPT_MODI },
275     { 0x11680, USCRIPT_TAKRI },
276     { 0x1180B, USCRIPT_DOGRA },
277     { 0x11717, USCRIPT_AHOM },
278     { 0x11D71, USCRIPT_GUNJALA_GONDI },
279     { 0x1B83, USCRIPT_SUNDANESE },
280     { 0x11005, USCRIPT_BRAHMI },
281     { 0x10A00, USCRIPT_KHAROSHTHI },
282     { 0x11C0E, USCRIPT_BHAIKSUKI },
283     { 0x0E17, USCRIPT_THAI },
284     { 0x0EA5, USCRIPT_LAO },
285     { 0xAA80, USCRIPT_TAI_VIET },
286     { 0x0F40, USCRIPT_TIBETAN },
287     { 0x11C72, USCRIPT_MARCHEN },
288     { 0x1C00, USCRIPT_LEPCHA },
289     { 0xA840, USCRIPT_PHAGS_PA },
290     { 0x1900, USCRIPT_LIMBU },
291     { 0x1703, USCRIPT_TAGALOG },
292     { 0x1723, USCRIPT_HANUNOO },
293     { 0x1743, USCRIPT_BUHID },
294     { 0x1763, USCRIPT_TAGBANWA },
295     { 0x1A00, USCRIPT_BUGINESE },
296     { 0x11EE5, USCRIPT_MAKASAR },
297     { 0x1BC0, USCRIPT_BATAK },
298     { 0xA930, USCRIPT_REJANG },
299     { 0xA90A, USCRIPT_KAYAH_LI },
300     { 0x1000, USCRIPT_MYANMAR },
301     { 0x10D12, USCRIPT_HANIFI_ROHINGYA },
302     { 0x11103, USCRIPT_CHAKMA },
303     { 0x1780, USCRIPT_KHMER },
304     { 0x1950, USCRIPT_TAI_LE },
305     { 0x1980, USCRIPT_NEW_TAI_LUE },
306     { 0x1A20, USCRIPT_LANNA },
307     { 0xAA00, USCRIPT_CHAM },
308     { 0x1B05, USCRIPT_BALINESE },
309     { 0xA984, USCRIPT_JAVANESE },
310     { 0x1826, USCRIPT_MONGOLIAN },
311     { 0x1C5A, USCRIPT_OL_CHIKI },
312     { 0x13C4, USCRIPT_CHEROKEE },
313     { 0x104B5, USCRIPT_OSAGE },
314     { 0x14C0, USCRIPT_CANADIAN_ABORIGINAL },
315     { 0x168F, USCRIPT_OGHAM },
316     { 0x16A0, USCRIPT_RUNIC },
317     { 0x10CA1, USCRIPT_OLD_HUNGARIAN },
318     { 0x10C00, USCRIPT_ORKHON },
319     { 0xA549, USCRIPT_VAI },
320     { 0xA6A0, USCRIPT_BAMUM },
321     { 0x16AE6, USCRIPT_BASSA_VAH },
322     { 0x1E802, USCRIPT_MENDE },
323     { 0x16E40, USCRIPT_MEDEFAIDRIN },
324     { 0x1E909, USCRIPT_ADLAM, },
325     { 0xAC00, USCRIPT_HANGUL },
326     { 0x304B, USCRIPT_HIRAGANA },
327     { 0x30AB, USCRIPT_KATAKANA },
328     { 0x3105, USCRIPT_BOPOMOFO },
329     { 0xA288, USCRIPT_YI },
330     { 0xA4D0, USCRIPT_LISU },
331     { 0xA4E8, USCRIPT_LISU },
332     { 0x16F00, USCRIPT_MIAO },
333     { 0x118B4, USCRIPT_WARANG_CITI },
334     { 0x11AC0, USCRIPT_PAU_CIN_HAU },
335     { 0x16B1C, USCRIPT_PAHAWH_HMONG },
336     { 0x10280, USCRIPT_LYCIAN },
337     { 0x102A0, USCRIPT_CARIAN },
338     { 0x102B7, USCRIPT_CARIAN },
339     { 0x10920, USCRIPT_LYDIAN },
340     { 0x10300, USCRIPT_OLD_ITALIC },
341     { 0x10308, USCRIPT_OLD_ITALIC },
342     { 0x10330, USCRIPT_GOTHIC },
343     { 0x10414, USCRIPT_DESERET },
344     { 0x10450, USCRIPT_SHAVIAN },
345     { 0x1BC20, USCRIPT_DUPLOYAN },
346     { 0x10480, USCRIPT_OSMANYA },
347     { 0x10500, USCRIPT_ELBASAN },
348     { 0x10537, USCRIPT_CAUCASIAN_ALBANIAN },
349     { 0x110D0, USCRIPT_SORA_SOMPENG },
350     { 0x16A4F, USCRIPT_MRO },
351     { 0x10000, USCRIPT_LINEAR_B },
352     { 0x10647, USCRIPT_LINEAR_A },
353     { 0x10800, USCRIPT_CYPRIOT },
354     { 0x10A60, USCRIPT_OLD_SOUTH_ARABIAN },
355     { 0x10A95, USCRIPT_OLD_NORTH_ARABIAN },
356     { 0x10B00, USCRIPT_AVESTAN },
357     { 0x10873, USCRIPT_PALMYRENE },
358     { 0x10896, USCRIPT_NABATAEAN },
359     { 0x108F4, USCRIPT_HATRAN },
360     { 0x10840, USCRIPT_IMPERIAL_ARAMAIC },
361     { 0x10B40, USCRIPT_INSCRIPTIONAL_PARTHIAN },
362     { 0x10B60, USCRIPT_INSCRIPTIONAL_PAHLAVI },
363     { 0x10B8F, USCRIPT_PSALTER_PAHLAVI },
364     { 0x10AC1, USCRIPT_MANICHAEAN },
365     { 0x10AD8, USCRIPT_MANICHAEAN },
366     { 0x10F19, USCRIPT_OLD_SOGDIAN },
367     { 0x10F42, USCRIPT_SOGDIAN },
368     { 0x10380, USCRIPT_UGARITIC },
369     { 0x103A0, USCRIPT_OLD_PERSIAN },
370     { 0x12000, USCRIPT_CUNEIFORM },
371     { 0x13153, USCRIPT_EGYPTIAN_HIEROGLYPHS },
372     { 0x109A0, USCRIPT_MEROITIC_CURSIVE },
373     { 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS },
374     { 0x14400, USCRIPT_ANATOLIAN_HIEROGLYPHS },
375     { 0x18229, USCRIPT_TANGUT },
376     { 0x5B57, USCRIPT_HAN },
377     { 0x11D10, USCRIPT_MASARAM_GONDI },
378     { 0x11A0B, USCRIPT_ZANABAZAR_SQUARE },
379     { 0x11A5C, USCRIPT_SOYOMBO },
380     { 0x1B1C4, USCRIPT_NUSHU },
381     { 0xFDD0, USCRIPT_UNKNOWN }  // unassigned-implicit primary weights
382 };
383 
getCharScript(UChar32 c)384 static int32_t getCharScript(UChar32 c) {
385     if (sampleChars[0] < 0) {
386         // Lazy-init the script->sample cache.
387         for (int32_t script = 0; script < USCRIPT_CODE_LIMIT; ++script) {
388             UnicodeString sample = uscript_getSampleUnicodeString((UScriptCode)script);
389             if (sample.isEmpty() || sample.hasMoreChar32Than(0, INT32_MAX, 1)) {
390                 sampleChars[script] = U_SENTINEL;
391             } else {
392                 sampleChars[script] = sample.char32At(0);
393             }
394         }
395     }
396     for (int32_t script = 0; script < USCRIPT_CODE_LIMIT; ++script) {
397         if (c == sampleChars[script]) {
398             return script;
399         }
400     }
401     for(int32_t i = 0; i < UPRV_LENGTHOF(sampleCharsToScripts); ++i) {
402         if(c == sampleCharsToScripts[i].sampleChar) {
403             return sampleCharsToScripts[i].script;
404         }
405     }
406     return USCRIPT_INVALID_CODE;  // -1
407 }
408 
409 /**
410  * Maps Unified_Ideograph's to primary CEs in the given order of ranges.
411  */
412 class HanOrder {
413 public:
HanOrder(UErrorCode & errorCode)414     HanOrder(UErrorCode &errorCode) : ranges(errorCode), set(), done(FALSE) {}
415 
addRange(UChar32 start,UChar32 end,UErrorCode & errorCode)416     void addRange(UChar32 start, UChar32 end, UErrorCode &errorCode) {
417         int32_t length = ranges.size();
418         if(length > 0 && (ranges.elementAti(length - 1) + 1) == start) {
419             // The previous range end is just before this range start: Merge adjacent ranges.
420             ranges.setElementAt(end, length - 1);
421         } else {
422             ranges.addElement(start, errorCode);
423             ranges.addElement(end, errorCode);
424         }
425         set.add(start, end);
426     }
427 
setBuilderHanOrder(CollationBaseDataBuilder & builder,UErrorCode & errorCode)428     void setBuilderHanOrder(CollationBaseDataBuilder &builder, UErrorCode &errorCode) {
429         if(U_FAILURE(errorCode)) { return; }
430         builder.initHanRanges(ranges.getBuffer(), ranges.size(), errorCode);
431         done = TRUE;
432     }
433 
setDone()434     void setDone() {
435         done = TRUE;
436     }
437 
isDone()438     UBool isDone() { return done; }
439 
getSet()440     const UnicodeSet &getSet() { return set; }
441 
442 private:
443     UVector32 ranges;
444     UnicodeSet set;
445     UBool done;
446 };
447 
448 static HanOrder *implicitHanOrder = NULL;
449 static HanOrder *radicalStrokeOrder = NULL;
450 
// What readAnOption() does with the text that follows a recognized directive name.
enum ActionType {
  READCE,                   // parse a CE with parseCE()
  READPRIMARY,              // parse a primary weight of up to 4 bytes
  READBYTE,                 // parse a single weight byte
  READUNIFIEDIDEOGRAPH,     // parse the list of Unified_Ideograph ranges
  READRADICAL,              // parse one [radical line of the radical-stroke order
  READUCAVERSION,           // parse the UCA version string
  READLEADBYTETOSCRIPTS,    // look for a COMPRESS flag on a [top_byte line
  IGNORE                    // recognize the directive but do nothing
};
461 
462 static struct {
463     const char *const name;
464     int64_t value;
465     const ActionType what_to_do;
466 } vt[]  = {
467     {"[first tertiary ignorable",     0, IGNORE},
468     {"[last tertiary ignorable",      0, IGNORE},
469     {"[first secondary ignorable",    0, READCE},
470     {"[last secondary ignorable",     0, READCE},
471     {"[first primary ignorable",      0, READCE},
472     {"[last primary ignorable",       0, READCE},
473     {"[first variable",               0, READCE},
474     {"[last variable",                0, READCE},
475     {"[first regular",                0, READCE},
476     {"[last regular",                 0, READCE},
477     {"[first implicit",               0, READCE},
478     {"[last implicit",                0, READCE},
479     {"[first trailing",               0, READCE},
480     {"[last trailing",                0, READCE},
481 
482     {"[Unified_Ideograph",            0, READUNIFIEDIDEOGRAPH},
483     {"[radical",                      0, READRADICAL},
484 
485     {"[fixed first implicit byte",    0, IGNORE},
486     {"[fixed last implicit byte",     0, IGNORE},
487     {"[fixed first trail byte",       0, IGNORE},
488     {"[fixed last trail byte",        0, IGNORE},
489     {"[fixed first special byte",     0, IGNORE},
490     {"[fixed last special byte",      0, IGNORE},
491     {"[fixed secondary common byte",                  0, READBYTE},
492     {"[fixed last secondary common byte",             0, READBYTE},
493     {"[fixed first ignorable secondary byte",         0, READBYTE},
494     {"[fixed tertiary common byte",                   0, READBYTE},
495     {"[fixed first ignorable tertiary byte",          0, READBYTE},
496     {"[variable top = ",              0, IGNORE},
497     {"[UCA version = ",               0, READUCAVERSION},
498     {"[top_byte",                     0, READLEADBYTETOSCRIPTS},
499     {"[reorderingTokens",             0, IGNORE},
500     {"[categories",                   0, IGNORE},
501     {"[first tertiary in secondary non-ignorable",    0, IGNORE},
502     {"[last tertiary in secondary non-ignorable",     0, IGNORE},
503     {"[first secondary in primary non-ignorable",     0, IGNORE},
504     {"[last secondary in primary non-ignorable",      0, IGNORE},
505 };
506 
getOptionValue(const char * name)507 static int64_t getOptionValue(const char *name) {
508     for (int32_t i = 0; i < UPRV_LENGTHOF(vt); ++i) {
509         if(uprv_strcmp(name, vt[i].name) == 0) {
510             return vt[i].value;
511         }
512     }
513     return 0;
514 }
515 
readAnOption(CollationBaseDataBuilder & builder,char * buffer,UErrorCode * status)516 static void readAnOption(
517         CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) {
518     for (int32_t cnt = 0; cnt<UPRV_LENGTHOF(vt); cnt++) {
519         int32_t vtLen = (int32_t)uprv_strlen(vt[cnt].name);
520         if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
521             ActionType what_to_do = vt[cnt].what_to_do;
522             char *pointer = skipWhiteSpace(buffer + vtLen);
523             if (what_to_do == IGNORE) { //vt[cnt].what_to_do == IGNORE
524                 return;
525             } else if (what_to_do == READCE) {
526                 vt[cnt].value = parseCE(builder, pointer, *status);
527                 if(U_SUCCESS(*status) && *pointer != ']') {
528                     *status = U_INVALID_FORMAT_ERROR;
529                 }
530                 if(U_FAILURE(*status)) {
531                     fprintf(stderr, "Syntax error: unable to parse the CE from line '%s'\n", buffer);
532                     return;
533                 }
534             } else if(what_to_do == READPRIMARY) {
535                 vt[cnt].value = parseWeight(pointer, "]", 4, *status);
536                 if(U_FAILURE(*status)) {
537                     fprintf(stderr, "Value of \"%s\" is not a primary weight\n", buffer);
538                     return;
539                 }
540             } else if(what_to_do == READBYTE) {
541                 vt[cnt].value = parseWeight(pointer, "]", 1, *status) >> 24;
542                 if(U_FAILURE(*status)) {
543                     fprintf(stderr, "Value of \"%s\" is not a valid byte\n", buffer);
544                     return;
545                 }
546             } else if(what_to_do == READUNIFIEDIDEOGRAPH) {
547                 if(implicitHanOrder != NULL) {
548                     fprintf(stderr, "duplicate [Unified_Ideograph] lines\n");
549                     *status = U_INVALID_FORMAT_ERROR;
550                     return;
551                 }
552                 implicitHanOrder = new HanOrder(*status);
553                 if(U_FAILURE(*status)) { return; }
554                 for(;;) {
555                     if(*pointer == ']') { break; }
556                     if(*pointer == 0) {
557                         // Missing ] after ranges.
558                         *status = U_INVALID_FORMAT_ERROR;
559                         return;
560                     }
561                     char *s = pointer;
562                     while(*s != ' ' && *s != '\t' && *s != ']' && *s != '\0') { ++s; }
563                     char c = *s;
564                     *s = 0;
565                     uint32_t start, end;
566                     u_parseCodePointRange(pointer, &start, &end, status);
567                     *s = c;
568                     if(U_FAILURE(*status)) {
569                         fprintf(stderr, "Syntax error: unable to parse one of the ranges from line '%s'\n", buffer);
570                         *status = U_INVALID_FORMAT_ERROR;
571                         return;
572                     }
573                     implicitHanOrder->addRange((UChar32)start, (UChar32)end, *status);
574                     pointer = skipWhiteSpace(s);
575                 }
576                 if(hanOrder == HAN_IMPLICIT) {
577                     implicitHanOrder->setBuilderHanOrder(builder, *status);
578                 }
579                 implicitHanOrder->setDone();
580             } else if(what_to_do == READRADICAL) {
581                 if(radicalStrokeOrder == NULL) {
582                     if(implicitHanOrder == NULL) {
583                         fprintf(stderr, "[radical] section before [Unified_Ideograph] line\n");
584                         *status = U_INVALID_FORMAT_ERROR;
585                         return;
586                     }
587                     radicalStrokeOrder = new HanOrder(*status);
588                     if(U_FAILURE(*status)) { return; }
589                 } else if(radicalStrokeOrder->isDone()) {
590                     fprintf(stderr, "duplicate [radical] sections\n");
591                     *status = U_INVALID_FORMAT_ERROR;
592                     return;
593                 }
594                 UBool ok;
595                 if(uprv_strcmp(pointer, "end]") == 0) {
596                     if(radicalStrokeOrder->getSet() != implicitHanOrder->getSet()) {
597                         fprintf(stderr, "[radical end]: "
598                                 "some of [Unified_Ideograph] missing from [radical] lines\n");
599                         *status = U_INVALID_FORMAT_ERROR;
600                         return;
601                     }
602                     if(hanOrder == HAN_RADICAL_STROKE) {
603                         radicalStrokeOrder->setBuilderHanOrder(builder, *status);
604                     }
605                     radicalStrokeOrder->setDone();
606                 } else {
607                     // Read Han characters and ranges between : and ].
608                     // Ignore the radical data before the :.
609                     char *startPointer = uprv_strchr(pointer, ':');
610                     char *limitPointer = uprv_strchr(pointer, ']');
611                     if(startPointer == NULL || limitPointer == NULL ||
612                             (startPointer + 1) >= limitPointer) {
613                         fprintf(stderr, "[radical]: no Han characters listed between : and ]\n");
614                         *status = U_INVALID_FORMAT_ERROR;
615                         return;
616                     }
617                     pointer = startPointer + 1;
618                     int32_t length = (int32_t)(limitPointer - pointer);
619                     for(int32_t i = 0; i < length;) {
620                         UChar32 start;
621                         U8_NEXT(pointer, i, length, start);
622                         UChar32 end;
623                         if(pointer[i] == '-') {
624                             ++i;
625                             U8_NEXT(pointer, i, length, end);
626                         } else {
627                             end = start;
628                         }
629                         if(radicalStrokeOrder->getSet().containsSome(start, end)) {
630                             fprintf(stderr, "[radical]: some of U+%04x..U+%04x occur "
631                                     "multiple times in the radical-stroke order\n",
632                                     start, end);
633                             *status = U_INVALID_FORMAT_ERROR;
634                             return;
635                         }
636                         if(!implicitHanOrder->getSet().contains(start, end)) {
637                             fprintf(stderr, "[radical]: some of U+%04x..U+%04x are "
638                                     "not Unified_Ideograph\n",
639                                     start, end);
640                             *status = U_INVALID_FORMAT_ERROR;
641                             return;
642                         }
643                         radicalStrokeOrder->addRange(start, end, *status);
644                     }
645                 }
646             } else if (what_to_do == READUCAVERSION) {
647                 u_versionFromString(UCAVersion, pointer);
648                 if(beVerbose) {
649                     char uca[U_MAX_VERSION_STRING_LENGTH];
650                     u_versionToString(UCAVersion, uca);
651                     printf("UCA version %s\n", uca);
652                 }
653                 UVersionInfo UCDVersion;
654                 u_getUnicodeVersion(UCDVersion);
655                 if (UCAVersion[0] != UCDVersion[0] || UCAVersion[1] != UCDVersion[1]) {
656                     char uca[U_MAX_VERSION_STRING_LENGTH];
657                     char ucd[U_MAX_VERSION_STRING_LENGTH];
658                     u_versionToString(UCAVersion, uca);
659                     u_versionToString(UCDVersion, ucd);
660                     // Warning, not error, to permit bootstrapping during a version upgrade.
661                     fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd);
662                 }
663             } else if (what_to_do == READLEADBYTETOSCRIPTS) {
664                 if (strstr(pointer, "COMPRESS") != NULL) {
665                     uint16_t leadByte = (hex2num(*pointer++) * 16);
666                     leadByte += hex2num(*pointer++);
667                     builder.setCompressibleLeadByte(leadByte);
668                 }
669                 // We do not need the list of scripts on this line.
670             }
671             return;
672         }
673     }
674     fprintf(stderr, "Warning: unrecognized option: %s\n", buffer);
675 }
676 
677 static UBool
readAnElement(char * line,CollationBaseDataBuilder & builder,UnicodeString & prefix,UnicodeString & s,int64_t ces[32],int32_t & cesLength,UErrorCode * status)678 readAnElement(char *line,
679         CollationBaseDataBuilder &builder,
680         UnicodeString &prefix, UnicodeString &s,
681         int64_t ces[32], int32_t &cesLength,
682         UErrorCode *status) {
683     if(U_FAILURE(*status)) {
684         return FALSE;
685     }
686     int32_t lineLength = (int32_t)uprv_strlen(line);
687     while(lineLength>0 && (line[lineLength-1] == '\r' || line[lineLength-1] == '\n')) {
688       line[--lineLength] = 0;
689     }
690 
691     if(lineLength >= 3 && line[0] == (char)0xef &&
692             line[1] == (char)0xbb && line[2] == (char)0xbf) {
693         // U+FEFF UTF-8 signature byte sequence.
694         // Ignore, assuming it is at the start of the file.
695         line += 3;
696         lineLength -= 3;
697     }
698     if(line[0] == 0 || line[0] == '#') {
699         return FALSE; // just a comment, skip whole line
700     }
701 
702     // Directives.
703     if(line[0] == '[') {
704         readAnOption(builder, line, status);
705         return FALSE;
706     }
707 
708     CharString input;
709     char *startCodePoint = line;
710     char *endCodePoint = strchr(startCodePoint, ';');
711     if(endCodePoint == NULL) {
712         fprintf(stderr, "error - line with no code point:\n%s\n", line);
713         *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
714         return FALSE;
715     }
716 
717     char *pipePointer = strchr(line, '|');
718     if (pipePointer != NULL) {
719         // Read the prefix string which precedes the actual string.
720         input.append(startCodePoint, (int32_t)(pipePointer - startCodePoint), *status);
721         UChar *prefixChars = prefix.getBuffer(32);
722         int32_t prefixSize =
723             u_parseString(input.data(),
724                           prefixChars, prefix.getCapacity(),
725                           NULL, status);
726         if(U_FAILURE(*status)) {
727             prefix.releaseBuffer(0);
728             fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n%s\n",
729                     input.data(), line, u_errorName(*status));
730             *status = U_INVALID_FORMAT_ERROR;
731             return FALSE;
732         }
733         prefix.releaseBuffer(prefixSize);
734         startCodePoint = pipePointer + 1;
735         input.clear();
736     }
737 
738     // Read the string which gets the CE(s) assigned.
739     input.append(startCodePoint, (int32_t)(endCodePoint - startCodePoint), *status);
740     UChar *uchars = s.getBuffer(32);
741     int32_t cSize =
742         u_parseString(input.data(),
743                       uchars, s.getCapacity(),
744                       NULL, status);
745     if(U_FAILURE(*status)) {
746         s.releaseBuffer(0);
747         fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n%s\n",
748                 input.data(), line, u_errorName(*status));
749         *status = U_INVALID_FORMAT_ERROR;
750         return FALSE;
751     }
752     s.releaseBuffer(cSize);
753 
754     char *pointer = endCodePoint + 1;
755 
756     char *commentStart = strchr(pointer, '#');
757     if(commentStart == NULL) {
758         commentStart = strchr(pointer, 0);
759     }
760 
761     cesLength = 0;
762     for(;;) {
763         pointer = skipWhiteSpace(pointer);
764         if(pointer == commentStart) {
765             break;
766         }
767         if(cesLength >= 31) {
768             fprintf(stderr, "Error: Too many CEs on line '%s'\n", line);
769             *status = U_INVALID_FORMAT_ERROR;
770             return FALSE;
771         }
772         ces[cesLength++] = parseCE(builder, pointer, *status);
773         if(U_FAILURE(*status)) {
774             fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n",
775                     line, u_errorName(*status));
776             return FALSE;
777         }
778     }
779 
780     if(s.length() == 1 && s[0] == 0xfffe) {
781         // UCA 6.0 gives U+FFFE a special minimum weight using the
782         // byte 02 which is the merge-sort-key separator and illegal for any
783         // other characters.
784     } else {
785         // Rudimentary check for valid bytes in CE weights.
786         // For a more comprehensive check see CollationTest::TestRootElements(),
787         // intltest collate/CollationTest/TestRootElements
788         for (int32_t i = 0; i < cesLength; ++i) {
789             int64_t ce = ces[i];
790             UBool isCompressible = FALSE;
791             for (int j = 7; j >= 0; --j) {
792                 uint8_t b = (uint8_t)(ce >> (j * 8));
793                 if(j <= 1) { b &= 0x3f; }  // tertiary bytes use 6 bits
794                 if (b == 1) {
795                     fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", line);
796                     return FALSE;
797                 }
798                 if (j == 7 && b == 2) {
799                     fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", line);
800                     return FALSE;
801                 }
802                 if (j == 7) {
803                     isCompressible = builder.isCompressibleLeadByte(b);
804                 } else if (j == 6) {
805                     // Primary second bytes 03 and FF are compression terminators.
806                     // 02, 03 and FF are usable when the lead byte is not compressible.
807                     // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible.
808                     if (isCompressible && (b <= 3 || b == 0xff)) {
809                         fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n",
810                                 b, line);
811                         return FALSE;
812                     }
813                 }
814             }
815         }
816     }
817 
818     return TRUE;
819 }
820 
821 static void
parseFractionalUCA(const char * filename,CollationBaseDataBuilder & builder,UErrorCode * status)822 parseFractionalUCA(const char *filename,
823                    CollationBaseDataBuilder &builder,
824                    UErrorCode *status)
825 {
826     if(U_FAILURE(*status)) { return; }
827     FILE *data = fopen(filename, "r");
828     if(data == NULL) {
829         fprintf(stderr, "Couldn't open file: %s\n", filename);
830         *status = U_FILE_ACCESS_ERROR;
831         return;
832     }
833     int32_t lineNumber = 0;
834     char buffer[30000];
835 
836     UChar32 maxCodePoint = 0;
837     while(!feof(data)) {
838         if(U_FAILURE(*status)) {
839             fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
840                 *status, u_errorName(*status), (int)lineNumber, filename);
841             exit(*status);
842         }
843 
844         lineNumber++;
845         char *line = fgets(buffer, sizeof(buffer), data);
846         if(line == NULL) {
847             if(feof(data)) {
848                 break;
849             } else {
850                 fprintf(stderr, "no more input line and also no EOF!\n");
851                 *status = U_INVALID_FORMAT_ERROR;
852                 return;
853             }
854         }
855 
856         UnicodeString prefix;
857         UnicodeString s;
858         int64_t ces[32];
859         int32_t cesLength = 0;
860         if(readAnElement(line, builder, prefix, s, ces, cesLength, status)) {
861             // we have read the line, now do something sensible with the read data!
862             uint32_t p = (uint32_t)(ces[0] >> 32);
863 
864             if(s.length() > 1 && s[0] == 0xFDD0) {
865                 // FractionalUCA.txt contractions starting with U+FDD0
866                 // are only entered into the inverse table,
867                 // not into the normal collation data.
868                 builder.addRootElements(ces, cesLength, *status);
869                 if(s.length() == 2 && cesLength == 1) {
870                     switch(s[1]) {
871                     case 0x34:
872                         // Lead byte for numeric sorting.
873                         builder.setNumericPrimary(p);
874                         break;
875                     case 0xFF21:
876                         builder.addScriptStart(CollationData::REORDER_RESERVED_BEFORE_LATIN, p);
877                         break;
878                     case 0xFF3A:
879                         builder.addScriptStart(CollationData::REORDER_RESERVED_AFTER_LATIN, p);
880                         break;
881                     default:
882                         break;
883                     }
884                 }
885             } else {
886                 UChar32 c = s.char32At(0);
887                 if(c > maxCodePoint) { maxCodePoint = c; }
888 
889                 // We ignore the CEs for U+FFFD..U+FFFF and for the unassigned first primary.
890                 // CollationBaseDataBuilder::init() maps them to special CEs.
891                 // Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
892                 if(0xfffd <= c && c <= 0xffff) { continue; }
893                 if(s.length() >= 2 && c == 0xFDD1) {
894                     UChar32 c2 = s.char32At(1);
895                     int32_t script = getCharScript(c2);
896                     if(script < 0) {
897                         fprintf(stderr,
898                                 "Error: Unknown script for first-primary sample character "
899                                 "U+%04X on line %u of %s:\n"
900                                 "%s\n"
901                                 "    (add the character to genuca.cpp sampleCharsToScripts[])\n",
902                                 c2, (int)lineNumber, filename, line);
903                         exit(U_INVALID_FORMAT_ERROR);
904                     }
905                     if(script == USCRIPT_UNKNOWN) {
906                         // FDD1 FDD0, first unassigned-implicit primary
907                         builder.addScriptStart(script, Collation::FIRST_UNASSIGNED_PRIMARY);
908                         continue;
909                     }
910                     builder.addScriptStart(script, p);
911                     if(script == USCRIPT_HIRAGANA) {
912                         builder.addScriptStart(USCRIPT_KATAKANA_OR_HIRAGANA, p);
913                     } else if(script == USCRIPT_HAN) {
914                         builder.addScriptStart(USCRIPT_SIMPLIFIED_HAN, p);
915                         builder.addScriptStart(USCRIPT_TRADITIONAL_HAN, p);
916                     }
917                 }
918 
919                 if(0xe0000000 <= p && p < 0xf0000000) {
920                     fprintf(stderr,
921                             "Error: Unexpected mapping to an implicit or trailing primary"
922                             " on line %u of %s:\n"
923                             "%s\n",
924                             (int)lineNumber, filename, line);
925                     exit(U_INVALID_FORMAT_ERROR);
926                 }
927 
928                 builder.add(prefix, s, ces, cesLength, *status);
929             }
930         }
931     }
932 
933     int32_t numRanges = 0;
934     int32_t numRangeCodePoints = 0;
935     UChar32 rangeFirst = U_SENTINEL;
936     UChar32 rangeLast = U_SENTINEL;
937     uint32_t rangeFirstPrimary = 0;
938     uint32_t rangeLastPrimary = 0;
939     int32_t rangeStep = -1;
940 
941     // Detect ranges of characters in primary code point order,
942     // with 3-byte primaries and
943     // with consistent "step" differences between adjacent primaries.
944     // This relies on the FractionalUCA generator using the same primary-weight incrementation.
945     // Start at U+0180: No ranges for common Latin characters.
946     // Go one beyond maxCodePoint in case a range ends there.
947     for(UChar32 c = 0x180; c <= (maxCodePoint + 1); ++c) {
948         UBool action;
949         uint32_t p = builder.getLongPrimaryIfSingleCE(c);
950         if(p != 0) {
951             // p is a "long" (three-byte) primary.
952             if(rangeFirst >= 0 && c == (rangeLast + 1) && p > rangeLastPrimary) {
953                 // Find the offset between the two primaries.
954                 int32_t step = CollationBaseDataBuilder::diffThreeBytePrimaries(
955                     rangeLastPrimary, p, builder.isCompressiblePrimary(p));
956                 if(rangeFirst == rangeLast && step >= 2) {
957                     // c == rangeFirst + 1, store the "step" between range primaries.
958                     rangeStep = step;
959                     rangeLast = c;
960                     rangeLastPrimary = p;
961                     action = 0;  // continue range
962                 } else if(rangeStep == step) {
963                     // Continue the range with the same "step" difference.
964                     rangeLast = c;
965                     rangeLastPrimary = p;
966                     action = 0;  // continue range
967                 } else {
968                     action = 1;  // maybe finish range, start a new one
969                 }
970             } else {
971                 action = 1;  // maybe finish range, start a new one
972             }
973         } else {
974             action = -1;  // maybe finish range, do not start a new one
975         }
976         if(action != 0 && rangeFirst >= 0) {
977             // Finish a range.
978             // Set offset CE32s for a long range, leave single CEs for a short range.
979             UBool didSetRange = builder.maybeSetPrimaryRange(
980                 rangeFirst, rangeLast,
981                 rangeFirstPrimary, rangeStep, *status);
982             if(U_FAILURE(*status)) {
983                 fprintf(stderr,
984                         "failure setting code point order range U+%04lx..U+%04lx "
985                         "%08lx..%08lx step %d - %s\n",
986                         (long)rangeFirst, (long)rangeLast,
987                         (long)rangeFirstPrimary, (long)rangeLastPrimary,
988                         (int)rangeStep, u_errorName(*status));
989             } else if(didSetRange) {
990                 int32_t rangeLength = rangeLast - rangeFirst + 1;
991                 if(beVerbose) {
992                     printf("* set code point order range U+%04lx..U+%04lx [%d] "
993                             "%08lx..%08lx step %d\n",
994                             (long)rangeFirst, (long)rangeLast,
995                             (int)rangeLength,
996                             (long)rangeFirstPrimary, (long)rangeLastPrimary,
997                             (int)rangeStep);
998                 }
999                 ++numRanges;
1000                 numRangeCodePoints += rangeLength;
1001             }
1002             rangeFirst = U_SENTINEL;
1003             rangeStep = -1;
1004         }
1005         if(action > 0) {
1006             // Start a new range.
1007             rangeFirst = rangeLast = c;
1008             rangeFirstPrimary = rangeLastPrimary = p;
1009         }
1010     }
1011     printf("** set %d ranges with %d code points\n", (int)numRanges, (int)numRangeCodePoints);
1012 
1013     // Idea: Probably best to work in two passes.
1014     // Pass 1 for reading all data, setting isCompressible flags (and reordering groups)
1015     // and finding ranges.
1016     // Then set the ranges in a newly initialized builder
1017     // for optimal compression (makes sure that adjacent blocks can overlap easily).
1018     // Then set all mappings outside the ranges.
1019     //
1020     // In the first pass, we could store mappings in a simple list,
1021     // with single-character/single-long-primary-CE mappings in a UTrie2;
1022     // or store the mappings in a temporary builder;
1023     // or we could just parse the input file again in the second pass.
1024     //
1025     // Ideally set/copy U+0000..U+017F before setting anything else,
1026     // then set default Han/Hangul, then set the ranges, then copy non-range mappings.
1027     // It should be easy to copy mappings from an un-built builder to a new one.
1028     // Add CollationDataBuilder::copyFrom(builder, code point, errorCode) -- copy contexts & expansions.
1029 
1030     if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) {
1031         fprintf(stderr, "UCA version not specified. Cannot create data file!\n");
1032         fclose(data);
1033         return;
1034     }
1035 
1036     if (beVerbose) {
1037         printf("\nLines read: %u\n", (int)lineNumber);
1038     }
1039 
1040     fclose(data);
1041 
1042     return;
1043 }
1044 
1045 static void
buildAndWriteBaseData(CollationBaseDataBuilder & builder,const char * path,UErrorCode & errorCode)1046 buildAndWriteBaseData(CollationBaseDataBuilder &builder,
1047                       const char *path, UErrorCode &errorCode) {
1048     if(U_FAILURE(errorCode)) { return; }
1049 
1050     if(getOptionValue("[fixed secondary common byte") != Collation::COMMON_BYTE) {
1051         fprintf(stderr, "error: unexpected [fixed secondary common byte]");
1052         errorCode = U_INVALID_FORMAT_ERROR;
1053         return;
1054     }
1055     if(getOptionValue("[fixed tertiary common byte") != Collation::COMMON_BYTE) {
1056         fprintf(stderr, "error: unexpected [fixed tertiary common byte]");
1057         errorCode = U_INVALID_FORMAT_ERROR;
1058         return;
1059     }
1060 
1061     CollationData data(*Normalizer2Factory::getNFCImpl(errorCode));
1062     builder.enableFastLatin();
1063     builder.build(data, errorCode);
1064     if(U_FAILURE(errorCode)) {
1065         fprintf(stderr, "builder.build() failed: %s\n",
1066                 u_errorName(errorCode));
1067         return;
1068     }
1069 
1070     // The CollationSettings constructor gives us the properly encoded
1071     // default options, so that we need not duplicate them here.
1072     CollationSettings settings;
1073 
1074     UVector32 rootElements(errorCode);
1075     for(int32_t i = 0; i < CollationRootElements::IX_COUNT; ++i) {
1076         rootElements.addElement(0, errorCode);
1077     }
1078     builder.buildRootElementsTable(rootElements, errorCode);
1079     if(U_FAILURE(errorCode)) {
1080         fprintf(stderr, "builder.buildRootElementsTable() failed: %s\n",
1081                 u_errorName(errorCode));
1082         return;
1083     }
1084     int32_t index = CollationRootElements::IX_COUNT;
1085     rootElements.setElementAt(index, CollationRootElements::IX_FIRST_TERTIARY_INDEX);
1086 
1087     while((rootElements.elementAti(index) & 0xffff0000) == 0) { ++index; }
1088     rootElements.setElementAt(index, CollationRootElements::IX_FIRST_SECONDARY_INDEX);
1089 
1090     while((rootElements.elementAti(index) & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
1091         ++index;
1092     }
1093     rootElements.setElementAt(index, CollationRootElements::IX_FIRST_PRIMARY_INDEX);
1094 
1095     rootElements.setElementAt(Collation::COMMON_SEC_AND_TER_CE,
1096                               CollationRootElements::IX_COMMON_SEC_AND_TER_CE);
1097 
1098     int32_t secTerBoundaries = (int32_t)getOptionValue("[fixed last secondary common byte") << 24;
1099     secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable secondary byte") << 16;
1100     secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable tertiary byte");
1101     rootElements.setElementAt(secTerBoundaries, CollationRootElements::IX_SEC_TER_BOUNDARIES);
1102 
1103     LocalMemory<uint8_t> buffer;
1104     int32_t capacity = 1000000;
1105     uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
1106     if(dest == NULL) {
1107         fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
1108                 (long)capacity);
1109         errorCode = U_MEMORY_ALLOCATION_ERROR;
1110         return;
1111     }
1112     int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
1113     int32_t totalSize = CollationDataWriter::writeBase(
1114             data, settings,
1115             rootElements.getBuffer(), rootElements.size(),
1116             indexes, dest, capacity,
1117             errorCode);
1118     if(U_FAILURE(errorCode)) {
1119         fprintf(stderr, "CollationDataWriter::writeBase(capacity = %ld) failed: %s\n",
1120                 (long)capacity, u_errorName(errorCode));
1121         return;
1122     }
1123     printf("*** CLDR root collation part sizes ***\n");
1124     CollationInfo::printSizes(totalSize, indexes);
1125     printf("*** CLDR root collation size:   %6ld (with file header but no copyright string)\n",
1126            (long)totalSize + 32);  // 32 bytes = DataHeader rounded up to 16-byte boundary
1127 
1128     CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion);
1129     const char *dataName =
1130         hanOrder == HAN_IMPLICIT ? "ucadata-implicithan" :
1131         "ucadata-unihan";
1132     UNewDataMemory *pData=udata_create(path, "icu", dataName, &ucaDataInfo,
1133                                        withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1134     if(U_FAILURE(errorCode)) {
1135         fprintf(stderr, "genuca: udata_create(%s, ucadata.icu) failed - %s\n",
1136                 path, u_errorName(errorCode));
1137         return;
1138     }
1139 
1140     udata_writeBlock(pData, dest, totalSize);
1141     long dataLength = udata_finish(pData, &errorCode);
1142     if(U_FAILURE(errorCode)) {
1143         fprintf(stderr, "genuca: error %s writing the output file\n", u_errorName(errorCode));
1144         return;
1145     }
1146 
1147     if(dataLength != (long)totalSize) {
1148         fprintf(stderr,
1149                 "udata_finish(ucadata.icu) reports %ld bytes written but should be %ld\n",
1150                 dataLength, (long)totalSize);
1151         errorCode=U_INTERNAL_PROGRAM_ERROR;
1152     }
1153 }
1154 
1155 /**
1156  * Adds each lead surrogate to the bmp set if any of the 1024
1157  * associated supplementary code points is in the supp set.
1158  * These can be one and the same set.
1159  */
1160 static void
setLeadSurrogatesForAssociatedSupplementary(UnicodeSet & bmp,const UnicodeSet & supp)1161 setLeadSurrogatesForAssociatedSupplementary(UnicodeSet &bmp, const UnicodeSet &supp) {
1162     UChar32 c = 0x10000;
1163     for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
1164         if(supp.containsSome(c, c + 0x3ff)) {
1165             bmp.add(lead);
1166         }
1167     }
1168 }
1169 
1170 static int32_t
makeBMPFoldedBitSet(const UnicodeSet & set,uint8_t index[0x800],uint32_t bits[256],UErrorCode & errorCode)1171 makeBMPFoldedBitSet(const UnicodeSet &set, uint8_t index[0x800], uint32_t bits[256],
1172                     UErrorCode &errorCode) {
1173     if(U_FAILURE(errorCode)) { return 0; }
1174     bits[0] = 0;  // no bits set
1175     bits[1] = 0xffffffff;  // all bits set
1176     int32_t bitsLength = 2;
1177     int32_t i = 0;
1178     for(UChar32 c = 0; c <= 0xffff; c += 0x20, ++i) {
1179         if(set.containsNone(c, c + 0x1f)) {
1180             index[i] = 0;
1181         } else if(set.contains(c, c + 0x1f)) {
1182             index[i] = 1;
1183         } else {
1184             uint32_t b = 0;
1185             for(int32_t j = 0; j <= 0x1f; ++j) {
1186                 if(set.contains(c + j)) {
1187                     b |= (uint32_t)1 << j;
1188                 }
1189             }
1190             int32_t k;
1191             for(k = 2;; ++k) {
1192                 if(k == bitsLength) {
1193                     // new bit combination
1194                     if(bitsLength == 256) {
1195                         errorCode = U_BUFFER_OVERFLOW_ERROR;
1196                         return 0;
1197                     }
1198                     bits[bitsLength++] = b;
1199                     break;
1200                 }
1201                 if(bits[k] == b) {
1202                     // duplicate bit combination
1203                     break;
1204                 }
1205             }
1206             index[i] = k;
1207         }
1208     }
1209     return bitsLength;
1210 }
1211 
1212 // TODO: Make preparseucd.py write fcd_data.h mapping code point ranges to FCD16 values,
1213 // use that rather than properties APIs.
1214 // Then consider moving related logic for the unsafeBwdSet back from the loader into this builder.
1215 
1216 /**
1217  * Builds data for the FCD check fast path.
1218  * For details see the CollationFCD class comments.
1219  */
1220 static void
buildAndWriteFCDData(const char * path,UErrorCode & errorCode)1221 buildAndWriteFCDData(const char *path, UErrorCode &errorCode) {
1222     UnicodeSet lcccSet(UNICODE_STRING_SIMPLE("[[:^lccc=0:][\\udc00-\\udfff]]"), errorCode);
1223     UnicodeSet tcccSet(UNICODE_STRING_SIMPLE("[:^tccc=0:]"), errorCode);
1224     if(U_FAILURE(errorCode)) { return; }
1225     setLeadSurrogatesForAssociatedSupplementary(tcccSet, tcccSet);
1226     // The following supp(lccc)->lead(tccc) should be unnecessary
1227     // after the previous supp(tccc)->lead(tccc)
1228     // because there should not be any characters with lccc!=0 and tccc=0.
1229     // It is safe and harmless.
1230     setLeadSurrogatesForAssociatedSupplementary(tcccSet, lcccSet);
1231     setLeadSurrogatesForAssociatedSupplementary(lcccSet, lcccSet);
1232     uint8_t lcccIndex[0x800], tcccIndex[0x800];
1233     uint32_t lcccBits[256], tcccBits[256];
1234     int32_t lcccBitsLength = makeBMPFoldedBitSet(lcccSet, lcccIndex, lcccBits, errorCode);
1235     int32_t tcccBitsLength = makeBMPFoldedBitSet(tcccSet, tcccIndex, tcccBits, errorCode);
1236     printf("@@@ lcccBitsLength=%d -> %d bytes\n", lcccBitsLength, 0x800 + lcccBitsLength * 4);
1237     printf("@@@ tcccBitsLength=%d -> %d bytes\n", tcccBitsLength, 0x800 + tcccBitsLength * 4);
1238 
1239     if(U_FAILURE(errorCode)) { return; }
1240 
1241     FILE *f=usrc_create(path, "collationfcd.cpp", 2016,
1242                         "icu/tools/unicode/c/genuca/genuca.cpp");
1243     if(f==NULL) {
1244         errorCode=U_FILE_ACCESS_ERROR;
1245         return;
1246     }
1247     fputs("#include \"unicode/utypes.h\"\n\n", f);
1248     fputs("#if !UCONFIG_NO_COLLATION\n\n", f);
1249     fputs("#include \"collationfcd.h\"\n\n", f);
1250     fputs("U_NAMESPACE_BEGIN\n\n", f);
1251     usrc_writeArray(f,
1252         "const uint8_t CollationFCD::lcccIndex[%ld]={\n",
1253         lcccIndex, 8, 0x800,
1254         "\n};\n\n");
1255     usrc_writeArray(f,
1256         "const uint32_t CollationFCD::lcccBits[%ld]={\n",
1257         lcccBits, 32, lcccBitsLength,
1258         "\n};\n\n");
1259     usrc_writeArray(f,
1260         "const uint8_t CollationFCD::tcccIndex[%ld]={\n",
1261         tcccIndex, 8, 0x800,
1262         "\n};\n\n");
1263     usrc_writeArray(f,
1264         "const uint32_t CollationFCD::tcccBits[%ld]={\n",
1265         tcccBits, 32, tcccBitsLength,
1266         "\n};\n\n");
1267     fputs("U_NAMESPACE_END\n\n", f);
1268     fputs("#endif  // !UCONFIG_NO_COLLATION\n", f);
1269     fclose(f);
1270 }
1271 
1272 static void
parseAndWriteCollationRootData(const char * fracUCAPath,const char * binaryDataPath,const char * sourceCodePath,UErrorCode & errorCode)1273 parseAndWriteCollationRootData(
1274         const char *fracUCAPath,
1275         const char *binaryDataPath,
1276         const char *sourceCodePath,
1277         UErrorCode &errorCode) {
1278     if(U_FAILURE(errorCode)) { return; }
1279     CollationBaseDataBuilder builder(errorCode);
1280     builder.init(errorCode);
1281     parseFractionalUCA(fracUCAPath, builder, &errorCode);
1282     buildAndWriteBaseData(builder, binaryDataPath, errorCode);
1283     buildAndWriteFCDData(sourceCodePath, errorCode);
1284 }
1285 
1286 // ------------------------------------------------------------------------- ***
1287 
1288 enum {
1289     HELP_H,
1290     HELP_QUESTION_MARK,
1291     VERBOSE,
1292     COPYRIGHT,
1293     HAN_ORDER
1294 };
1295 
1296 static UOption options[]={
1297     UOPTION_HELP_H,
1298     UOPTION_HELP_QUESTION_MARK,
1299     UOPTION_VERBOSE,
1300     UOPTION_COPYRIGHT,
1301     UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG)
1302 };
1303 
1304 extern "C" int
main(int argc,char * argv[])1305 main(int argc, char* argv[]) {
1306     U_MAIN_INIT_ARGS(argc, argv);
1307 
1308     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
1309 
1310     /* error handling, printing usage message */
1311     if(argc<0) {
1312         fprintf(stderr,
1313             "error in command line argument \"%s\"\n",
1314             argv[-argc]);
1315     }
1316     if(options[HAN_ORDER].doesOccur) {
1317         const char *order = options[HAN_ORDER].value;
1318         if(uprv_strcmp(order, "implicit") == 0) {
1319             hanOrder = HAN_IMPLICIT;
1320         } else if(uprv_strcmp(order, "radical-stroke") == 0) {
1321             hanOrder = HAN_RADICAL_STROKE;
1322         }
1323     }
1324     if(hanOrder == HAN_NO_ORDER) {
1325         argc = -1;
1326     }
1327     if( argc<2 ||
1328         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
1329     ) {
1330         /*
1331          * Broken into chunks because the C89 standard says the minimum
1332          * required supported string length is 509 bytes.
1333          */
1334         fprintf(stderr,
1335             "Usage: %s [-options] --hanOrder (implicit|radical-stroke) path/to/ICU/src/root\n"
1336             "\n"
1337             "Reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and\n"
1338             "writes source and binary data files with the collation root data.\n"
1339             "\n",
1340             argv[0]);
1341         fprintf(stderr,
1342             "Options:\n"
1343             "\t-h or -? or --help  this usage text\n"
1344             "\t-v or --verbose     verbose output\n"
1345             "\t-c or --copyright   include a copyright notice\n"
1346             "\t      --hanOrder    implicit or radical-stroke\n");
1347         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1348     }
1349 
1350     beVerbose=options[VERBOSE].doesOccur;
1351     withCopyright=options[COPYRIGHT].doesOccur;
1352 
1353     IcuToolErrorCode errorCode("genuca");
1354 
1355     CharString icuSrcRoot(argv[1], errorCode);
1356 
1357     CharString icuSource(icuSrcRoot, errorCode);
1358     icuSource.appendPathPart("source", errorCode);
1359 
1360     CharString icuSourceData(icuSource, errorCode);
1361     icuSourceData.appendPathPart("data", errorCode);
1362 
1363     CharString fracUCAPath(icuSourceData, errorCode);
1364     fracUCAPath.appendPathPart("unidata", errorCode);
1365     fracUCAPath.appendPathPart("FractionalUCA.txt", errorCode);
1366 
1367     CharString sourceDataInColl(icuSourceData, errorCode);
1368     sourceDataInColl.appendPathPart("in", errorCode);
1369     sourceDataInColl.appendPathPart("coll", errorCode);
1370 
1371     CharString sourceI18n(icuSource, errorCode);
1372     sourceI18n.appendPathPart("i18n", errorCode);
1373 
1374     errorCode.assertSuccess();
1375 
1376     parseAndWriteCollationRootData(
1377         fracUCAPath.data(),
1378         sourceDataInColl.data(),
1379         sourceI18n.data(),
1380         errorCode);
1381 
1382     return errorCode;
1383 }
1384 
1385 #endif  // UCONFIG_NO_COLLATION
1386