1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2000-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: genuca.cpp
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created at the end of XX century
16 * created by: Vladimir Weinstein,
17 * modified in 2013-2014 by Markus Scherer
18 *
19 * This program reads the Fractional UCA table and generates
20 * internal format for UCA table as well as inverse UCA table.
21 * It then writes the ucadata.icu binary file containing the data.
22 */
23
24 #define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1
25
26 #include <stdio.h>
27 #include <stdint.h>
28 #include "unicode/utypes.h"
29 #include "unicode/errorcode.h"
30 #include "unicode/localpointer.h"
31 #include "unicode/ucol.h"
32 #include "unicode/uscript.h"
33 #include "unicode/utf8.h"
34 #include "charstr.h"
35 #include "cmemory.h"
36 #include "collation.h"
37 #include "collationbasedatabuilder.h"
38 #include "collationdata.h"
39 #include "collationdatabuilder.h"
40 #include "collationdatareader.h"
41 #include "collationdatawriter.h"
42 #include "collationinfo.h"
43 #include "collationrootelements.h"
44 #include "collationruleparser.h"
45 #include "collationtailoring.h"
46 #include "cstring.h"
47 #include "normalizer2impl.h"
48 #include "toolutil.h"
49 #include "unewdata.h"
50 #include "uoptions.h"
51 #include "uparse.h"
52 #include "writesrc.h"
53
54 #if UCONFIG_NO_COLLATION
55
// Stub entry point compiled only when UCONFIG_NO_COLLATION is set:
// there is no collation data to generate, so the arguments are ignored
// and the tool always exits with status 1.
extern "C" int
main(int argc, char* argv[]) {
    (void)argc;  // unused
    (void)argv;  // unused
    return 1;
}
62
63 #else
64
65 U_NAMESPACE_USE
66
// Selects which Han character order, if any, is handed to the data builder
// while the input directives are read (see readAnOption()).
enum HanOrderValue {
    HAN_NO_ORDER = -1,   // neither order is passed to the builder
    HAN_IMPLICIT,        // use the ranges from the [Unified_Ideograph] line
    HAN_RADICAL_STROKE   // use the ranges collected from the [radical] lines
};
72
// Tool-wide flags.
// NOTE(review): presumably set from command-line options in a part of the file
// that is not shown here -- confirm.
// beVerbose: print extra progress information to stdout.
// icu4xMode: makes the input parser skip certain mappings (see parseFractionalUCA()).
static UBool beVerbose=false, withCopyright=true, icu4xMode=false;

// Which Han order readAnOption() passes to the builder; none by default.
static HanOrderValue hanOrder = HAN_NO_ORDER;

// Filled in from the "[UCA version = " directive by readAnOption().
static UVersionInfo UCAVersion={ 0, 0, 0, 0 };
78
// Data header description for the generated binary file.
// NOTE(review): the per-field names below follow the UDataInfo struct
// declaration in ICU's udata.h, which is not visible here -- confirm.
static UDataInfo ucaDataInfo={
    sizeof(UDataInfo),   // size
    0,                   // reservedWord

    U_IS_BIG_ENDIAN,     // isBigEndian
    U_CHARSET_FAMILY,    // charsetFamily
    U_SIZEOF_UCHAR,      // sizeofUChar
    0,                   // reservedByte

    { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
    { 5, 0, 0, 0 },                     // formatVersion
    { 6, 3, 0, 0 }                      // dataVersion
};
92
/**
 * Skips leading spaces and tabs.
 *
 * @param s NUL-terminated string
 * @return pointer to the first character at or after s that is
 *         neither ' ' nor '\t' (possibly the terminating NUL)
 */
static char *skipWhiteSpace(char *s) {
    for(;;) {
        char c = *s;
        if(c != ' ' && c != '\t') {
            return s;
        }
        ++s;
    }
}
97
/**
 * Converts one hexadecimal digit character to its numeric value.
 *
 * @param hex character to convert; upper and lower case a-f are accepted
 * @return 0..15 for a hex digit, or -1 for any other character
 */
static int32_t hex2num(char hex) {
    if('0' <= hex && hex <= '9') {
        return hex - '0';
    }
    if('a' <= hex && hex <= 'f') {
        return hex - 'a' + 10;
    }
    if('A' <= hex && hex <= 'F') {
        return hex - 'A' + 10;
    }
    return -1;
}
109
parseWeight(char * & s,const char * separators,int32_t maxBytes,UErrorCode & errorCode)110 static uint32_t parseWeight(char *&s, const char *separators,
111 int32_t maxBytes, UErrorCode &errorCode) {
112 if(U_FAILURE(errorCode)) { return 0; }
113 uint32_t weight = 0;
114 int32_t numBytes = 0;
115 for(;;) {
116 // Check one character after another, so that we don't just run over a 00.
117 int32_t nibble1, nibble2;
118 if((nibble1 = hex2num(s[0])) < 0 || (nibble2 = hex2num(s[1])) < 0) {
119 // Stop when we find something other than a pair of hex digits.
120 break;
121 }
122 if(numBytes == maxBytes || (numBytes != 0 && nibble1 == 0 && nibble2 <= 1)) {
123 // Too many bytes, or a 00 or 01 byte which is illegal inside a weight.
124 errorCode = U_INVALID_FORMAT_ERROR;
125 return 0;
126 }
127 weight = (weight << 8) | ((uint32_t)nibble1 << 4) | (uint32_t)nibble2;
128 ++numBytes;
129 s += 2;
130 if(*s != ' ') {
131 break;
132 }
133 ++s;
134 }
135 char c = *s;
136 if(c == 0 || strchr(separators, c) == NULL) {
137 errorCode = U_INVALID_FORMAT_ERROR;
138 return 0;
139 }
140 // numBytes==0 is ok, for example in [,,] or [, 82, 05]
141 // Left-align the weight.
142 while(numBytes < 4) {
143 weight <<= 8;
144 ++numBytes;
145 }
146 return weight;
147 }
148
149 /**
150 * Parse a CE like [0A 86, 05, 17] or [U+4E00, 10].
151 * Stop with an error, or else with the pointer s after the closing bracket.
152 */
parseCE(const CollationDataBuilder & builder,char * & s,UErrorCode & errorCode)153 static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode &errorCode) {
154 if(U_FAILURE(errorCode)) { return 0; }
155 ++s; // skip over the '['
156 if(s[0] == 'U' && s[1] == '+') {
157 // Read a code point and look up its CE.
158 // We use this especially for implicit primary weights,
159 // so that we can use different algorithms in the FractionalUCA.txt
160 // generator and the parser.
161 // The generator may not even need to compute any implicit primaries at all.
162 s += 2;
163 char *end;
164 unsigned long longCp = uprv_strtoul(s, &end, 16);
165 if(end == s || longCp > 0x10ffff) {
166 errorCode = U_INVALID_FORMAT_ERROR;
167 return 0;
168 }
169 UChar32 c = (UChar32)longCp;
170 int64_t ce = builder.getSingleCE(c, errorCode);
171 if(U_FAILURE(errorCode)) { return 0; }
172 s = end;
173 if(*s == ']') { // [U+4E00]
174 ++s;
175 return ce;
176 }
177 if(*s != ',') {
178 errorCode = U_INVALID_FORMAT_ERROR;
179 return 0;
180 }
181 // Parse the following, secondary or tertiary weight.
182 s = skipWhiteSpace(s + 1);
183 uint32_t w = parseWeight(s, ",]", 2, errorCode);
184 if(U_FAILURE(errorCode)) { return 0; }
185 if(*s == ']') { // [U+4E00, 10]
186 ++s;
187 // Set the tertiary weight to w.
188 return (ce & INT64_C(0xffffffffffff0000)) | (w >> 16);
189 }
190 // Set the secondary weight to w: [U+9F9C, 70, 20]
191 ce = (ce & INT64_C(0xffffffff00000000)) | w;
192 // Parse and set the tertiary weight.
193 s = skipWhiteSpace(s + 1);
194 w = parseWeight(s, "]", 2, errorCode);
195 ++s;
196 return ce | (w >> 16);
197 } else {
198 uint32_t p = parseWeight(s, ",", 4, errorCode);
199 if(U_FAILURE(errorCode)) { return 0; }
200 int64_t ce = (int64_t)p << 32;
201 s = skipWhiteSpace(s + 1);
202 uint32_t w = parseWeight(s, ",", 2, errorCode);
203 if(U_FAILURE(errorCode)) { return 0; }
204 ce |= w;
205 s = skipWhiteSpace(s + 1);
206 w = parseWeight(s, "]", 2, errorCode);
207 ++s;
208 return ce | (w >> 16);
209 }
210 }
211
namespace {

// Cached, lazy-init mapping from scripts to sample characters.
// Indexed by script code. Only element 0 is explicitly initialized (to U_SENTINEL);
// getCharScript() uses a negative sampleChars[0] as its "cache not yet filled" test
// and then stores either a script's sample code point or U_SENTINEL in each element.
UChar32 sampleChars[USCRIPT_CODE_LIMIT] = { U_SENTINEL };

}
218
// Hardcoded mapping from script sample characters to script codes.
// Pro: Available without complete and updated UCD scripts data,
//      easy to add non-script codes specific to collation.
// Con: Needs manual update for each new script or change in sample character.
//
// getCharScript() consults this table only after its lazily filled
// script->sample cache, scanning linearly and returning the first entry
// whose sampleChar matches. Several scripts are listed with more than one
// sample character (for example USCRIPT_NKO and USCRIPT_LISU).
static const struct {
    UChar32 sampleChar;  // code point to look for
    int32_t script;      // USCRIPT_* or UCOL_REORDER_CODE_* value returned for it
} sampleCharsToScripts[] = {
    // Special reordering groups (UCOL_REORDER_CODE_*).
    { 0x00A0, UCOL_REORDER_CODE_SPACE },
    { 0x201C, UCOL_REORDER_CODE_PUNCTUATION },
    { 0x263A, UCOL_REORDER_CODE_SYMBOL },
    { 0x20AC, UCOL_REORDER_CODE_CURRENCY },
    { 0x0034, UCOL_REORDER_CODE_DIGIT },
    // Scripts (USCRIPT_*).
    { 0x004C, USCRIPT_LATIN },
    { 0x03A9, USCRIPT_GREEK },
    { 0x03E2, USCRIPT_COPTIC },
    { 0x042F, USCRIPT_CYRILLIC },
    { 0x2C00, USCRIPT_GLAGOLITIC },
    { 0x1036B, USCRIPT_OLD_PERMIC },
    { 0x10D3, USCRIPT_GEORGIAN },
    { 0x0531, USCRIPT_ARMENIAN },
    { 0x05D0, USCRIPT_HEBREW },
    { 0x10900, USCRIPT_PHOENICIAN },
    { 0x0800, USCRIPT_SAMARITAN },
    { 0x0628, USCRIPT_ARABIC },
    { 0x0710, USCRIPT_SYRIAC },
    { 0x0840, USCRIPT_MANDAIC },
    { 0x078C, USCRIPT_THAANA },
    { 0x07CA, USCRIPT_NKO },
    { 0x07D8, USCRIPT_NKO },
    { 0x2D30, USCRIPT_TIFINAGH },
    { 0x2D5E, USCRIPT_TIFINAGH },
    { 0x12A0, USCRIPT_ETHIOPIC },
    { 0x0905, USCRIPT_DEVANAGARI },
    { 0x0995, USCRIPT_BENGALI },
    { 0x0A15, USCRIPT_GURMUKHI },
    { 0x0A95, USCRIPT_GUJARATI },
    { 0x0B15, USCRIPT_ORIYA },
    { 0x0B95, USCRIPT_TAMIL },
    { 0x0C15, USCRIPT_TELUGU },
    { 0x0C95, USCRIPT_KANNADA },
    { 0x0D15, USCRIPT_MALAYALAM },
    { 0x0D85, USCRIPT_SINHALA },
    { 0xABC0, USCRIPT_MEITEI_MAYEK },
    { 0xA800, USCRIPT_SYLOTI_NAGRI },
    { 0xA882, USCRIPT_SAURASHTRA },
    { 0x11083, USCRIPT_KAITHI },
    { 0x11152, USCRIPT_MAHAJANI },
    { 0x11183, USCRIPT_SHARADA },
    { 0x11208, USCRIPT_KHOJKI },
    { 0x112BE, USCRIPT_KHUDAWADI },
    { 0x1128F, USCRIPT_MULTANI },
    { 0x11315, USCRIPT_GRANTHA },
    { 0x11412, USCRIPT_NEWA },
    { 0x11484, USCRIPT_TIRHUTA },
    { 0x1158E, USCRIPT_SIDDHAM },
    { 0x1160E, USCRIPT_MODI },
    { 0x11680, USCRIPT_TAKRI },
    { 0x1180B, USCRIPT_DOGRA },
    { 0x11717, USCRIPT_AHOM },
    { 0x11D71, USCRIPT_GUNJALA_GONDI },
    { 0x1B83, USCRIPT_SUNDANESE },
    { 0x11005, USCRIPT_BRAHMI },
    { 0x10A00, USCRIPT_KHAROSHTHI },
    { 0x11C0E, USCRIPT_BHAIKSUKI },
    { 0x0E17, USCRIPT_THAI },
    { 0x0EA5, USCRIPT_LAO },
    { 0xAA80, USCRIPT_TAI_VIET },
    { 0x0F40, USCRIPT_TIBETAN },
    { 0x11C72, USCRIPT_MARCHEN },
    { 0x1C00, USCRIPT_LEPCHA },
    { 0xA840, USCRIPT_PHAGS_PA },
    { 0x1900, USCRIPT_LIMBU },
    { 0x1703, USCRIPT_TAGALOG },
    { 0x1723, USCRIPT_HANUNOO },
    { 0x1743, USCRIPT_BUHID },
    { 0x1763, USCRIPT_TAGBANWA },
    { 0x1A00, USCRIPT_BUGINESE },
    { 0x11EE5, USCRIPT_MAKASAR },
    { 0x1BC0, USCRIPT_BATAK },
    { 0xA930, USCRIPT_REJANG },
    { 0xA90A, USCRIPT_KAYAH_LI },
    { 0x1000, USCRIPT_MYANMAR },
    { 0x10D12, USCRIPT_HANIFI_ROHINGYA },
    { 0x11103, USCRIPT_CHAKMA },
    { 0x1780, USCRIPT_KHMER },
    { 0x1950, USCRIPT_TAI_LE },
    { 0x1980, USCRIPT_NEW_TAI_LUE },
    { 0x1A20, USCRIPT_LANNA },
    { 0xAA00, USCRIPT_CHAM },
    { 0x1B05, USCRIPT_BALINESE },
    { 0xA984, USCRIPT_JAVANESE },
    { 0x1826, USCRIPT_MONGOLIAN },
    { 0x1C5A, USCRIPT_OL_CHIKI },
    { 0x13C4, USCRIPT_CHEROKEE },
    { 0x104B5, USCRIPT_OSAGE },
    { 0x14C0, USCRIPT_CANADIAN_ABORIGINAL },
    { 0x168F, USCRIPT_OGHAM },
    { 0x16A0, USCRIPT_RUNIC },
    { 0x10CA1, USCRIPT_OLD_HUNGARIAN },
    { 0x10C00, USCRIPT_ORKHON },
    { 0xA549, USCRIPT_VAI },
    { 0xA6A0, USCRIPT_BAMUM },
    { 0x16AE6, USCRIPT_BASSA_VAH },
    { 0x1E802, USCRIPT_MENDE },
    { 0x16E40, USCRIPT_MEDEFAIDRIN },
    { 0x1E909, USCRIPT_ADLAM, },
    { 0xAC00, USCRIPT_HANGUL },
    { 0x304B, USCRIPT_HIRAGANA },
    { 0x30AB, USCRIPT_KATAKANA },
    { 0x3105, USCRIPT_BOPOMOFO },
    { 0xA288, USCRIPT_YI },
    { 0xA4D0, USCRIPT_LISU },
    { 0xA4E8, USCRIPT_LISU },
    { 0x16F00, USCRIPT_MIAO },
    { 0x118B4, USCRIPT_WARANG_CITI },
    { 0x11AC0, USCRIPT_PAU_CIN_HAU },
    { 0x16B1C, USCRIPT_PAHAWH_HMONG },
    { 0x10280, USCRIPT_LYCIAN },
    { 0x102A0, USCRIPT_CARIAN },
    { 0x102B7, USCRIPT_CARIAN },
    { 0x10920, USCRIPT_LYDIAN },
    { 0x10300, USCRIPT_OLD_ITALIC },
    { 0x10308, USCRIPT_OLD_ITALIC },
    { 0x10330, USCRIPT_GOTHIC },
    { 0x10414, USCRIPT_DESERET },
    { 0x10450, USCRIPT_SHAVIAN },
    { 0x1BC20, USCRIPT_DUPLOYAN },
    { 0x10480, USCRIPT_OSMANYA },
    { 0x10500, USCRIPT_ELBASAN },
    { 0x10537, USCRIPT_CAUCASIAN_ALBANIAN },
    { 0x110D0, USCRIPT_SORA_SOMPENG },
    { 0x16A4F, USCRIPT_MRO },
    { 0x10000, USCRIPT_LINEAR_B },
    { 0x10647, USCRIPT_LINEAR_A },
    { 0x10800, USCRIPT_CYPRIOT },
    { 0x10A60, USCRIPT_OLD_SOUTH_ARABIAN },
    { 0x10A95, USCRIPT_OLD_NORTH_ARABIAN },
    { 0x10B00, USCRIPT_AVESTAN },
    { 0x10873, USCRIPT_PALMYRENE },
    { 0x10896, USCRIPT_NABATAEAN },
    { 0x108F4, USCRIPT_HATRAN },
    { 0x10840, USCRIPT_IMPERIAL_ARAMAIC },
    { 0x10B40, USCRIPT_INSCRIPTIONAL_PARTHIAN },
    { 0x10B60, USCRIPT_INSCRIPTIONAL_PAHLAVI },
    { 0x10B8F, USCRIPT_PSALTER_PAHLAVI },
    { 0x10AC1, USCRIPT_MANICHAEAN },
    { 0x10AD8, USCRIPT_MANICHAEAN },
    { 0x10F19, USCRIPT_OLD_SOGDIAN },
    { 0x10F42, USCRIPT_SOGDIAN },
    { 0x10380, USCRIPT_UGARITIC },
    { 0x103A0, USCRIPT_OLD_PERSIAN },
    { 0x12000, USCRIPT_CUNEIFORM },
    { 0x13153, USCRIPT_EGYPTIAN_HIEROGLYPHS },
    { 0x109A0, USCRIPT_MEROITIC_CURSIVE },
    { 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS },
    { 0x14400, USCRIPT_ANATOLIAN_HIEROGLYPHS },
    { 0x18229, USCRIPT_TANGUT },
    { 0x5B57, USCRIPT_HAN },
    { 0x11D10, USCRIPT_MASARAM_GONDI },
    { 0x11A0B, USCRIPT_ZANABAZAR_SQUARE },
    { 0x11A5C, USCRIPT_SOYOMBO },
    { 0x1B1C4, USCRIPT_NUSHU },
    { 0xFDD0, USCRIPT_UNKNOWN }  // unassigned-implicit primary weights
};
384
getCharScript(UChar32 c)385 static int32_t getCharScript(UChar32 c) {
386 if (sampleChars[0] < 0) {
387 // Lazy-init the script->sample cache.
388 for (int32_t script = 0; script < USCRIPT_CODE_LIMIT; ++script) {
389 UnicodeString sample = uscript_getSampleUnicodeString((UScriptCode)script);
390 if (sample.isEmpty() || sample.hasMoreChar32Than(0, INT32_MAX, 1)) {
391 sampleChars[script] = U_SENTINEL;
392 } else {
393 sampleChars[script] = sample.char32At(0);
394 }
395 }
396 }
397 for (int32_t script = 0; script < USCRIPT_CODE_LIMIT; ++script) {
398 if (c == sampleChars[script]) {
399 return script;
400 }
401 }
402 for(int32_t i = 0; i < UPRV_LENGTHOF(sampleCharsToScripts); ++i) {
403 if(c == sampleCharsToScripts[i].sampleChar) {
404 return sampleCharsToScripts[i].script;
405 }
406 }
407 return USCRIPT_INVALID_CODE; // -1
408 }
409
410 /**
411 * Maps Unified_Ideograph's to primary CEs in the given order of ranges.
412 */
413 class HanOrder {
414 public:
HanOrder(UErrorCode & errorCode)415 HanOrder(UErrorCode &errorCode) : ranges(errorCode), set(), done(false) {}
416
addRange(UChar32 start,UChar32 end,UErrorCode & errorCode)417 void addRange(UChar32 start, UChar32 end, UErrorCode &errorCode) {
418 int32_t length = ranges.size();
419 if(length > 0 && (ranges.elementAti(length - 1) + 1) == start) {
420 // The previous range end is just before this range start: Merge adjacent ranges.
421 ranges.setElementAt(end, length - 1);
422 } else {
423 ranges.addElement(start, errorCode);
424 ranges.addElement(end, errorCode);
425 }
426 set.add(start, end);
427 }
428
setBuilderHanOrder(CollationBaseDataBuilder & builder,UErrorCode & errorCode)429 void setBuilderHanOrder(CollationBaseDataBuilder &builder, UErrorCode &errorCode) {
430 if(U_FAILURE(errorCode)) { return; }
431 builder.initHanRanges(ranges.getBuffer(), ranges.size(), errorCode);
432 done = true;
433 }
434
setDone()435 void setDone() {
436 done = true;
437 }
438
isDone()439 UBool isDone() { return done; }
440
getSet()441 const UnicodeSet &getSet() { return set; }
442
443 private:
444 UVector32 ranges;
445 UnicodeSet set;
446 UBool done;
447 };
448
// Han orders built while reading the input directives (see readAnOption()):
// implicitHanOrder is created by the [Unified_Ideograph] line,
// radicalStrokeOrder by the first [radical] line. Both stay NULL until then.
static HanOrder *implicitHanOrder = NULL;
static HanOrder *radicalStrokeOrder = NULL;
451
// What readAnOption() does with a directive line that matches an entry in vt[].
enum ActionType {
    READCE,                // parse a CE and store it as the entry's value
    READPRIMARY,           // parse a primary weight of up to 4 bytes and store it
    READBYTE,              // parse a single weight byte and store it
    READUNIFIEDIDEOGRAPH,  // read the list of Unified_Ideograph code point ranges
    READRADICAL,           // read one line of the radical-stroke Han order
    READUCAVERSION,        // read the UCA version number
    READLEADBYTETOSCRIPTS, // read a lead byte line, noting compressible lead bytes
    IGNORE                 // recognized, but nothing to do
};
462
// Table of the directives that readAnOption() recognizes.
// A line matches an entry when it begins with the entry's name (prefix comparison;
// the first matching entry wins). For READCE, READPRIMARY and READBYTE entries
// the parsed result is stored in value, where getOptionValue() can retrieve it.
static struct {
    const char *const name;       // beginning of the directive line, including the '['
    int64_t value;                // parsed value; stays 0 until the directive has been read
    const ActionType what_to_do;  // how readAnOption() handles the rest of the line
} vt[]  = {
    {"[first tertiary ignorable",  0, IGNORE},
    {"[last tertiary ignorable",   0, IGNORE},
    {"[first secondary ignorable", 0, READCE},
    {"[last secondary ignorable",  0, READCE},
    {"[first primary ignorable",   0, READCE},
    {"[last primary ignorable",    0, READCE},
    {"[first variable",            0, READCE},
    {"[last variable",             0, READCE},
    {"[first regular",             0, READCE},
    {"[last regular",              0, READCE},
    {"[first implicit",            0, READCE},
    {"[last implicit",             0, READCE},
    {"[first trailing",            0, READCE},
    {"[last trailing",             0, READCE},

    {"[Unified_Ideograph",         0, READUNIFIEDIDEOGRAPH},
    {"[radical",                   0, READRADICAL},

    {"[fixed first implicit byte",       0, IGNORE},
    {"[fixed last implicit byte",        0, IGNORE},
    {"[fixed first trail byte",          0, IGNORE},
    {"[fixed last trail byte",           0, IGNORE},
    {"[fixed first special byte",        0, IGNORE},
    {"[fixed last special byte",         0, IGNORE},
    {"[fixed secondary common byte",                 0, READBYTE},
    {"[fixed last secondary common byte",            0, READBYTE},
    {"[fixed first ignorable secondary byte",        0, READBYTE},
    {"[fixed tertiary common byte",                  0, READBYTE},
    {"[fixed first ignorable tertiary byte",         0, READBYTE},
    {"[variable top = ",                 0, IGNORE},
    {"[UCA version = ",                  0, READUCAVERSION},
    {"[top_byte",                        0, READLEADBYTETOSCRIPTS},
    {"[reorderingTokens",                0, IGNORE},
    {"[categories",                      0, IGNORE},
    {"[first tertiary in secondary non-ignorable",   0, IGNORE},
    {"[last tertiary in secondary non-ignorable",    0, IGNORE},
    {"[first secondary in primary non-ignorable",    0, IGNORE},
    {"[last secondary in primary non-ignorable",     0, IGNORE},
};
507
getOptionValue(const char * name)508 static int64_t getOptionValue(const char *name) {
509 for (int32_t i = 0; i < UPRV_LENGTHOF(vt); ++i) {
510 if(uprv_strcmp(name, vt[i].name) == 0) {
511 return vt[i].value;
512 }
513 }
514 return 0;
515 }
516
readAnOption(CollationBaseDataBuilder & builder,char * buffer,UErrorCode * status)517 static void readAnOption(
518 CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) {
519 for (int32_t cnt = 0; cnt<UPRV_LENGTHOF(vt); cnt++) {
520 int32_t vtLen = (int32_t)uprv_strlen(vt[cnt].name);
521 if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
522 ActionType what_to_do = vt[cnt].what_to_do;
523 char *pointer = skipWhiteSpace(buffer + vtLen);
524 if (what_to_do == IGNORE) { //vt[cnt].what_to_do == IGNORE
525 return;
526 } else if (what_to_do == READCE) {
527 vt[cnt].value = parseCE(builder, pointer, *status);
528 if(U_SUCCESS(*status) && *pointer != ']') {
529 *status = U_INVALID_FORMAT_ERROR;
530 }
531 if(U_FAILURE(*status)) {
532 fprintf(stderr, "Syntax error: unable to parse the CE from line '%s'\n", buffer);
533 return;
534 }
535 } else if(what_to_do == READPRIMARY) {
536 vt[cnt].value = parseWeight(pointer, "]", 4, *status);
537 if(U_FAILURE(*status)) {
538 fprintf(stderr, "Value of \"%s\" is not a primary weight\n", buffer);
539 return;
540 }
541 } else if(what_to_do == READBYTE) {
542 vt[cnt].value = parseWeight(pointer, "]", 1, *status) >> 24;
543 if(U_FAILURE(*status)) {
544 fprintf(stderr, "Value of \"%s\" is not a valid byte\n", buffer);
545 return;
546 }
547 } else if(what_to_do == READUNIFIEDIDEOGRAPH) {
548 if(implicitHanOrder != NULL) {
549 fprintf(stderr, "duplicate [Unified_Ideograph] lines\n");
550 *status = U_INVALID_FORMAT_ERROR;
551 return;
552 }
553 implicitHanOrder = new HanOrder(*status);
554 if(U_FAILURE(*status)) { return; }
555 for(;;) {
556 if(*pointer == ']') { break; }
557 if(*pointer == 0) {
558 // Missing ] after ranges.
559 *status = U_INVALID_FORMAT_ERROR;
560 return;
561 }
562 char *s = pointer;
563 while(*s != ' ' && *s != '\t' && *s != ']' && *s != '\0') { ++s; }
564 char c = *s;
565 *s = 0;
566 uint32_t start, end;
567 u_parseCodePointRange(pointer, &start, &end, status);
568 *s = c;
569 if(U_FAILURE(*status)) {
570 fprintf(stderr, "Syntax error: unable to parse one of the ranges from line '%s'\n", buffer);
571 *status = U_INVALID_FORMAT_ERROR;
572 return;
573 }
574 implicitHanOrder->addRange((UChar32)start, (UChar32)end, *status);
575 pointer = skipWhiteSpace(s);
576 }
577 if(hanOrder == HAN_IMPLICIT) {
578 implicitHanOrder->setBuilderHanOrder(builder, *status);
579 }
580 implicitHanOrder->setDone();
581 } else if(what_to_do == READRADICAL) {
582 if(radicalStrokeOrder == NULL) {
583 if(implicitHanOrder == NULL) {
584 fprintf(stderr, "[radical] section before [Unified_Ideograph] line\n");
585 *status = U_INVALID_FORMAT_ERROR;
586 return;
587 }
588 radicalStrokeOrder = new HanOrder(*status);
589 if(U_FAILURE(*status)) { return; }
590 } else if(radicalStrokeOrder->isDone()) {
591 fprintf(stderr, "duplicate [radical] sections\n");
592 *status = U_INVALID_FORMAT_ERROR;
593 return;
594 }
595 if(uprv_strcmp(pointer, "end]") == 0) {
596 if(radicalStrokeOrder->getSet() != implicitHanOrder->getSet()) {
597 fprintf(stderr, "[radical end]: "
598 "some of [Unified_Ideograph] missing from [radical] lines\n");
599 *status = U_INVALID_FORMAT_ERROR;
600 return;
601 }
602 if(hanOrder == HAN_RADICAL_STROKE) {
603 radicalStrokeOrder->setBuilderHanOrder(builder, *status);
604 }
605 radicalStrokeOrder->setDone();
606 } else {
607 // Read Han characters and ranges between : and ].
608 // Ignore the radical data before the :.
609 char *startPointer = uprv_strchr(pointer, ':');
610 char *limitPointer = uprv_strchr(pointer, ']');
611 if(startPointer == NULL || limitPointer == NULL ||
612 (startPointer + 1) >= limitPointer) {
613 fprintf(stderr, "[radical]: no Han characters listed between : and ]\n");
614 *status = U_INVALID_FORMAT_ERROR;
615 return;
616 }
617 pointer = startPointer + 1;
618 int32_t length = (int32_t)(limitPointer - pointer);
619 for(int32_t i = 0; i < length;) {
620 UChar32 start;
621 U8_NEXT(pointer, i, length, start);
622 UChar32 end;
623 if(pointer[i] == '-') {
624 ++i;
625 U8_NEXT(pointer, i, length, end);
626 } else {
627 end = start;
628 }
629 if(radicalStrokeOrder->getSet().containsSome(start, end)) {
630 fprintf(stderr, "[radical]: some of U+%04x..U+%04x occur "
631 "multiple times in the radical-stroke order\n",
632 start, end);
633 *status = U_INVALID_FORMAT_ERROR;
634 return;
635 }
636 if(!implicitHanOrder->getSet().contains(start, end)) {
637 fprintf(stderr, "[radical]: some of U+%04x..U+%04x are "
638 "not Unified_Ideograph\n",
639 start, end);
640 *status = U_INVALID_FORMAT_ERROR;
641 return;
642 }
643 radicalStrokeOrder->addRange(start, end, *status);
644 }
645 }
646 } else if (what_to_do == READUCAVERSION) {
647 u_versionFromString(UCAVersion, pointer);
648 if(beVerbose) {
649 char uca[U_MAX_VERSION_STRING_LENGTH];
650 u_versionToString(UCAVersion, uca);
651 printf("UCA version %s\n", uca);
652 }
653 UVersionInfo UCDVersion;
654 u_getUnicodeVersion(UCDVersion);
655 if (UCAVersion[0] != UCDVersion[0] || UCAVersion[1] != UCDVersion[1]) {
656 char uca[U_MAX_VERSION_STRING_LENGTH];
657 char ucd[U_MAX_VERSION_STRING_LENGTH];
658 u_versionToString(UCAVersion, uca);
659 u_versionToString(UCDVersion, ucd);
660 // Warning, not error, to permit bootstrapping during a version upgrade.
661 fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd);
662 }
663 } else if (what_to_do == READLEADBYTETOSCRIPTS) {
664 if (strstr(pointer, "COMPRESS") != NULL) {
665 uint16_t leadByte = (hex2num(*pointer++) * 16);
666 leadByte += hex2num(*pointer++);
667 builder.setCompressibleLeadByte(leadByte);
668 }
669 // We do not need the list of scripts on this line.
670 }
671 return;
672 }
673 }
674 fprintf(stderr, "Warning: unrecognized option: %s\n", buffer);
675 }
676
677 static UBool
readAnElement(char * line,CollationBaseDataBuilder & builder,UnicodeString & prefix,UnicodeString & s,int64_t ces[32],int32_t & cesLength,UErrorCode * status)678 readAnElement(char *line,
679 CollationBaseDataBuilder &builder,
680 UnicodeString &prefix, UnicodeString &s,
681 int64_t ces[32], int32_t &cesLength,
682 UErrorCode *status) {
683 if(U_FAILURE(*status)) {
684 return false;
685 }
686 int32_t lineLength = (int32_t)uprv_strlen(line);
687 while(lineLength>0 && (line[lineLength-1] == '\r' || line[lineLength-1] == '\n')) {
688 line[--lineLength] = 0;
689 }
690
691 if(lineLength >= 3 && line[0] == (char)0xef &&
692 line[1] == (char)0xbb && line[2] == (char)0xbf) {
693 // U+FEFF UTF-8 signature byte sequence.
694 // Ignore, assuming it is at the start of the file.
695 line += 3;
696 lineLength -= 3;
697 }
698 if(line[0] == 0 || line[0] == '#') {
699 return false; // just a comment, skip whole line
700 }
701
702 // Directives.
703 if(line[0] == '[') {
704 readAnOption(builder, line, status);
705 return false;
706 }
707
708 CharString input;
709 char *startCodePoint = line;
710 char *endCodePoint = strchr(startCodePoint, ';');
711 if(endCodePoint == NULL) {
712 fprintf(stderr, "error - line with no code point:\n%s\n", line);
713 *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
714 return false;
715 }
716
717 char *pipePointer = strchr(line, '|');
718 if (pipePointer != NULL) {
719 // Read the prefix string which precedes the actual string.
720 input.append(startCodePoint, (int32_t)(pipePointer - startCodePoint), *status);
721 UChar *prefixChars = prefix.getBuffer(32);
722 int32_t prefixSize =
723 u_parseString(input.data(),
724 prefixChars, prefix.getCapacity(),
725 NULL, status);
726 if(U_FAILURE(*status)) {
727 prefix.releaseBuffer(0);
728 fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n%s\n",
729 input.data(), line, u_errorName(*status));
730 *status = U_INVALID_FORMAT_ERROR;
731 return false;
732 }
733 prefix.releaseBuffer(prefixSize);
734 startCodePoint = pipePointer + 1;
735 input.clear();
736 }
737
738 // Read the string which gets the CE(s) assigned.
739 input.append(startCodePoint, (int32_t)(endCodePoint - startCodePoint), *status);
740 UChar *uchars = s.getBuffer(32);
741 int32_t cSize =
742 u_parseString(input.data(),
743 uchars, s.getCapacity(),
744 NULL, status);
745 if(U_FAILURE(*status)) {
746 s.releaseBuffer(0);
747 fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n%s\n",
748 input.data(), line, u_errorName(*status));
749 *status = U_INVALID_FORMAT_ERROR;
750 return false;
751 }
752 s.releaseBuffer(cSize);
753
754 char *pointer = endCodePoint + 1;
755
756 char *commentStart = strchr(pointer, '#');
757 if(commentStart == NULL) {
758 commentStart = strchr(pointer, 0);
759 }
760
761 cesLength = 0;
762 for(;;) {
763 pointer = skipWhiteSpace(pointer);
764 if(pointer == commentStart) {
765 break;
766 }
767 if(cesLength >= 31) {
768 fprintf(stderr, "Error: Too many CEs on line '%s'\n", line);
769 *status = U_INVALID_FORMAT_ERROR;
770 return false;
771 }
772 ces[cesLength++] = parseCE(builder, pointer, *status);
773 if(U_FAILURE(*status)) {
774 fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n",
775 line, u_errorName(*status));
776 return false;
777 }
778 }
779
780 if(s.length() == 1 && s[0] == 0xfffe) {
781 // UCA 6.0 gives U+FFFE a special minimum weight using the
782 // byte 02 which is the merge-sort-key separator and illegal for any
783 // other characters.
784 } else {
785 // Rudimentary check for valid bytes in CE weights.
786 // For a more comprehensive check see CollationTest::TestRootElements(),
787 // intltest collate/CollationTest/TestRootElements
788 for (int32_t i = 0; i < cesLength; ++i) {
789 int64_t ce = ces[i];
790 UBool isCompressible = false;
791 for (int j = 7; j >= 0; --j) {
792 uint8_t b = (uint8_t)(ce >> (j * 8));
793 if(j <= 1) { b &= 0x3f; } // tertiary bytes use 6 bits
794 if (b == 1) {
795 fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", line);
796 return false;
797 }
798 if (j == 7 && b == 2) {
799 fprintf(stderr, "Warning: invalid UCA primary weight lead byte 02 for %s\n", line);
800 return false;
801 }
802 if (j == 7) {
803 isCompressible = builder.isCompressibleLeadByte(b);
804 } else if (j == 6) {
805 // Primary second bytes 03 and FF are compression terminators.
806 // 02, 03 and FF are usable when the lead byte is not compressible.
807 // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible.
808 if (isCompressible && (b <= 3 || b == 0xff)) {
809 fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n",
810 b, line);
811 return false;
812 }
813 }
814 }
815 }
816 }
817
818 return true;
819 }
820
821 static void
parseFractionalUCA(const char * filename,CollationBaseDataBuilder & builder,UErrorCode * status)822 parseFractionalUCA(const char *filename,
823 CollationBaseDataBuilder &builder,
824 UErrorCode *status)
825 {
826 if(U_FAILURE(*status)) { return; }
827 FILE *data = fopen(filename, "r");
828 if(data == NULL) {
829 fprintf(stderr, "Couldn't open file: %s\n", filename);
830 *status = U_FILE_ACCESS_ERROR;
831 return;
832 }
833 int32_t lineNumber = 0;
834 char buffer[30000];
835
836 const Normalizer2* norm = nullptr;
837 if (icu4xMode) {
838 norm = Normalizer2::getNFDInstance(*status);
839 }
840
841 UChar32 maxCodePoint = 0;
842 while(!feof(data)) {
843 if(U_FAILURE(*status)) {
844 fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
845 *status, u_errorName(*status), (int)lineNumber, filename);
846 exit(*status);
847 }
848
849 lineNumber++;
850 char *line = fgets(buffer, sizeof(buffer), data);
851 if(line == NULL) {
852 if(feof(data)) {
853 break;
854 } else {
855 fprintf(stderr, "no more input line and also no EOF!\n");
856 *status = U_INVALID_FORMAT_ERROR;
857 return;
858 }
859 }
860
861 UnicodeString prefix;
862 UnicodeString s;
863 int64_t ces[32];
864 int32_t cesLength = 0;
865 if(readAnElement(line, builder, prefix, s, ces, cesLength, status)) {
866 // we have read the line, now do something sensible with the read data!
867 uint32_t p = (uint32_t)(ces[0] >> 32);
868
869 if(s.length() > 1 && s[0] == 0xFDD0) {
870 // FractionalUCA.txt contractions starting with U+FDD0
871 // are only entered into the inverse table,
872 // not into the normal collation data.
873 builder.addRootElements(ces, cesLength, *status);
874 if(s.length() == 2 && cesLength == 1) {
875 switch(s[1]) {
876 case 0x34:
877 // Lead byte for numeric sorting.
878 builder.setNumericPrimary(p);
879 break;
880 case 0xFF21:
881 builder.addScriptStart(CollationData::REORDER_RESERVED_BEFORE_LATIN, p);
882 break;
883 case 0xFF3A:
884 builder.addScriptStart(CollationData::REORDER_RESERVED_AFTER_LATIN, p);
885 break;
886 default:
887 break;
888 }
889 }
890 } else {
891 UChar32 c = s.char32At(0);
892 if(c > maxCodePoint) { maxCodePoint = c; }
893
894 // We ignore the CEs for U+FFFD..U+FFFF and for the unassigned first primary.
895 // CollationBaseDataBuilder::init() maps them to special CEs.
896 // Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
897 if(0xfffd <= c && c <= 0xffff) { continue; }
898 if (icu4xMode) {
899 if (c >= 0xAC00 && c <= 0xD7A3) {
900 // Hangul syllable
901 continue;
902 }
903 if (c >= 0xD800 && c < 0xE000) {
904 // Surrogate
905 continue;
906 }
907 UnicodeString src;
908 UnicodeString dst;
909 src.append(c);
910 norm->normalize(src, dst, *status);
911 if (src != dst) {
912 // c decomposed, skip it
913 continue;
914 }
915 }
916 if(s.length() >= 2 && c == 0xFDD1) {
917 UChar32 c2 = s.char32At(1);
918 int32_t script = getCharScript(c2);
919 if(script < 0) {
920 fprintf(stderr,
921 "Error: Unknown script for first-primary sample character "
922 "U+%04X on line %u of %s:\n"
923 "%s\n"
924 " (add the character to genuca.cpp sampleCharsToScripts[])\n",
925 c2, (int)lineNumber, filename, line);
926 exit(U_INVALID_FORMAT_ERROR);
927 }
928 if(script == USCRIPT_UNKNOWN) {
929 // FDD1 FDD0, first unassigned-implicit primary
930 builder.addScriptStart(script, Collation::FIRST_UNASSIGNED_PRIMARY);
931 continue;
932 }
933 builder.addScriptStart(script, p);
934 if(script == USCRIPT_HIRAGANA) {
935 builder.addScriptStart(USCRIPT_KATAKANA_OR_HIRAGANA, p);
936 } else if(script == USCRIPT_HAN) {
937 builder.addScriptStart(USCRIPT_SIMPLIFIED_HAN, p);
938 builder.addScriptStart(USCRIPT_TRADITIONAL_HAN, p);
939 }
940 }
941
942 if(0xe0000000 <= p && p < 0xf0000000) {
943 fprintf(stderr,
944 "Error: Unexpected mapping to an implicit or trailing primary"
945 " on line %u of %s:\n"
946 "%s\n",
947 (int)lineNumber, filename, line);
948 exit(U_INVALID_FORMAT_ERROR);
949 }
950 builder.add(prefix, s, ces, cesLength, *status);
951 }
952 }
953 }
954
955 int32_t numRanges = 0;
956 int32_t numRangeCodePoints = 0;
957 UChar32 rangeFirst = U_SENTINEL;
958 UChar32 rangeLast = U_SENTINEL;
959 uint32_t rangeFirstPrimary = 0;
960 uint32_t rangeLastPrimary = 0;
961 int32_t rangeStep = -1;
962
963 // Detect ranges of characters in primary code point order,
964 // with 3-byte primaries and
965 // with consistent "step" differences between adjacent primaries.
966 // This relies on the FractionalUCA generator using the same primary-weight incrementation.
967 // Start at U+0180: No ranges for common Latin characters.
968 // Go one beyond maxCodePoint in case a range ends there.
969 for(UChar32 c = 0x180; c <= (maxCodePoint + 1); ++c) {
970 UBool action;
971 uint32_t p = builder.getLongPrimaryIfSingleCE(c);
972 if(p != 0) {
973 // p is a "long" (three-byte) primary.
974 if(rangeFirst >= 0 && c == (rangeLast + 1) && p > rangeLastPrimary) {
975 // Find the offset between the two primaries.
976 int32_t step = CollationBaseDataBuilder::diffThreeBytePrimaries(
977 rangeLastPrimary, p, builder.isCompressiblePrimary(p));
978 if(rangeFirst == rangeLast && step >= 2) {
979 // c == rangeFirst + 1, store the "step" between range primaries.
980 rangeStep = step;
981 rangeLast = c;
982 rangeLastPrimary = p;
983 action = 0; // continue range
984 } else if(rangeStep == step) {
985 // Continue the range with the same "step" difference.
986 rangeLast = c;
987 rangeLastPrimary = p;
988 action = 0; // continue range
989 } else {
990 action = 1; // maybe finish range, start a new one
991 }
992 } else {
993 action = 1; // maybe finish range, start a new one
994 }
995 } else {
996 action = -1; // maybe finish range, do not start a new one
997 }
998 if(action != 0 && rangeFirst >= 0) {
999 // Finish a range.
1000 // Set offset CE32s for a long range, leave single CEs for a short range.
1001 UBool didSetRange = builder.maybeSetPrimaryRange(
1002 rangeFirst, rangeLast,
1003 rangeFirstPrimary, rangeStep, *status);
1004 if(U_FAILURE(*status)) {
1005 fprintf(stderr,
1006 "failure setting code point order range U+%04lx..U+%04lx "
1007 "%08lx..%08lx step %d - %s\n",
1008 (long)rangeFirst, (long)rangeLast,
1009 (long)rangeFirstPrimary, (long)rangeLastPrimary,
1010 (int)rangeStep, u_errorName(*status));
1011 } else if(didSetRange) {
1012 int32_t rangeLength = rangeLast - rangeFirst + 1;
1013 if(beVerbose) {
1014 printf("* set code point order range U+%04lx..U+%04lx [%d] "
1015 "%08lx..%08lx step %d\n",
1016 (long)rangeFirst, (long)rangeLast,
1017 (int)rangeLength,
1018 (long)rangeFirstPrimary, (long)rangeLastPrimary,
1019 (int)rangeStep);
1020 }
1021 ++numRanges;
1022 numRangeCodePoints += rangeLength;
1023 }
1024 rangeFirst = U_SENTINEL;
1025 rangeStep = -1;
1026 }
1027 if(action > 0) {
1028 // Start a new range.
1029 rangeFirst = rangeLast = c;
1030 rangeFirstPrimary = rangeLastPrimary = p;
1031 }
1032 }
1033 printf("** set %d ranges with %d code points\n", (int)numRanges, (int)numRangeCodePoints);
1034
1035 // Idea: Probably best to work in two passes.
1036 // Pass 1 for reading all data, setting isCompressible flags (and reordering groups)
1037 // and finding ranges.
1038 // Then set the ranges in a newly initialized builder
1039 // for optimal compression (makes sure that adjacent blocks can overlap easily).
1040 // Then set all mappings outside the ranges.
1041 //
1042 // In the first pass, we could store mappings in a simple list,
1043 // with single-character/single-long-primary-CE mappings in a UTrie2;
1044 // or store the mappings in a temporary builder;
1045 // or we could just parse the input file again in the second pass.
1046 //
1047 // Ideally set/copy U+0000..U+017F before setting anything else,
1048 // then set default Han/Hangul, then set the ranges, then copy non-range mappings.
1049 // It should be easy to copy mappings from an un-built builder to a new one.
1050 // Add CollationDataBuilder::copyFrom(builder, code point, errorCode) -- copy contexts & expansions.
1051
1052 if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) {
1053 fprintf(stderr, "UCA version not specified. Cannot create data file!\n");
1054 fclose(data);
1055 return;
1056 }
1057
1058 if (beVerbose) {
1059 printf("\nLines read: %u\n", (int)lineNumber);
1060 }
1061
1062 fclose(data);
1063
1064 return;
1065 }
1066
1067 static void
buildAndWriteBaseData(CollationBaseDataBuilder & builder,const char * path,UErrorCode & errorCode)1068 buildAndWriteBaseData(CollationBaseDataBuilder &builder,
1069 const char *path, UErrorCode &errorCode) {
1070 if(U_FAILURE(errorCode)) { return; }
1071
1072 if(getOptionValue("[fixed secondary common byte") != Collation::COMMON_BYTE) {
1073 fprintf(stderr, "error: unexpected [fixed secondary common byte]");
1074 errorCode = U_INVALID_FORMAT_ERROR;
1075 return;
1076 }
1077 if(getOptionValue("[fixed tertiary common byte") != Collation::COMMON_BYTE) {
1078 fprintf(stderr, "error: unexpected [fixed tertiary common byte]");
1079 errorCode = U_INVALID_FORMAT_ERROR;
1080 return;
1081 }
1082
1083 CollationData data(*Normalizer2Factory::getNFCImpl(errorCode));
1084 builder.enableFastLatin();
1085 builder.build(data, errorCode);
1086 if(U_FAILURE(errorCode)) {
1087 fprintf(stderr, "builder.build() failed: %s\n",
1088 u_errorName(errorCode));
1089 return;
1090 }
1091
1092 // The CollationSettings constructor gives us the properly encoded
1093 // default options, so that we need not duplicate them here.
1094 CollationSettings settings;
1095
1096 UVector32 rootElements(errorCode);
1097 for(int32_t i = 0; i < CollationRootElements::IX_COUNT; ++i) {
1098 rootElements.addElement(0, errorCode);
1099 }
1100 builder.buildRootElementsTable(rootElements, errorCode);
1101 if(U_FAILURE(errorCode)) {
1102 fprintf(stderr, "builder.buildRootElementsTable() failed: %s\n",
1103 u_errorName(errorCode));
1104 return;
1105 }
1106 int32_t index = CollationRootElements::IX_COUNT;
1107 rootElements.setElementAt(index, CollationRootElements::IX_FIRST_TERTIARY_INDEX);
1108
1109 while((rootElements.elementAti(index) & 0xffff0000) == 0) { ++index; }
1110 rootElements.setElementAt(index, CollationRootElements::IX_FIRST_SECONDARY_INDEX);
1111
1112 while((rootElements.elementAti(index) & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
1113 ++index;
1114 }
1115 rootElements.setElementAt(index, CollationRootElements::IX_FIRST_PRIMARY_INDEX);
1116
1117 rootElements.setElementAt(Collation::COMMON_SEC_AND_TER_CE,
1118 CollationRootElements::IX_COMMON_SEC_AND_TER_CE);
1119
1120 int32_t secTerBoundaries = (int32_t)getOptionValue("[fixed last secondary common byte") << 24;
1121 secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable secondary byte") << 16;
1122 secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable tertiary byte");
1123 rootElements.setElementAt(secTerBoundaries, CollationRootElements::IX_SEC_TER_BOUNDARIES);
1124
1125 LocalMemory<uint8_t> buffer;
1126 int32_t capacity = 1000000;
1127 uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
1128 if(dest == NULL) {
1129 fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
1130 (long)capacity);
1131 errorCode = U_MEMORY_ALLOCATION_ERROR;
1132 return;
1133 }
1134 int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
1135 int32_t totalSize = CollationDataWriter::writeBase(
1136 data, settings,
1137 rootElements.getBuffer(), rootElements.size(),
1138 indexes, dest, capacity,
1139 errorCode);
1140 if(U_FAILURE(errorCode)) {
1141 fprintf(stderr, "CollationDataWriter::writeBase(capacity = %ld) failed: %s\n",
1142 (long)capacity, u_errorName(errorCode));
1143 return;
1144 }
1145 printf("*** CLDR root collation part sizes ***\n");
1146 CollationInfo::printSizes(totalSize, indexes);
1147 printf("*** CLDR root collation size: %6ld (with file header but no copyright string)\n",
1148 (long)totalSize + 32); // 32 bytes = DataHeader rounded up to 16-byte boundary
1149
1150 CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion);
1151 const char *dataName =
1152 hanOrder == HAN_IMPLICIT ?
1153 (icu4xMode ? "ucadata-implicithan-icu4x" : "ucadata-implicithan") :
1154 (icu4xMode ? "ucadata-unihan-icu4x" : "ucadata-unihan");
1155 UNewDataMemory *pData=udata_create(path, "icu", dataName, &ucaDataInfo,
1156 withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1157 if(U_FAILURE(errorCode)) {
1158 fprintf(stderr, "genuca: udata_create(%s, ucadata.icu) failed - %s\n",
1159 path, u_errorName(errorCode));
1160 return;
1161 }
1162
1163 udata_writeBlock(pData, dest, totalSize);
1164 long dataLength = udata_finish(pData, &errorCode);
1165 if(U_FAILURE(errorCode)) {
1166 fprintf(stderr, "genuca: error %s writing the output file\n", u_errorName(errorCode));
1167 return;
1168 }
1169
1170 if(dataLength != (long)totalSize) {
1171 fprintf(stderr,
1172 "udata_finish(ucadata.icu) reports %ld bytes written but should be %ld\n",
1173 dataLength, (long)totalSize);
1174 errorCode=U_INTERNAL_PROGRAM_ERROR;
1175 }
1176 }
1177
1178 /**
1179 * Adds each lead surrogate to the bmp set if any of the 1024
1180 * associated supplementary code points is in the supp set.
1181 * These can be one and the same set.
1182 */
1183 static void
setLeadSurrogatesForAssociatedSupplementary(UnicodeSet & bmp,const UnicodeSet & supp)1184 setLeadSurrogatesForAssociatedSupplementary(UnicodeSet &bmp, const UnicodeSet &supp) {
1185 UChar32 c = 0x10000;
1186 for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
1187 if(supp.containsSome(c, c + 0x3ff)) {
1188 bmp.add(lead);
1189 }
1190 }
1191 }
1192
1193 static int32_t
makeBMPFoldedBitSet(const UnicodeSet & set,uint8_t index[0x800],uint32_t bits[256],UErrorCode & errorCode)1194 makeBMPFoldedBitSet(const UnicodeSet &set, uint8_t index[0x800], uint32_t bits[256],
1195 UErrorCode &errorCode) {
1196 if(U_FAILURE(errorCode)) { return 0; }
1197 bits[0] = 0; // no bits set
1198 bits[1] = 0xffffffff; // all bits set
1199 int32_t bitsLength = 2;
1200 int32_t i = 0;
1201 for(UChar32 c = 0; c <= 0xffff; c += 0x20, ++i) {
1202 if(set.containsNone(c, c + 0x1f)) {
1203 index[i] = 0;
1204 } else if(set.contains(c, c + 0x1f)) {
1205 index[i] = 1;
1206 } else {
1207 uint32_t b = 0;
1208 for(int32_t j = 0; j <= 0x1f; ++j) {
1209 if(set.contains(c + j)) {
1210 b |= (uint32_t)1 << j;
1211 }
1212 }
1213 int32_t k;
1214 for(k = 2;; ++k) {
1215 if(k == bitsLength) {
1216 // new bit combination
1217 if(bitsLength == 256) {
1218 errorCode = U_BUFFER_OVERFLOW_ERROR;
1219 return 0;
1220 }
1221 bits[bitsLength++] = b;
1222 break;
1223 }
1224 if(bits[k] == b) {
1225 // duplicate bit combination
1226 break;
1227 }
1228 }
1229 index[i] = k;
1230 }
1231 }
1232 return bitsLength;
1233 }
1234
1235 // TODO: Make preparseucd.py write fcd_data.h mapping code point ranges to FCD16 values,
1236 // use that rather than properties APIs.
1237 // Then consider moving related logic for the unsafeBwdSet back from the loader into this builder.
1238
1239 /**
1240 * Builds data for the FCD check fast path.
1241 * For details see the CollationFCD class comments.
1242 */
1243 static void
buildAndWriteFCDData(const char * path,UErrorCode & errorCode)1244 buildAndWriteFCDData(const char *path, UErrorCode &errorCode) {
1245 UnicodeSet lcccSet(UNICODE_STRING_SIMPLE("[[:^lccc=0:][\\udc00-\\udfff]]"), errorCode);
1246 UnicodeSet tcccSet(UNICODE_STRING_SIMPLE("[:^tccc=0:]"), errorCode);
1247 if(U_FAILURE(errorCode)) { return; }
1248 setLeadSurrogatesForAssociatedSupplementary(tcccSet, tcccSet);
1249 // The following supp(lccc)->lead(tccc) should be unnecessary
1250 // after the previous supp(tccc)->lead(tccc)
1251 // because there should not be any characters with lccc!=0 and tccc=0.
1252 // It is safe and harmless.
1253 setLeadSurrogatesForAssociatedSupplementary(tcccSet, lcccSet);
1254 setLeadSurrogatesForAssociatedSupplementary(lcccSet, lcccSet);
1255 uint8_t lcccIndex[0x800], tcccIndex[0x800];
1256 uint32_t lcccBits[256], tcccBits[256];
1257 int32_t lcccBitsLength = makeBMPFoldedBitSet(lcccSet, lcccIndex, lcccBits, errorCode);
1258 int32_t tcccBitsLength = makeBMPFoldedBitSet(tcccSet, tcccIndex, tcccBits, errorCode);
1259 printf("@@@ lcccBitsLength=%d -> %d bytes\n", lcccBitsLength, 0x800 + lcccBitsLength * 4);
1260 printf("@@@ tcccBitsLength=%d -> %d bytes\n", tcccBitsLength, 0x800 + tcccBitsLength * 4);
1261
1262 if(U_FAILURE(errorCode)) { return; }
1263
1264 FILE *f=usrc_create(path, "collationfcd.cpp", 2016,
1265 "icu/tools/unicode/c/genuca/genuca.cpp");
1266 if(f==NULL) {
1267 errorCode=U_FILE_ACCESS_ERROR;
1268 return;
1269 }
1270 fputs("#include \"unicode/utypes.h\"\n\n", f);
1271 fputs("#if !UCONFIG_NO_COLLATION\n\n", f);
1272 fputs("#include \"collationfcd.h\"\n\n", f);
1273 fputs("U_NAMESPACE_BEGIN\n\n", f);
1274 usrc_writeArray(f,
1275 "const uint8_t CollationFCD::lcccIndex[%ld]={\n",
1276 lcccIndex, 8, 0x800,
1277 "", "\n};\n\n");
1278 usrc_writeArray(f,
1279 "const uint32_t CollationFCD::lcccBits[%ld]={\n",
1280 lcccBits, 32, lcccBitsLength,
1281 "", "\n};\n\n");
1282 usrc_writeArray(f,
1283 "const uint8_t CollationFCD::tcccIndex[%ld]={\n",
1284 tcccIndex, 8, 0x800,
1285 "", "\n};\n\n");
1286 usrc_writeArray(f,
1287 "const uint32_t CollationFCD::tcccBits[%ld]={\n",
1288 tcccBits, 32, tcccBitsLength,
1289 "", "\n};\n\n");
1290 fputs("U_NAMESPACE_END\n\n", f);
1291 fputs("#endif // !UCONFIG_NO_COLLATION\n", f);
1292 fclose(f);
1293 }
1294
1295 static void
parseAndWriteCollationRootData(const char * fracUCAPath,const char * binaryDataPath,const char * sourceCodePath,UErrorCode & errorCode)1296 parseAndWriteCollationRootData(
1297 const char *fracUCAPath,
1298 const char *binaryDataPath,
1299 const char *sourceCodePath,
1300 UErrorCode &errorCode) {
1301 if(U_FAILURE(errorCode)) { return; }
1302 CollationBaseDataBuilder builder(icu4xMode, errorCode);
1303 builder.init(errorCode);
1304 parseFractionalUCA(fracUCAPath, builder, &errorCode);
1305 buildAndWriteBaseData(builder, binaryDataPath, errorCode);
1306 buildAndWriteFCDData(sourceCodePath, errorCode);
1307 }
1308
1309 // ------------------------------------------------------------------------- ***
1310
// Indexes into the options[] array below, as used by main().
// The order of these constants must match the order of the array entries.
enum {
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    HAN_ORDER,
    ICU4X
};

// Command-line option definitions, passed to u_parseArgs() in main().
// main() reads the results back via options[<enum constant>].doesOccur/.value.
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    // --hanOrder takes an argument; main() accepts "implicit" or "radical-stroke".
    // NOTE(review): '\x01' presumably means "no usable short option" -- confirm in uoptions.h.
    UOPTION_DEF("hanOrder", '\x01', UOPT_REQUIRES_ARG),
    // --icu4x takes no argument; presumably also available as -X -- confirm in uoptions.h.
    UOPTION_DEF("icu4x", 'X', UOPT_NO_ARG)
};
1328
1329 extern "C" int
main(int argc,char * argv[])1330 main(int argc, char* argv[]) {
1331 U_MAIN_INIT_ARGS(argc, argv);
1332
1333 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
1334
1335 /* error handling, printing usage message */
1336 if(argc<0) {
1337 fprintf(stderr,
1338 "error in command line argument \"%s\"\n",
1339 argv[-argc]);
1340 }
1341 if(options[HAN_ORDER].doesOccur) {
1342 const char *order = options[HAN_ORDER].value;
1343 if(uprv_strcmp(order, "implicit") == 0) {
1344 hanOrder = HAN_IMPLICIT;
1345 } else if(uprv_strcmp(order, "radical-stroke") == 0) {
1346 hanOrder = HAN_RADICAL_STROKE;
1347 }
1348 }
1349 if(hanOrder == HAN_NO_ORDER) {
1350 argc = -1;
1351 }
1352 if( argc<2 ||
1353 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
1354 ) {
1355 /*
1356 * Broken into chunks because the C89 standard says the minimum
1357 * required supported string length is 509 bytes.
1358 */
1359 fprintf(stderr,
1360 "Usage: %s [-options] --hanOrder (implicit|radical-stroke) path/to/ICU/src/root\n"
1361 "\n"
1362 "Reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and\n"
1363 "writes source and binary data files with the collation root data.\n"
1364 "\n",
1365 argv[0]);
1366 fprintf(stderr,
1367 "Options:\n"
1368 "\t-h or -? or --help this usage text\n"
1369 "\t-v or --verbose verbose output\n"
1370 "\t-c or --copyright include a copyright notice\n"
1371 "\t --hanOrder implicit or radical-stroke\n");
1372 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1373 }
1374
1375 beVerbose=options[VERBOSE].doesOccur;
1376 withCopyright=options[COPYRIGHT].doesOccur;
1377 icu4xMode=options[ICU4X].doesOccur;
1378
1379 IcuToolErrorCode errorCode("genuca");
1380
1381 CharString icuSrcRoot(argv[1], errorCode);
1382
1383 CharString icuSource(icuSrcRoot, errorCode);
1384 icuSource.appendPathPart("source", errorCode);
1385
1386 CharString icuSourceData(icuSource, errorCode);
1387 icuSourceData.appendPathPart("data", errorCode);
1388
1389 CharString fracUCAPath(icuSourceData, errorCode);
1390 fracUCAPath.appendPathPart("unidata", errorCode);
1391 fracUCAPath.appendPathPart("FractionalUCA.txt", errorCode);
1392
1393 CharString sourceDataInColl(icuSourceData, errorCode);
1394 sourceDataInColl.appendPathPart("in", errorCode);
1395 sourceDataInColl.appendPathPart("coll", errorCode);
1396
1397 CharString sourceI18n(icuSource, errorCode);
1398 sourceI18n.appendPathPart("i18n", errorCode);
1399
1400 errorCode.assertSuccess();
1401
1402 parseAndWriteCollationRootData(
1403 fracUCAPath.data(),
1404 sourceDataInColl.data(),
1405 sourceI18n.data(),
1406 errorCode);
1407
1408 return errorCode;
1409 }
1410
1411 #endif // UCONFIG_NO_COLLATION
1412