1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include <cstddef>
5 #include <cstdint>
6 #include <cstdio>
7 #include <iostream>
8 #include "unicode/localpointer.h"
9 #include "unicode/umachine.h"
10 #include "unicode/unistr.h"
11 #include "unicode/urename.h"
12 #include "unicode/uset.h"
13 #include <vector>
14 #include <algorithm>
15 #include "toolutil.h"
16 #include "uoptions.h"
17 #include "cmemory.h"
18 #include "charstr.h"
19 #include "cstring.h"
20 #include "unicode/uchar.h"
21 #include "unicode/errorcode.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uscript.h"
24 #include "unicode/putil.h"
25 #include "unicode/umutablecptrie.h"
26 #include "unicode/ucharstriebuilder.h"
27 #include "ucase.h"
28 #include "unicode/normalizer2.h"
29 #include "normalizer2impl.h"
30 #include "writesrc.h"
31
32 U_NAMESPACE_USE
33
/*
 * Global - verbosity
 */
// NOTE(review): VERBOSE is not read in the part of the file reviewed here;
// presumably it is set from the command line -- confirm against main().
UBool VERBOSE = false;
// When true, prepareOutputFile() does not print its "Writing to: ..." progress line.
UBool QUIET = false;

// When true, prepareOutputFile() starts each generated file with a copyright header.
UBool haveCopyright = true;
// Trie type handed to the umutablecptrie_buildImmutable() calls below.
UCPTrieType trieType = UCPTRIE_TYPE_SMALL;
// Directory prepended to generated file names by prepareOutputFile();
// an empty string (or nullptr) means no directory prefix.
const char* destdir = "";

// Mask constants for modified values in the Script CodePointTrie, values are logically 12-bits.
// dumpScriptExtensions() ORs exactly one of these (bits 11..10) with an index into
// the script_code_array companion array (bits 9..0).
int16_t DATAEXPORT_SCRIPT_X_WITH_COMMON = 0x0400;
int16_t DATAEXPORT_SCRIPT_X_WITH_INHERITED = 0x0800;
int16_t DATAEXPORT_SCRIPT_X_WITH_OTHER = 0x0c00;
48
// Code points that dumpScriptExtensions() iterates over: for each one it looks up
// the Script_Extensions value and rewrites the exported trie value to point into
// the script_code_array companion array. The list is in no numeric order.
// TODO(ICU-21821): Replace this with a call to a library function
int32_t scxCodePoints[] = {
      7415, 7377, 7380, 7387, 7390, 7391, 7394, 7395, 7396, 7397,
      7398, 7399, 7400, 7403, 7404, 7406, 7407, 7408, 7409, 113824,
      113825, 113826, 113827, 834, 837, 7616, 7617, 12294, 12350, 12351,
      12688, 12689, 12690, 12691, 12692, 12693, 12694, 12695, 12696, 12697,
      12698, 12699, 12700, 12701, 12702, 12703, 12736, 12737, 12738, 12739,
      12740, 12741, 12742, 12743, 12744, 12745, 12746, 12747, 12748, 12749,
      12750, 12751, 12752, 12753, 12754, 12755, 12756, 12757, 12758, 12759,
      12760, 12761, 12762, 12763, 12764, 12765, 12766, 12767, 12768, 12769,
      12770, 12771, 12832, 12833, 12834, 12835, 12836, 12837, 12838, 12839,
      12840, 12841, 12842, 12843, 12844, 12845, 12846, 12847, 12848, 12849,
      12850, 12851, 12852, 12853, 12854, 12855, 12856, 12857, 12858, 12859,
      12860, 12861, 12862, 12863, 12864, 12865, 12866, 12867, 12868, 12869,
      12870, 12871, 12928, 12929, 12930, 12931, 12932, 12933, 12934, 12935,
      12936, 12937, 12938, 12939, 12940, 12941, 12942, 12943, 12944, 12945,
      12946, 12947, 12948, 12949, 12950, 12951, 12952, 12953, 12954, 12955,
      12956, 12957, 12958, 12959, 12960, 12961, 12962, 12963, 12964, 12965,
      12966, 12967, 12968, 12969, 12970, 12971, 12972, 12973, 12974, 12975,
      12976, 12992, 12993, 12994, 12995, 12996, 12997, 12998, 12999, 13000,
      13001, 13002, 13003, 13055, 13144, 13145, 13146, 13147, 13148, 13149,
      13150, 13151, 13152, 13153, 13154, 13155, 13156, 13157, 13158, 13159,
      13160, 13161, 13162, 13163, 13164, 13165, 13166, 13167, 13168, 13179,
      13180, 13181, 13182, 13183, 13280, 13281, 13282, 13283, 13284, 13285,
      13286, 13287, 13288, 13289, 13290, 13291, 13292, 13293, 13294, 13295,
      13296, 13297, 13298, 13299, 13300, 13301, 13302, 13303, 13304, 13305,
      13306, 13307, 13308, 13309, 13310, 119648, 119649, 119650, 119651, 119652,
      119653, 119654, 119655, 119656, 119657, 119658, 119659, 119660, 119661, 119662,
      119663, 119664, 119665, 127568, 127569, 867, 868, 869, 870, 871,
      872, 873, 874, 875, 876, 877, 878, 879, 7418, 7674,
      66272, 66273, 66274, 66275, 66276, 66277, 66278, 66279, 66280, 66281,
      66282, 66283, 66284, 66285, 66286, 66287, 66288, 66289, 66290, 66291,
      66292, 66293, 66294, 66295, 66296, 66297, 66298, 66299, 1748, 64830,
      64831, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619,
      1620, 1621, 1648, 65010, 65021, 7381, 7382, 7384, 7393, 7402,
      7405, 7413, 7414, 43249, 12330, 12331, 12332, 12333, 43471, 65794,
      65847, 65848, 65849, 65850, 65851, 65852, 65853, 65854, 65855, 1156,
      1159, 11843, 42607, 1157, 1158, 1155, 7672, 7379, 7411, 7416,
      7417, 7401, 7383, 7385, 7388, 7389, 7392, 43251, 4347, 3046,
      3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3056,
      3057, 3058, 3059, 70401, 70403, 70459, 70460, 73680, 73681, 73683,
      2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799,
      2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671,
      42752, 42753, 42754, 42755, 42756, 42757, 42758, 42759, 12337, 12338,
      12339, 12340, 12341, 12441, 12442, 12443, 12444, 12448, 12540, 65392,
      65438, 65439, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309,
      3310, 3311, 8239, 68338, 6146, 6147, 6149, 1564, 1632, 1633,
      1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 2534, 2535,
      2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 4160, 4161,
      4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 65792, 65793,
      65799, 65800, 65801, 65802, 65803, 65804, 65805, 65806, 65807, 65808,
      65809, 65810, 65811, 65812, 65813, 65814, 65815, 65816, 65817, 65818,
      65819, 65820, 65821, 65822, 65823, 65824, 65825, 65826, 65827, 65828,
      65829, 65830, 65831, 65832, 65833, 65834, 65835, 65836, 65837, 65838,
      65839, 65840, 65841, 65842, 65843, 7412, 8432, 12348, 12349, 43310,
      7376, 7378, 5941, 5942, 2406, 2407, 2408, 2409, 2410, 2411,
      2412, 2413, 2414, 2415, 12291, 12307, 12316, 12317, 12318, 12319,
      12336, 12343, 65093, 65094, 1548, 1563, 12289, 12290, 12296, 12297,
      12298, 12299, 12300, 12301, 12302, 12303, 12304, 12305, 12308, 12309,
      12310, 12311, 12312, 12313, 12314, 12315, 12539, 65377, 65378, 65379,
      65380, 65381, 7386, 1567, 7410, 1600, 43062, 43063, 43064, 43065,
      2386, 2385, 43059, 43060, 43061, 43056, 43057, 43058, 2404, 2405
    };
112
handleError(ErrorCode & status,const char * context)113 void handleError(ErrorCode& status, const char* context) {
114 if (status.isFailure()) {
115 std::cerr << "Error: " << context << ": " << status.errorName() << std::endl;
116 exit(status.reset());
117 }
118 }
119
120 class PropertyValueNameGetter : public ValueNameGetter {
121 public:
PropertyValueNameGetter(UProperty prop)122 PropertyValueNameGetter(UProperty prop) : property(prop) {}
123 ~PropertyValueNameGetter() override;
getName(uint32_t value)124 const char *getName(uint32_t value) override {
125 return u_getPropertyValueName(property, value, U_SHORT_PROPERTY_NAME);
126 }
127
128 private:
129 UProperty property;
130 };
131
~PropertyValueNameGetter()132 PropertyValueNameGetter::~PropertyValueNameGetter() {}
133
134 // Dump an aliases = [...] key for properties with aliases
dumpPropertyAliases(UProperty uproperty,FILE * f)135 void dumpPropertyAliases(UProperty uproperty, FILE* f) {
136 int i = U_LONG_PROPERTY_NAME + 1;
137
138 while(true) {
139 // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially,
140 // and returning null after that
141 const char* alias = u_getPropertyName(uproperty, (UPropertyNameChoice) i);
142 if (!alias) {
143 break;
144 }
145 if (i == U_LONG_PROPERTY_NAME + 1) {
146 fprintf(f, "aliases = [\"%s\"", alias);
147 } else {
148 fprintf(f, ", \"%s\"", alias);
149 }
150 i++;
151 }
152 if (i != U_LONG_PROPERTY_NAME + 1) {
153 fprintf(f, "]\n");
154 }
155 }
156
dumpBinaryProperty(UProperty uproperty,FILE * f)157 void dumpBinaryProperty(UProperty uproperty, FILE* f) {
158 IcuToolErrorCode status("icuexportdata: dumpBinaryProperty");
159 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
160 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
161 const USet* uset = u_getBinaryPropertySet(uproperty, status);
162 handleError(status, fullPropName);
163
164 fputs("[[binary_property]]\n", f);
165 fprintf(f, "long_name = \"%s\"\n", fullPropName);
166 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
167 fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
168 dumpPropertyAliases(uproperty, f);
169 usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML);
170 }
171
172 // If the value exists, dump an indented entry of the format
173 // `" {discr = <discriminant>, long = <longname>, short = <shortname>, aliases = [<aliases>]},"`
dumpValueEntry(UProperty uproperty,int v,bool is_mask,FILE * f)174 void dumpValueEntry(UProperty uproperty, int v, bool is_mask, FILE* f) {
175 const char* fullValueName = u_getPropertyValueName(uproperty, v, U_LONG_PROPERTY_NAME);
176 const char* shortValueName = u_getPropertyValueName(uproperty, v, U_SHORT_PROPERTY_NAME);
177 if (!fullValueName) {
178 return;
179 }
180 if (is_mask) {
181 fprintf(f, " {discr = 0x%X", v);
182 } else {
183 fprintf(f, " {discr = %i", v);
184 }
185 fprintf(f, ", long = \"%s\"", fullValueName);
186 if (shortValueName) {
187 fprintf(f, ", short = \"%s\"", shortValueName);
188 }
189 int i = U_LONG_PROPERTY_NAME + 1;
190 while(true) {
191 // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially,
192 // and returning null after that
193 const char* alias = u_getPropertyValueName(uproperty, v, (UPropertyNameChoice) i);
194 if (!alias) {
195 break;
196 }
197 if (i == U_LONG_PROPERTY_NAME + 1) {
198 fprintf(f, ", aliases = [\"%s\"", alias);
199 } else {
200 fprintf(f, ", \"%s\"", alias);
201 }
202 i++;
203 }
204 if (i != U_LONG_PROPERTY_NAME + 1) {
205 fprintf(f, "]");
206 }
207 fprintf(f, "},\n");
208 }
209
dumpEnumeratedProperty(UProperty uproperty,FILE * f)210 void dumpEnumeratedProperty(UProperty uproperty, FILE* f) {
211 IcuToolErrorCode status("icuexportdata: dumpEnumeratedProperty");
212 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
213 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
214 const UCPMap* umap = u_getIntPropertyMap(uproperty, status);
215 handleError(status, fullPropName);
216
217 fputs("[[enum_property]]\n", f);
218 fprintf(f, "long_name = \"%s\"\n", fullPropName);
219 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
220 fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
221 dumpPropertyAliases(uproperty, f);
222
223 int32_t minValue = u_getIntPropertyMinValue(uproperty);
224 U_ASSERT(minValue >= 0);
225 int32_t maxValue = u_getIntPropertyMaxValue(uproperty);
226 U_ASSERT(maxValue >= 0);
227
228 fprintf(f, "values = [\n");
229 for (int v = minValue; v <= maxValue; v++) {
230 dumpValueEntry(uproperty, v, false, f);
231 }
232 fprintf(f, "]\n");
233
234 PropertyValueNameGetter valueNameGetter(uproperty);
235 usrc_writeUCPMap(f, umap, &valueNameGetter, UPRV_TARGET_SYNTAX_TOML);
236 fputs("\n", f);
237
238
239 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
240 if (maxValue <= 0xff) {
241 width = UCPTRIE_VALUE_BITS_8;
242 } else if (maxValue <= 0xffff) {
243 width = UCPTRIE_VALUE_BITS_16;
244 }
245 LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(umap, status));
246 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
247 builder.getAlias(),
248 trieType,
249 width,
250 status));
251 handleError(status, fullPropName);
252
253 fputs("[enum_property.code_point_trie]\n", f);
254 usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
255 }
256
257 /*
258 * Export Bidi_Mirroring_Glyph values (code points) in a similar way to how enumerated
259 * properties are dumped to file.
260 * Note: the data will store 0 for code points without a value defined for
261 * Bidi_Mirroring_Glyph.
262 */
dumpBidiMirroringGlyph(FILE * f)263 void dumpBidiMirroringGlyph(FILE* f) {
264 UProperty uproperty = UCHAR_BIDI_MIRRORING_GLYPH;
265 IcuToolErrorCode status("icuexportdata: dumpBidiMirroringGlyph");
266 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
267 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
268 handleError(status, fullPropName);
269
270 // Store 21-bit code point as is
271 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
272
273 // note: unlike dumpEnumeratedProperty, which can get inversion map data using
274 // u_getIntPropertyMap(uproperty), the only reliable way to get Bidi_Mirroring_Glyph
275 // is to use u_charMirror(cp) over the code point space.
276 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
277 for(UChar32 c = UCHAR_MIN_VALUE; c <= UCHAR_MAX_VALUE; c++) {
278 UChar32 mirroringGlyph = u_charMirror(c);
279 // The trie builder code throws an error when it cannot compress the data sufficiently.
280 // Therefore, when the value is undefined for a code point, keep a 0 in the trie
281 // instead of the ICU API behavior of returning the code point value. Using 0
282 // results in a relatively significant space savings by not including redundant data.
283 if (c != mirroringGlyph) {
284 umutablecptrie_set(builder.getAlias(), c, mirroringGlyph, status);
285 }
286 }
287
288 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
289 builder.getAlias(),
290 trieType,
291 width,
292 status));
293 handleError(status, fullPropName);
294
295 // currently a trie and inversion map are the same (as relied upon in characterproperties.cpp)
296 const UCPMap* umap = reinterpret_cast<UCPMap *>(utrie.getAlias());
297
298 fputs("[[enum_property]]\n", f);
299 fprintf(f, "long_name = \"%s\"\n", fullPropName);
300 if (shortPropName) {
301 fprintf(f, "short_name = \"%s\"\n", shortPropName);
302 }
303 fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
304 dumpPropertyAliases(uproperty, f);
305
306 usrc_writeUCPMap(f, umap, nullptr, UPRV_TARGET_SYNTAX_TOML);
307 fputs("\n", f);
308
309 fputs("[enum_property.code_point_trie]\n", f);
310 usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
311 }
312
313 // After printing property value `v`, print `mask` if and only if `mask` comes immediately
314 // after the property in the listing
maybeDumpMaskValue(UProperty uproperty,uint32_t v,uint32_t mask,FILE * f)315 void maybeDumpMaskValue(UProperty uproperty, uint32_t v, uint32_t mask, FILE* f) {
316 if (U_MASK(v) < mask && U_MASK(v + 1) > mask)
317 dumpValueEntry(uproperty, mask, true, f);
318 }
319
dumpGeneralCategoryMask(FILE * f)320 void dumpGeneralCategoryMask(FILE* f) {
321 IcuToolErrorCode status("icuexportdata: dumpGeneralCategoryMask");
322 UProperty uproperty = UCHAR_GENERAL_CATEGORY_MASK;
323
324 fputs("[[mask_property]]\n", f);
325 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
326 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
327 fprintf(f, "long_name = \"%s\"\n", fullPropName);
328 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
329 fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
330 dumpPropertyAliases(uproperty, f);
331
332
333 fprintf(f, "mask_for = \"General_Category\"\n");
334 uint32_t minValue = u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY);
335 U_ASSERT(minValue >= 0);
336 uint32_t maxValue = u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY);
337 U_ASSERT(maxValue >= 0);
338
339 fprintf(f, "values = [\n");
340 for (uint32_t v = minValue; v <= maxValue; v++) {
341 dumpValueEntry(uproperty, U_MASK(v), true, f);
342
343 // We want to dump these masks "in order", which means they
344 // should come immediately after every property they contain
345 maybeDumpMaskValue(uproperty, v, U_GC_L_MASK, f);
346 maybeDumpMaskValue(uproperty, v, U_GC_LC_MASK, f);
347 maybeDumpMaskValue(uproperty, v, U_GC_M_MASK, f);
348 maybeDumpMaskValue(uproperty, v, U_GC_N_MASK, f);
349 maybeDumpMaskValue(uproperty, v, U_GC_Z_MASK, f);
350 maybeDumpMaskValue(uproperty, v, U_GC_C_MASK, f);
351 maybeDumpMaskValue(uproperty, v, U_GC_P_MASK, f);
352 maybeDumpMaskValue(uproperty, v, U_GC_S_MASK, f);
353 }
354 fprintf(f, "]\n");
355 }
356
dumpScriptExtensions(FILE * f)357 void dumpScriptExtensions(FILE* f) {
358 IcuToolErrorCode status("icuexportdata: dumpScriptExtensions");
359
360 fputs("[[script_extensions]]\n", f);
361 const char* scxFullPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_LONG_PROPERTY_NAME);
362 const char* scxShortPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_SHORT_PROPERTY_NAME);
363 fprintf(f, "long_name = \"%s\"\n", scxFullPropName);
364 if (scxShortPropName) fprintf(f, "short_name = \"%s\"\n", scxShortPropName);
365 fprintf(f, "uproperty_discr = 0x%X\n", UCHAR_SCRIPT_EXTENSIONS);
366 dumpPropertyAliases(UCHAR_SCRIPT_EXTENSIONS, f);
367
368 // We want to use 16 bits for our exported trie of sc/scx data because we
369 // need 12 bits to match the 12 bits of data stored for sc/scx in the trie
370 // in the uprops.icu data file.
371 UCPTrieValueWidth scWidth = UCPTRIE_VALUE_BITS_16;
372
373 // Create a mutable UCPTrie builder populated with Script property values data.
374 const UCPMap* scInvMap = u_getIntPropertyMap(UCHAR_SCRIPT, status);
375 handleError(status, scxFullPropName);
376 LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(scInvMap, status));
377 handleError(status, scxFullPropName);
378
379 // The values for the output scx companion array.
380 // Invariant is that all subvectors are distinct.
381 std::vector< std::vector<uint16_t> > outputDedupVec;
382
383 // The sc/scx companion array is an array of arrays (of script codes)
384 fputs("script_code_array = [\n", f);
385 for(const UChar32 cp : scxCodePoints) {
386 // Get the Script value
387 uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp);
388 // Get the Script_Extensions value (array of Script codes)
389 const int32_t SCX_ARRAY_CAPACITY = 32;
390 UScriptCode scxValArray[SCX_ARRAY_CAPACITY];
391 int32_t numScripts = uscript_getScriptExtensions(cp, scxValArray, SCX_ARRAY_CAPACITY, status);
392 handleError(status, scxFullPropName);
393
394 // Convert the scx array into a vector
395 std::vector<uint16_t> scxValVec;
396 for(int i = 0; i < numScripts; i++) {
397 scxValVec.push_back(scxValArray[i]);
398 }
399 // Ensure that it is sorted
400 std::sort(scxValVec.begin(), scxValVec.end());
401 // Copy the Script value into the first position of the scx array only
402 // if we have the "other" case (Script value is not Common nor Inherited).
403 // This offers faster access when users want only the Script value.
404 if (scVal != USCRIPT_COMMON && scVal != USCRIPT_INHERITED) {
405 scxValVec.insert(scxValVec.begin(), scVal);
406 }
407
408 // See if there is already an scx value array matching the newly built one.
409 // If there is, then use its index.
410 // If not, then append the new value array.
411 bool isScxValUnique = true;
412 size_t outputIndex = 0;
413 for (outputIndex = 0; outputIndex < outputDedupVec.size(); outputIndex++) {
414 if (outputDedupVec[outputIndex] == scxValVec) {
415 isScxValUnique = false;
416 break;
417 }
418 }
419
420 if (isScxValUnique) {
421 outputDedupVec.push_back(scxValVec);
422 usrc_writeArray(f, " [", scxValVec.data(), 16, scxValVec.size(), " ", "],\n");
423 }
424
425 // We must update the value in the UCPTrie for the code point to contain:
426 // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is
427 // the index into the companion array
428 // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether
429 // 3: other
430 // 2: Script=Inherited
431 // 1: Script=Common
432 // 0: Script=value in 9..0 (N/A because we are in this loop to create the companion array for non-0 cases)
433 uint16_t mask = 0;
434 if (scVal == USCRIPT_COMMON) {
435 mask = DATAEXPORT_SCRIPT_X_WITH_COMMON;
436 } else if (scVal == USCRIPT_INHERITED) {
437 mask = DATAEXPORT_SCRIPT_X_WITH_INHERITED;
438 } else {
439 mask = DATAEXPORT_SCRIPT_X_WITH_OTHER;
440 }
441
442 // The new trie value is the index into the new array with the high order bits set
443 uint32_t newScVal = outputIndex | mask;
444
445 // Update the code point in the mutable trie builder with the trie value
446 umutablecptrie_set(builder.getAlias(), cp, newScVal, status);
447 handleError(status, scxFullPropName);
448 }
449 fputs("]\n\n", f); // Print the TOML close delimiter for the outer array.
450
451 // Convert from mutable trie builder to immutable trie.
452 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
453 builder.getAlias(),
454 trieType,
455 scWidth,
456 status));
457 handleError(status, scxFullPropName);
458
459 fputs("[script_extensions.code_point_trie]\n", f);
460 usrc_writeUCPTrie(f, scxShortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
461 }
462
prepareOutputFile(const char * basename)463 FILE* prepareOutputFile(const char* basename) {
464 IcuToolErrorCode status("icuexportdata");
465 CharString outFileName;
466 if (destdir != nullptr && *destdir != 0) {
467 outFileName.append(destdir, status).ensureEndsWithFileSeparator(status);
468 }
469 outFileName.append(basename, status);
470 outFileName.append(".toml", status);
471 handleError(status, basename);
472
473 FILE* f = fopen(outFileName.data(), "w");
474 if (f == nullptr) {
475 std::cerr << "Unable to open file: " << outFileName.data() << std::endl;
476 exit(U_FILE_ACCESS_ERROR);
477 }
478 if (!QUIET) {
479 std::cout << "Writing to: " << outFileName.data() << std::endl;
480 }
481
482 if (haveCopyright) {
483 usrc_writeCopyrightHeader(f, "#", 2021);
484 }
485 usrc_writeFileNameGeneratedBy(f, "#", basename, "icuexportdata.cpp");
486
487 return f;
488 }
489
490 #if !UCONFIG_NO_NORMALIZATION
491
// One trie entry that has been computed but not yet inserted;
// writeDecompositionData() consumes a vector of these.
struct PendingDescriptor {
    // Code point the trie value is stored for.
    UChar32 scalar;
    // Trie value before writeDecompositionData() post-processes it. When none
    // of bits 31..17 are set, the low 12 bits are treated as a storage offset
    // that may still need to be shifted; the two 16-bit halves are swapped
    // when the value is written to the trie.
    uint32_t descriptor;
    // Selects the offset adjustment in writeDecompositionData(): false means
    // the offset refers to 16-bit storage, true means 32-bit storage.
    UBool supplementary;
};
497
writeCanonicalCompositions(USet * backwardCombiningStarters)498 void writeCanonicalCompositions(USet* backwardCombiningStarters) {
499 IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions");
500 const char* basename = "compositions";
501 FILE* f = prepareOutputFile(basename);
502
503 LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status);
504
505 const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
506 UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
507
508 const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
509 for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
510 if (c >= 0xD800 && c < 0xE000) {
511 // Surrogate
512 continue;
513 }
514 UnicodeString decomposition;
515 if (!nfc->getRawDecomposition(c, decomposition)) {
516 continue;
517 }
518 int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
519 if (len != 2) {
520 continue;
521 }
522 UChar32 starter = utf32[0];
523 UChar32 second = utf32[1];
524 UChar32 composite = nfc->composePair(starter, second);
525 if (composite < 0) {
526 continue;
527 }
528 if (c != composite) {
529 status.set(U_INTERNAL_PROGRAM_ERROR);
530 handleError(status, basename);
531 }
532 if (!u_getCombiningClass(second)) {
533 uset_add(backwardCombiningStarters, second);
534 }
535 if (composite >= 0xAC00 && composite <= 0xD7A3) {
536 // Hangul syllable
537 continue;
538 }
539
540 UnicodeString backward;
541 backward.append(second);
542 backward.append(starter);
543 backwardBuilder->add(backward, int32_t(composite), status);
544 }
545 UnicodeString canonicalCompositionTrie;
546 backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status);
547
548 usrc_writeArray(f, "compositions = [\n ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), " ", "\n]\n");
549 fclose(f);
550 handleError(status, basename);
551 }
552
writeDecompositionTables(const char * basename,const uint16_t * ptr16,size_t len16,const uint32_t * ptr32,size_t len32)553 void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) {
554 FILE* f = prepareOutputFile(basename);
555 usrc_writeArray(f, "scalars16 = [\n ", ptr16, 16, len16, " ", "\n]\n");
556 usrc_writeArray(f, "scalars32 = [\n ", ptr32, 32, len32, " ", "\n]\n");
557 fclose(f);
558 }
559
writeDecompositionData(const char * basename,uint32_t baseSize16,uint32_t baseSize32,uint32_t supplementSize16,USet * uset,USet * reference,const std::vector<PendingDescriptor> & pendingTrieInsertions,char16_t passthroughCap)560 void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, char16_t passthroughCap) {
561 IcuToolErrorCode status("icuexportdata: writeDecompositionData");
562 FILE* f = prepareOutputFile(basename);
563
564 // Zero is a magic number that means the character decomposes to itself.
565 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
566
567 // Iterate backwards to insert lower code points in the trie first in case it matters
568 // for trie block allocation.
569 for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
570 const PendingDescriptor& pending = pendingTrieInsertions[i];
571 uint32_t additional = 0;
572 if (!(pending.descriptor & 0xFFFE0000)) {
573 uint32_t offset = pending.descriptor & 0xFFF;
574 if (!pending.supplementary) {
575 if (offset >= baseSize16) {
576 // This is a offset to supplementary 16-bit data. We have
577 // 16-bit base data and 32-bit base data before. However,
578 // the 16-bit base data length is already part of offset.
579 additional = baseSize32;
580 }
581 } else {
582 if (offset >= baseSize32) {
583 // This is an offset to supplementary 32-bit data. We have 16-bit
584 // base data, 32-bit base data, and 16-bit supplementary data before.
585 // However, the 32-bit base data length is already part
586 // of offset.
587 additional = baseSize16 + supplementSize16;
588 } else {
589 // This is an offset to 32-bit base data. We have 16-bit
590 // base data before.
591 additional = baseSize16;
592 }
593 }
594 if (offset + additional > 0xFFF) {
595 status.set(U_INTERNAL_PROGRAM_ERROR);
596 handleError(status, basename);
597 }
598 }
599 // It turns out it's better to swap the halves compared to the initial
600 // idea in order to put special marker values close to zero so that
601 // an important marker value becomes 1, so it's efficient to compare
602 // "1 or 0". Unfortunately, going through all the code to swap
603 // things is too error prone, so let's do the swapping here in one
604 // place.
605 uint32_t oldTrieValue = pending.descriptor + additional;
606 uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16);
607 umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status);
608 }
609 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
610 builder.getAlias(),
611 trieType,
612 UCPTRIE_VALUE_BITS_32,
613 status));
614 handleError(status, basename);
615
616 if (reference) {
617 if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
618 // NFD expectations don't hold. The set must not contain the half-width
619 // kana voicing marks and must contain iota subscript.
620 status.set(U_INTERNAL_PROGRAM_ERROR);
621 handleError(status, basename);
622 }
623
624 USet* halfWidthVoicing = uset_openEmpty();
625 uset_add(halfWidthVoicing, 0xFF9E);
626 uset_add(halfWidthVoicing, 0xFF9F);
627
628 USet* iotaSubscript = uset_openEmpty();
629 uset_add(iotaSubscript, 0x0345);
630
631 uint8_t flags = 0;
632
633 USet* halfWidthCheck = uset_cloneAsThawed(uset);
634 uset_removeAll(halfWidthCheck, reference);
635 if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
636 flags |= 1;
637 } else if (!uset_isEmpty(halfWidthCheck)) {
638 // The result was neither empty nor contained exactly
639 // the two half-width voicing marks. The ICU4X
640 // normalizer doesn't know how to deal with this case.
641 status.set(U_INTERNAL_PROGRAM_ERROR);
642 handleError(status, basename);
643 }
644 uset_close(halfWidthCheck);
645
646 USet* iotaCheck = uset_cloneAsThawed(reference);
647 uset_removeAll(iotaCheck, uset);
648 if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) {
649 // The result was neither empty nor contained exactly
650 // the iota subscript. The ICU4X normalizer doesn't
651 // know how to deal with this case.
652 status.set(U_INTERNAL_PROGRAM_ERROR);
653 handleError(status, basename);
654 }
655
656 uset_close(iotaSubscript);
657 uset_close(halfWidthVoicing);
658
659 fprintf(f, "flags = 0x%X\n", flags);
660 fprintf(f, "cap = 0x%X\n", passthroughCap);
661 }
662 fprintf(f, "[trie]\n");
663 usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
664 fclose(f);
665 handleError(status, basename);
666 }
667
// NOTE(review): the code that reads the four marker constants below is outside
// the part of the file reviewed here; the descriptions are the original ones.

// Special marker for the NFKD form of U+FDFA
const int32_t FDFA_MARKER = 3;

// Special marker for characters whose decomposition starts with a non-starter
// and the decomposition isn't the character itself.
const int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2;

// Special marker for starters that decompose to themselves but that may
// combine backwards under canonical composition
const int32_t BACKWARD_COMBINING_STARTER_MARKER = 1;

// Marker that a complex decomposition isn't round-trippable
// under re-composition.
const uint32_t NON_ROUND_TRIP_MARKER = 1;
682
permissibleBmpPair(UBool knownToRoundTrip,UChar32 c,UChar32 second)683 UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) {
684 if (knownToRoundTrip) {
685 return true;
686 }
687 // Nuktas, Hebrew presentation forms and polytonic Greek with oxia
688 // are special-cased in ICU4X.
689 if (c >= 0xFB1D && c <= 0xFB4E) {
690 // Hebrew presentation forms
691 return true;
692 }
693 if (c >= 0x1F71 && c <= 0x1FFB) {
694 // Polytonic Greek with oxia
695 return true;
696 }
697 if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) {
698 // Nukta
699 return true;
700 }
701 // To avoid more branchiness, 4 characters that decompose to
702 // a BMP starter followed by a BMP non-starter are excluded
703 // from being encoded directly into the trie value and are
704 // handled as complex decompositions instead. These are:
705 // U+0F76 TIBETAN VOWEL SIGN VOCALIC R
706 // U+0F78 TIBETAN VOWEL SIGN VOCALIC L
707 // U+212B ANGSTROM SIGN
708 // U+2ADC FORKING
709 return false;
710 }
711
712
713 // Find the slice `needle` within `storage` and return its index, failing which,
714 // append all elements of `needle` to `storage` and return the index of it at the end.
715 template<typename T>
findOrAppend(std::vector<T> & storage,const UChar32 * needle,size_t needleLen)716 size_t findOrAppend(std::vector<T>& storage, const UChar32* needle, size_t needleLen) {
717 // Last index where we might find the start of the complete needle.
718 // bounds check is `i + needleLen <= storage.size()` since the inner
719 // loop will range from `i` to `i + needleLen - 1` (the `-1` is why we use `<=`)
720 for (size_t i = 0; i + needleLen <= storage.size(); i++) {
721 for (size_t j = 0;; j++) {
722 if (j == needleLen) {
723 return i; // found a match
724 }
725 if (storage[i + j] != uint32_t(needle[j])) {
726 break;
727 }
728 }
729 }
730 // We didn't find anything. Append, keeping the append index in mind.
731 size_t index = storage.size();
732 for(size_t i = 0; i < needleLen; i++) {
733 storage.push_back(T(needle[i]));
734 }
735
736 return index;
737 }
738
739
// Computes data for canonical decompositions
//
// Walks every scalar value (except Hangul syllables) from U+10FFFF downwards,
// normalizes it with the normalizer selected by `basename`, and records how the
// result is to be stored.
//
// basename: "nfkd" selects the NFKD normalizer, "uts46d" selects the "uts46"
//     normalizer; any other value is treated as NFD. Only in the NFD case is the
//     "decompositionex" output file (non-recursive decompositions) written.
// backwardCombiningStarters: consulted (read-only) for the first scalar of each
//     decomposition whose first scalar has combining class zero.
// storage16 / storage32: receive complex decompositions via findOrAppend();
//     storage32 is used when any scalar of the decomposition is above U+FFFF.
// decompositionStartsWithNonStarter: gets every `c` whose decomposition starts with
//     a scalar of non-zero combining class.
// decompositionStartsWithBackwardCombiningStarter: gets every `c` whose decomposition
//     starts with a member of `backwardCombiningStarters`.
// pendingTrieInsertions: receives one three-field entry (code point, value, flag)
//     per character that needs a trie value.
// decompositionPassthroughBound / compositionPassthroughBound: overwritten with `c`
//     each time a relevant character is seen; since the iteration descends, they end
//     up holding the lowest such code point (or stay unchanged if none is seen).
//
// Violated expectations are reported by setting U_INTERNAL_PROGRAM_ERROR on the local
// status and passing it to handleError().
void computeDecompositions(const char* basename,
                           const USet* backwardCombiningStarters,
                           std::vector<uint16_t>& storage16,
                           std::vector<uint32_t>& storage32,
                           USet* decompositionStartsWithNonStarter,
                           USet* decompositionStartsWithBackwardCombiningStarter,
                           std::vector<PendingDescriptor>& pendingTrieInsertions,
                           UChar32& decompositionPassthroughBound,
                           UChar32& compositionPassthroughBound) {
    IcuToolErrorCode status("icuexportdata: computeDecompositions");
    const Normalizer2* mainNormalizer;
    const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status);
    const Normalizer2* nfcNormalizer = Normalizer2::getNFCInstance(status);
    // Output file for the non-recursive decomposition data; stays null unless
    // we are building NFD.
    FILE* f = nullptr;
    // Side storage for raw (non-recursive) decompositions that do not fit in a
    // single trie value.
    std::vector<uint32_t> nonRecursive32;
    LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status));

    if (uprv_strcmp(basename, "nfkd") == 0) {
        mainNormalizer = Normalizer2::getNFKDInstance(status);
    } else if (uprv_strcmp(basename, "uts46d") == 0) {
        mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status);
    } else {
        mainNormalizer = nfdNormalizer;
        f = prepareOutputFile("decompositionex");
    }

    // Max length as of Unicode 14 is 4 for NFD. For NFKD the max
    // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB).
    const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9;
    const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8;
    const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
    UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
    const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2;
    UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE];

    // Iterate over all scalar values excluding Hangul syllables.
    //
    // We go backwards in order to better find overlapping decompositions.
    //
    // As of Unicode 14:
    // Iterate forward without overlap search:
    // nfd: 16 size: 896, 32 size: 173
    // nfkd: 16 size: 3854, 32 size: 179
    //
    // Iterate forward with overlap search:
    // nfd: 16 size: 888, 32 size: 173
    // nfkd: 16 size: 3266, 32 size: 179
    //
    // Iterate backward with overlap search:
    // nfd: 16 size: 776, 32 size: 173
    // nfkd: 16 size: 2941, 32 size: 179
    //
    // UChar32 is signed!
    for (UChar32 c = 0x10FFFF; c >= 0; --c) {
        if (c >= 0xAC00 && c <= 0xD7A3) {
            // Hangul syllable
            continue;
        }
        if (c >= 0xD800 && c < 0xE000) {
            // Surrogate
            continue;
        }
        UnicodeString src;
        UnicodeString dst;
        // True if we're building non-NFD or we're building NFD but
        // the `c` round trips to NFC.
        // False if we're building NFD and `c` does not round trip to NFC.
        UBool nonNfdOrRoundTrips = true;
        src.append(c);
        if (mainNormalizer != nfdNormalizer) {
            // Non-NFD form: apply the main normalizer first, then NFD on its output.
            UnicodeString inter;
            mainNormalizer->normalize(src, inter, status);
            nfdNormalizer->normalize(inter, dst, status);
        } else {
            // NFD: additionally check whether composing the result gives back `c`.
            nfdNormalizer->normalize(src, dst, status);
            UnicodeString nfc;
            nfcNormalizer->normalize(dst, nfc, status);
            nonNfdOrRoundTrips = (src == nfc);
        }
        int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
        if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
            // Characters that normalize to nothing or to U+FFFD (without the
            // input being U+FFFD) in ICU4C's UTS 46 normalization normalize
            // as in NFD in ICU4X's UTS 46 normalization in the interest
            // of data size and ICU4X's normalizer being unable to handle
            // normalizing to nothing.
            // When UTS 46 is implemented on top of ICU4X, a preprocessing
            // step is supposed to remove these characters before the
            // normalization step.
            if (uprv_strcmp(basename, "uts46d") != 0) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            nfdNormalizer->normalize(src, dst, status);
            len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
            if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
        }
        // The decomposition must have fit into the fixed-size buffer.
        if (len > DECOMPOSITION_BUFFER_SIZE) {
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]);
        bool specialNonStarterDecomposition = false;
        bool startsWithBackwardCombiningStarter = false;
        if (firstCombiningClass) {
            // The decomposition starts with a non-starter.
            decompositionPassthroughBound = c;
            compositionPassthroughBound = c;
            uset_add(decompositionStartsWithNonStarter, c);
            if (src != dst) {
                if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) {
                    specialNonStarterDecomposition = true;
                } else {
                    // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }
            }
        } else if (uset_contains(backwardCombiningStarters, utf32[0])) {
            compositionPassthroughBound = c;
            startsWithBackwardCombiningStarter = true;
            uset_add(decompositionStartsWithBackwardCombiningStarter, c);
        }
        // A single-scalar decomposition must not coincide with one of the marker
        // values unless `c` itself is that value.
        if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) {
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) {
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) {
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        if (mainNormalizer != nfdNormalizer) {
            // Non-NFD form: characters whose result equals plain NFD get no entry.
            UnicodeString nfd;
            nfdNormalizer->normalize(src, nfd, status);
            if (dst == nfd) {
                continue;
            }
            decompositionPassthroughBound = c;
            compositionPassthroughBound = c;
        } else if (firstCombiningClass) {
            // NFD, decomposition starts with a non-starter: replace the decomposition
            // with a single placeholder scalar.
            len = 1;
            if (specialNonStarterDecomposition) {
                utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value
            } else {
                // Use the surrogate range to store the canonical combining class
                utf32[0] = 0xD800 | UChar32(firstCombiningClass);
            }
        } else {
            // NFD, decomposition starts with a starter.
            if (src == dst) {
                // No decomposition; only backward-combining starters need an entry.
                if (startsWithBackwardCombiningStarter) {
                    pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, false});
                }
                continue;
            }
            decompositionPassthroughBound = c;
            // ICU4X hard-codes ANGSTROM SIGN
            if (c != 0x212B) {
                UnicodeString raw;
                if (!nfdNormalizer->getRawDecomposition(c, raw)) {
                    // We're always supposed to have a non-recursive decomposition
                    // if we had a recursive one.
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }
                // In addition to actual difference, put the whole range that contains characters
                // with oxia into the non-recursive trie in order to catch cases where characters
                // with oxia have singleton decompositions to corresponding characters with tonos.
                // This way, the run-time decision to fall through can be done on the range
                // without checking for individual characters inside the range.
                if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) {
                    int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status);
                    if (!rawLen) {
                        status.set(U_INTERNAL_PROGRAM_ERROR);
                        handleError(status, basename);
                    }
                    if (rawLen == 1) {
                        // Singleton: the scalar itself is the trie value.
                        if (c >= 0xFFFF) {
                            status.set(U_INTERNAL_PROGRAM_ERROR);
                            handleError(status, basename);
                        }
                        umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, uint32_t(rawUtf32[0]), status);
                    } else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) {
                        // Two BMP scalars: pack both into one 32-bit trie value.
                        if (!rawUtf32[0] || !rawUtf32[1]) {
                            status.set(U_INTERNAL_PROGRAM_ERROR);
                            handleError(status, basename);
                        }
                        // Swapped for consistency with the primary trie
                        uint32_t bmpPair = uint32_t(rawUtf32[1]) << 16 | uint32_t(rawUtf32[0]);
                        umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status);
                    } else {
                        // Let's add 1 to index to make it always non-zero to distinguish
                        // it from the default zero.
                        uint32_t index = nonRecursive32.size() + 1;
                        nonRecursive32.push_back(uint32_t(rawUtf32[0]));
                        nonRecursive32.push_back(uint32_t(rawUtf32[1]));
                        if (index > 0xFFFF) {
                            status.set(U_INTERNAL_PROGRAM_ERROR);
                            handleError(status, basename);
                        }
                        umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status);
                    }
                }
            }
        }
        if (!nonNfdOrRoundTrips) {
            compositionPassthroughBound = c;
        }
        if (len == 1 && utf32[0] <= 0xFFFF) {
            // Single BMP scalar: stored directly in the upper 16 bits of the value.
            if (startsWithBackwardCombiningStarter) {
                if (mainNormalizer == nfdNormalizer) {
                    // Not supposed to happen in NFD
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                } else if (!((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) {
                    // Other than conjoining jamo vowels and trails
                    // unsupported for non-NFD.
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }
            }
            pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, false});
        } else if (len == 2 &&
                   utf32[0] <= 0xFFFF &&
                   utf32[1] <= 0xFFFF &&
                   !u_getCombiningClass(utf32[0]) &&
                   u_getCombiningClass(utf32[1]) &&
                   permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) {
            // Starter + non-starter BMP pair: both scalars are packed into the value.
            for (int32_t i = 0; i < len; ++i) {
                if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
                    // Assert that iota subscript and half-width voicing marks never occur in these
                    // expansions in the normalization forms where they are special.
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }
            }
            if (startsWithBackwardCombiningStarter) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), false});
        } else {
            // Everything else goes into storage16/storage32 and is referenced
            // through a descriptor.
            if (startsWithBackwardCombiningStarter) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }

            UBool supplementary = false;
            UBool nonInitialStarter = false;
            for (int32_t i = 0; i < len; ++i) {
                if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
                    // Assert that iota subscript and half-width voicing marks never occur in these
                    // expansions in the normalization forms where they are special.
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }

                if (utf32[i] > 0xFFFF) {
                    supplementary = true;
                }
                if (utf32[i] == 0) {
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }
                if (i != 0 && !u_getCombiningClass(utf32[i])) {
                    nonInitialStarter = true;
                }
            }
            if (!supplementary) {
                if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
                    if (len == 18 && c == 0xFDFA) {
                        // Special marker for the one character whose decomposition
                        // is too long.
                        pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary});
                        continue;
                    } else {
                        status.set(U_INTERNAL_PROGRAM_ERROR);
                        handleError(status, basename);
                    }
                }
            } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            // Complex decomposition
            // Format for 16-bit value:
            // 15..13: length minus two for 16-bit case and length minus one for
            //         the 32-bit case. Length 8 needs to fit in three bits in
            //         the 16-bit case, and this way the value is future-proofed
            //         up to 9 in the 16-bit case. Zero is unused and length one
            //         in the 16-bit case goes directly into the trie.
            //     12: 1 if all trailing characters are guaranteed non-starters,
            //         0 if no guarantees about non-starterness.
            //         Note: The bit choice is this way around to allow for
            //         dynamically falling back to not having this but instead
            //         having one more bit for length by merely choosing
            //         different masks.
            //  11..0: Start offset in storage. The offset is to the logical
            //         sequence of scalars16, scalars32, supplementary_scalars16,
            //         supplementary_scalars32.
            uint32_t descriptor = uint32_t(!nonInitialStarter) << 12;
            if (!supplementary) {
                descriptor |= (uint32_t(len) - 2) << 13;
            } else {
                descriptor |= (uint32_t(len) - 1) << 13;
            }
            // The offset bits must still be clear before the index is merged in.
            if (descriptor & 0xFFF) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            size_t index = 0;
            if (!supplementary) {
                index = findOrAppend(storage16, utf32, len);
            } else {
                index = findOrAppend(storage32, utf32, len);
            }
            // The index has to fit into the 12 offset bits.
            if (index > 0xFFF) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            descriptor |= uint32_t(index);
            if (!descriptor || descriptor > 0xFFFF) {
                // > 0xFFFF should never happen if the code above is correct.
                // == 0 should not happen due to the nature of the data.
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            uint32_t nonRoundTripMarker = 0;
            if (!nonNfdOrRoundTrips) {
                nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16);
            }
            pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker, supplementary});
        }
    }
    // Both storages together must stay within what a 12-bit offset can reach.
    // The error is only recorded here; it is reported by a handleError() call below.
    if (storage16.size() + storage32.size() > 0xFFF) {
        status.set(U_INTERNAL_PROGRAM_ERROR);
    }
    if (f) {
        // NFD only: write the non-recursive decomposition side table and trie.
        usrc_writeArray(f, "scalars32 = [\n ", nonRecursive32.data(), 32, nonRecursive32.size(), " ", "\n]\n");

        LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
            nonRecursiveBuilder.getAlias(),
            trieType,
            UCPTRIE_VALUE_BITS_32,
            status));
        handleError(status, basename);

        fprintf(f, "[trie]\n");
        usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);

        fclose(f);
    }
    handleError(status, basename);
}
1100
1101 #endif // !UCONFIG_NO_NORMALIZATION
1102
// Indices into the options[] table defined below. The enumerators must be kept in
// the same order as the entries of that table, because the code looks options up
// as options[OPT_...].
enum {
    OPT_HELP_H,
    OPT_HELP_QUESTION_MARK,
    OPT_MODE,
    OPT_TRIE_TYPE,
    OPT_VERSION,
    OPT_DESTDIR,
    OPT_ALL,
    OPT_INDEX,
    OPT_COPYRIGHT,
    OPT_VERBOSE,
    OPT_QUIET,

    // Not an option: its value is the number of enumerators above.
    OPT_COUNT
};
1118
// Option definitions specific to this tool, for use in the options[] table.
// Each one passes a long option name, a short-name character and an argument
// requirement to UOPTION_DEF. The options defined with '\1' in the short-name slot
// are the ones the usage text lists with a long name only.
#define UOPTION_MODE UOPTION_DEF("mode", 'm', UOPT_REQUIRES_ARG)
#define UOPTION_TRIE_TYPE UOPTION_DEF("trie-type", '\1', UOPT_REQUIRES_ARG)
#define UOPTION_ALL UOPTION_DEF("all", '\1', UOPT_NO_ARG)
#define UOPTION_INDEX UOPTION_DEF("index", '\1', UOPT_NO_ARG)
1123
// Command-line option table passed to u_parseArgs() in main(). Entries are indexed
// with the OPT_* enumerators, so the order here has to match that enum.
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_MODE,
    UOPTION_TRIE_TYPE,
    UOPTION_VERSION,
    UOPTION_DESTDIR,
    UOPTION_ALL,
    UOPTION_INDEX,
    UOPTION_COPYRIGHT,
    UOPTION_VERBOSE,
    UOPTION_QUIET,
};
1137
// Writes the tool's usage message to `stdfile`. `program` is substituted as the
// command name in the first line of the message.
void printHelp(FILE* stdfile, const char* program) {
    // Only the first line depends on the program name; the rest is constant text.
    fprintf(stdfile, "usage: %s -m mode [-options] [--all | properties...]\n", program);
    fputs("\tdump Unicode property data to .toml files\n"
          "options:\n"
          "\t-h or -? or --help this usage text\n"
          "\t-V or --version show a version message\n"
          "\t-m or --mode mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n"
          "\t --trie-type set the trie type (small or fast, default small)\n"
          "\t-d or --destdir destination directory, followed by the path\n"
          "\t --all write out all properties known to icuexportdata\n"
          "\t --index write an _index.toml summarizing all data exported\n"
          "\t-c or --copyright include a copyright notice\n"
          "\t-v or --verbose Turn on verbose output\n"
          "\t-q or --quiet do not display warnings and progress\n",
          stdfile);
}
1155
exportUprops(int argc,char * argv[])1156 int exportUprops(int argc, char* argv[]) {
1157 // Load list of Unicode properties
1158 std::vector<const char*> propNames;
1159 for (int i=1; i<argc; i++) {
1160 propNames.push_back(argv[i]);
1161 }
1162 if (options[OPT_ALL].doesOccur) {
1163 int i = UCHAR_BINARY_START;
1164 while (true) {
1165 if (i == UCHAR_BINARY_LIMIT) {
1166 i = UCHAR_INT_START;
1167 }
1168 if (i == UCHAR_INT_LIMIT) {
1169 i = UCHAR_GENERAL_CATEGORY_MASK;
1170 }
1171 if (i == UCHAR_GENERAL_CATEGORY_MASK + 1) {
1172 i = UCHAR_BIDI_MIRRORING_GLYPH;
1173 }
1174 if (i == UCHAR_BIDI_MIRRORING_GLYPH + 1) {
1175 i = UCHAR_SCRIPT_EXTENSIONS;
1176 }
1177 if (i == UCHAR_SCRIPT_EXTENSIONS + 1) {
1178 break;
1179 }
1180 UProperty uprop = static_cast<UProperty>(i);
1181 const char* propName = u_getPropertyName(uprop, U_SHORT_PROPERTY_NAME);
1182 if (propName == nullptr) {
1183 propName = u_getPropertyName(uprop, U_LONG_PROPERTY_NAME);
1184 if (propName != nullptr && VERBOSE) {
1185 std::cerr << "Note: falling back to long name for: " << propName << std::endl;
1186 }
1187 }
1188 if (propName != nullptr) {
1189 propNames.push_back(propName);
1190 } else {
1191 std::cerr << "Warning: Could not find name for: " << uprop << std::endl;
1192 }
1193 i++;
1194 }
1195 }
1196
1197 if (propNames.empty()
1198 || options[OPT_HELP_H].doesOccur
1199 || options[OPT_HELP_QUESTION_MARK].doesOccur
1200 || !options[OPT_MODE].doesOccur) {
1201 FILE *stdfile=argc<0 ? stderr : stdout;
1202 fprintf(stdfile,
1203 "usage: %s -m uprops [-options] [--all | properties...]\n"
1204 "\tdump Unicode property data to .toml files\n"
1205 "options:\n"
1206 "\t-h or -? or --help this usage text\n"
1207 "\t-V or --version show a version message\n"
1208 "\t-m or --mode mode: currently only 'uprops', but more may be added\n"
1209 "\t --trie-type set the trie type (small or fast, default small)\n"
1210 "\t-d or --destdir destination directory, followed by the path\n"
1211 "\t --all write out all properties known to icuexportdata\n"
1212 "\t --index write an _index.toml summarizing all data exported\n"
1213 "\t-c or --copyright include a copyright notice\n"
1214 "\t-v or --verbose Turn on verbose output\n"
1215 "\t-q or --quiet do not display warnings and progress\n",
1216 argv[0]);
1217 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1218 }
1219
1220 const char* mode = options[OPT_MODE].value;
1221 if (uprv_strcmp(mode, "uprops") != 0) {
1222 fprintf(stderr, "Invalid option for --mode (must be uprops)\n");
1223 return U_ILLEGAL_ARGUMENT_ERROR;
1224 }
1225
1226 if (options[OPT_TRIE_TYPE].doesOccur) {
1227 if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1228 trieType = UCPTRIE_TYPE_FAST;
1229 } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1230 trieType = UCPTRIE_TYPE_SMALL;
1231 } else {
1232 fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1233 return U_ILLEGAL_ARGUMENT_ERROR;
1234 }
1235 }
1236
1237 for (const char* propName : propNames) {
1238 UProperty propEnum = u_getPropertyEnum(propName);
1239 if (propEnum == UCHAR_INVALID_CODE) {
1240 std::cerr << "Error: Invalid property alias: " << propName << std::endl;
1241 return U_ILLEGAL_ARGUMENT_ERROR;
1242 }
1243
1244 FILE* f = prepareOutputFile(propName);
1245
1246 UVersionInfo versionInfo;
1247 u_getUnicodeVersion(versionInfo);
1248 char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1249 u_versionToString(versionInfo, uvbuf);
1250 fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1251 U_ICU_VERSION,
1252 uvbuf);
1253
1254 if (propEnum < UCHAR_BINARY_LIMIT) {
1255 dumpBinaryProperty(propEnum, f);
1256 } else if (UCHAR_INT_START <= propEnum && propEnum <= UCHAR_INT_LIMIT) {
1257 dumpEnumeratedProperty(propEnum, f);
1258 } else if (propEnum == UCHAR_GENERAL_CATEGORY_MASK) {
1259 dumpGeneralCategoryMask(f);
1260 } else if (propEnum == UCHAR_BIDI_MIRRORING_GLYPH) {
1261 dumpBidiMirroringGlyph(f);
1262 } else if (propEnum == UCHAR_SCRIPT_EXTENSIONS) {
1263 dumpScriptExtensions(f);
1264 } else {
1265 std::cerr << "Don't know how to write property: " << propEnum << std::endl;
1266 return U_INTERNAL_PROGRAM_ERROR;
1267 }
1268
1269 fclose(f);
1270 }
1271
1272 if (options[OPT_INDEX].doesOccur) {
1273 FILE* f = prepareOutputFile("_index");
1274 fprintf(f, "index = [\n");
1275 for (const char* propName : propNames) {
1276 // At this point, propName is a valid property name, so it should be alphanum ASCII
1277 fprintf(f, " { filename=\"%s.toml\" },\n", propName);
1278 }
1279 fprintf(f, "]\n");
1280 fclose(f);
1281 }
1282
1283 return 0;
1284 }
1285
// Context object handed to addRangeToUCPTrie() through utrie2_enum()'s untyped
// context pointer; it carries the mutable trie that the enumerated ranges are
// copied into.
struct AddRangeHelper {
    UMutableCPTrie* ucptrie;
};
1289
1290 static UBool U_CALLCONV
addRangeToUCPTrie(const void * context,UChar32 start,UChar32 end,uint32_t value)1291 addRangeToUCPTrie(const void* context, UChar32 start, UChar32 end, uint32_t value) {
1292 IcuToolErrorCode status("addRangeToUCPTrie");
1293 UMutableCPTrie* ucptrie = ((const AddRangeHelper*) context)->ucptrie;
1294 umutablecptrie_setRange(ucptrie, start, end, value, status);
1295 handleError(status, "setRange");
1296
1297 return true;
1298 }
1299
exportCase(int argc,char * argv[])1300 int exportCase(int argc, char* argv[]) {
1301 if (argc > 1) {
1302 fprintf(stderr, "ucase mode does not expect additional arguments\n");
1303 return U_ILLEGAL_ARGUMENT_ERROR;
1304 }
1305 (void) argv; // Suppress unused variable warning
1306
1307 IcuToolErrorCode status("icuexportdata");
1308 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
1309 handleError(status, "exportCase");
1310
1311 int32_t exceptionsLength, unfoldLength;
1312 const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength);
1313 const UTrie2* caseTrie = &caseProps->trie;
1314
1315 AddRangeHelper helper = { builder.getAlias() };
1316 utrie2_enum(caseTrie, nullptr, addRangeToUCPTrie, &helper);
1317
1318 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16;
1319 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1320 builder.getAlias(),
1321 trieType,
1322 width,
1323 status));
1324 handleError(status, "exportCase");
1325
1326 FILE* f = prepareOutputFile("ucase");
1327
1328 UVersionInfo versionInfo;
1329 u_getUnicodeVersion(versionInfo);
1330 char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1331 u_versionToString(versionInfo, uvbuf);
1332 fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1333 U_ICU_VERSION,
1334 uvbuf);
1335
1336 fputs("[ucase.code_point_trie]\n", f);
1337 usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1338 fputs("\n", f);
1339
1340 const char* indent = " ";
1341 const char* suffix = "\n]\n";
1342
1343 fputs("[ucase.exceptions]\n", f);
1344 const char* exceptionsPrefix = "exceptions = [\n ";
1345 int32_t exceptionsWidth = 16;
1346 usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth,
1347 exceptionsLength, indent, suffix);
1348 fputs("\n", f);
1349
1350 fputs("[ucase.unfold]\n", f);
1351 const char* unfoldPrefix = "unfold = [\n ";
1352 int32_t unfoldWidth = 16;
1353 usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth,
1354 unfoldLength, indent, suffix);
1355
1356 return 0;
1357 }
1358
1359 #if !UCONFIG_NO_NORMALIZATION
1360
exportNorm()1361 int exportNorm() {
1362 IcuToolErrorCode status("icuexportdata: exportNorm");
1363 USet* backwardCombiningStarters = uset_openEmpty();
1364 writeCanonicalCompositions(backwardCombiningStarters);
1365
1366 std::vector<uint16_t> storage16;
1367 std::vector<uint32_t> storage32;
1368
1369 // Note: the USets are not exported. They are only used to check that a new
1370 // Unicode version doesn't violate expectations that are hard-coded in ICU4X.
1371 USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
1372 USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1373 std::vector<PendingDescriptor> nfdPendingTrieInsertions;
1374 UChar32 nfdBound = 0x10FFFF;
1375 UChar32 nfcBound = 0x10FFFF;
1376 computeDecompositions("nfd",
1377 backwardCombiningStarters,
1378 storage16,
1379 storage32,
1380 nfdDecompositionStartsWithNonStarter,
1381 nfdDecompositionStartsWithBackwardCombiningStarter,
1382 nfdPendingTrieInsertions,
1383 nfdBound,
1384 nfcBound);
1385 if (!(nfdBound == 0xC0 && nfcBound == 0x300)) {
1386 // Unexpected bounds for NFD/NFC.
1387 status.set(U_INTERNAL_PROGRAM_ERROR);
1388 handleError(status, "exportNorm");
1389 }
1390
1391 uint32_t baseSize16 = storage16.size();
1392 uint32_t baseSize32 = storage32.size();
1393
1394 USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty();
1395 USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1396 std::vector<PendingDescriptor> nfkdPendingTrieInsertions;
1397 UChar32 nfkdBound = 0x10FFFF;
1398 UChar32 nfkcBound = 0x10FFFF;
1399 computeDecompositions("nfkd",
1400 backwardCombiningStarters,
1401 storage16,
1402 storage32,
1403 nfkdDecompositionStartsWithNonStarter,
1404 nfkdDecompositionStartsWithBackwardCombiningStarter,
1405 nfkdPendingTrieInsertions,
1406 nfkdBound,
1407 nfkcBound);
1408 if (!(nfkdBound <= 0xC0 && nfkcBound <= 0x300)) {
1409 status.set(U_INTERNAL_PROGRAM_ERROR);
1410 handleError(status, "exportNorm");
1411 }
1412 if (nfkcBound > 0xC0) {
1413 if (nfkdBound != 0xC0) {
1414 status.set(U_INTERNAL_PROGRAM_ERROR);
1415 handleError(status, "exportNorm");
1416 }
1417 } else {
1418 if (nfkdBound != nfkcBound) {
1419 status.set(U_INTERNAL_PROGRAM_ERROR);
1420 handleError(status, "exportNorm");
1421 }
1422 }
1423
1424 USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty();
1425 USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1426 std::vector<PendingDescriptor> uts46PendingTrieInsertions;
1427 UChar32 uts46dBound = 0x10FFFF;
1428 UChar32 uts46Bound = 0x10FFFF;
1429 computeDecompositions("uts46d",
1430 backwardCombiningStarters,
1431 storage16,
1432 storage32,
1433 uts46DecompositionStartsWithNonStarter,
1434 uts46DecompositionStartsWithBackwardCombiningStarter,
1435 uts46PendingTrieInsertions,
1436 uts46dBound,
1437 uts46Bound);
1438 if (!(uts46dBound <= 0xC0 && uts46Bound <= 0x300)) {
1439 status.set(U_INTERNAL_PROGRAM_ERROR);
1440 handleError(status, "exportNorm");
1441 }
1442 if (uts46Bound > 0xC0) {
1443 if (uts46dBound != 0xC0) {
1444 status.set(U_INTERNAL_PROGRAM_ERROR);
1445 handleError(status, "exportNorm");
1446 }
1447 } else {
1448 if (uts46dBound != uts46Bound) {
1449 status.set(U_INTERNAL_PROGRAM_ERROR);
1450 handleError(status, "exportNorm");
1451 }
1452 }
1453
1454 uint32_t supplementSize16 = storage16.size() - baseSize16;
1455 uint32_t supplementSize32 = storage32.size() - baseSize32;
1456
1457 writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, char16_t(nfcBound));
1458 writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, char16_t(nfkcBound));
1459 writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, char16_t(uts46Bound));
1460
1461 writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32);
1462 writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32);
1463
1464 uset_close(nfdDecompositionStartsWithNonStarter);
1465 uset_close(nfkdDecompositionStartsWithNonStarter);
1466 uset_close(uts46DecompositionStartsWithNonStarter);
1467
1468 uset_close(nfdDecompositionStartsWithBackwardCombiningStarter);
1469 uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter);
1470 uset_close(uts46DecompositionStartsWithBackwardCombiningStarter);
1471
1472 uset_close(backwardCombiningStarters);
1473 handleError(status, "exportNorm");
1474 return 0;
1475 }
1476
1477 #endif // !UCONFIG_NO_NORMALIZATION
1478
main(int argc,char * argv[])1479 int main(int argc, char* argv[]) {
1480 U_MAIN_INIT_ARGS(argc, argv);
1481
1482 /* preset then read command line options */
1483 options[OPT_DESTDIR].value=u_getDataDirectory();
1484 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
1485
1486 if(options[OPT_VERSION].doesOccur) {
1487 printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n",
1488 U_ICU_DATA_VERSION);
1489 printf("%s\n", U_COPYRIGHT_STRING);
1490 exit(0);
1491 }
1492
1493 /* error handling, printing usage message */
1494 if(argc<0) {
1495 fprintf(stderr,
1496 "error in command line argument \"%s\"\n",
1497 argv[-argc]);
1498 }
1499
1500 if (argc < 0
1501 || options[OPT_HELP_H].doesOccur
1502 || options[OPT_HELP_QUESTION_MARK].doesOccur
1503 || !options[OPT_MODE].doesOccur) {
1504 FILE *stdfile=argc<0 ? stderr : stdout;
1505 printHelp(stdfile, argv[0]);
1506 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1507 }
1508
1509 /* get the options values */
1510 haveCopyright = options[OPT_COPYRIGHT].doesOccur;
1511 destdir = options[OPT_DESTDIR].value;
1512 VERBOSE = options[OPT_VERBOSE].doesOccur;
1513 QUIET = options[OPT_QUIET].doesOccur;
1514
1515 if (options[OPT_TRIE_TYPE].doesOccur) {
1516 if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1517 trieType = UCPTRIE_TYPE_FAST;
1518 } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1519 trieType = UCPTRIE_TYPE_SMALL;
1520 } else {
1521 fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1522 return U_ILLEGAL_ARGUMENT_ERROR;
1523 }
1524 }
1525
1526 const char* mode = options[OPT_MODE].value;
1527 if (uprv_strcmp(mode, "norm") == 0) {
1528 #if !UCONFIG_NO_NORMALIZATION
1529 return exportNorm();
1530 #else
1531 fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n");
1532 return U_ILLEGAL_ARGUMENT_ERROR;
1533 #endif
1534 }
1535 if (uprv_strcmp(mode, "uprops") == 0) {
1536 return exportUprops(argc, argv);
1537 } else if (uprv_strcmp(mode, "ucase") == 0) {
1538 return exportCase(argc, argv);
1539 }
1540
1541 fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n");
1542 return U_ILLEGAL_ARGUMENT_ERROR;
1543 }
1544