1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include <cstddef>
5 #include <cstdint>
6 #include <cstdio>
7 #include <iostream>
8 #include <unicode/localpointer.h>
9 #include <unicode/umachine.h>
10 #include <unicode/unistr.h>
11 #include <unicode/urename.h>
12 #include <unicode/uset.h>
13 #include <vector>
14 #include <algorithm>
15 #include "toolutil.h"
16 #include "uoptions.h"
17 #include "cmemory.h"
18 #include "charstr.h"
19 #include "cstring.h"
20 #include "unicode/uchar.h"
21 #include "unicode/errorcode.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uscript.h"
24 #include "unicode/putil.h"
25 #include "unicode/umutablecptrie.h"
26 #include "unicode/ucharstriebuilder.h"
27 #include "ucase.h"
28 #include "unicode/normalizer2.h"
29 #include "normalizer2impl.h"
30 #include "writesrc.h"
31
32 U_NAMESPACE_USE
33
34 /*
35 * Global - verbosity
36 */
37 UBool VERBOSE = false;
38 UBool QUIET = false;
39
40 UBool haveCopyright = true;
41 UCPTrieType trieType = UCPTRIE_TYPE_SMALL;
42 const char* destdir = "";
43
// Marker bits for modified values in the Script CodePointTrie. The trie values are
// logically 12 bits wide; these three markers are the possible non-zero settings of
// bits 11..10 (1 = Common, 2 = Inherited, 3 = other).
int16_t DATAEXPORT_SCRIPT_X_WITH_COMMON{0x0400};
int16_t DATAEXPORT_SCRIPT_X_WITH_INHERITED{0x0800};
int16_t DATAEXPORT_SCRIPT_X_WITH_OTHER{0x0c00};
48
// Hard-coded list of the code points that dumpScriptExtensions() looks up with
// uscript_getScriptExtensions() and re-encodes in the exported trie.
// NOTE(review): presumably this is every code point whose Script_Extensions value
// differs from its Script value -- confirm against the Unicode data in use.
// TODO(ICU-21821): Replace this with a call to a library function
int32_t scxCodePoints[]{
    7415, 7377, 7380, 7387, 7390, 7391, 7394, 7395, 7396, 7397,
    7398, 7399, 7400, 7403, 7404, 7406, 7407, 7408, 7409, 113824,
    113825, 113826, 113827, 834, 837, 7616, 7617, 12294, 12350, 12351,
    12688, 12689, 12690, 12691, 12692, 12693, 12694, 12695, 12696, 12697,
    12698, 12699, 12700, 12701, 12702, 12703, 12736, 12737, 12738, 12739,
    12740, 12741, 12742, 12743, 12744, 12745, 12746, 12747, 12748, 12749,
    12750, 12751, 12752, 12753, 12754, 12755, 12756, 12757, 12758, 12759,
    12760, 12761, 12762, 12763, 12764, 12765, 12766, 12767, 12768, 12769,
    12770, 12771, 12832, 12833, 12834, 12835, 12836, 12837, 12838, 12839,
    12840, 12841, 12842, 12843, 12844, 12845, 12846, 12847, 12848, 12849,
    12850, 12851, 12852, 12853, 12854, 12855, 12856, 12857, 12858, 12859,
    12860, 12861, 12862, 12863, 12864, 12865, 12866, 12867, 12868, 12869,
    12870, 12871, 12928, 12929, 12930, 12931, 12932, 12933, 12934, 12935,
    12936, 12937, 12938, 12939, 12940, 12941, 12942, 12943, 12944, 12945,
    12946, 12947, 12948, 12949, 12950, 12951, 12952, 12953, 12954, 12955,
    12956, 12957, 12958, 12959, 12960, 12961, 12962, 12963, 12964, 12965,
    12966, 12967, 12968, 12969, 12970, 12971, 12972, 12973, 12974, 12975,
    12976, 12992, 12993, 12994, 12995, 12996, 12997, 12998, 12999, 13000,
    13001, 13002, 13003, 13055, 13144, 13145, 13146, 13147, 13148, 13149,
    13150, 13151, 13152, 13153, 13154, 13155, 13156, 13157, 13158, 13159,
    13160, 13161, 13162, 13163, 13164, 13165, 13166, 13167, 13168, 13179,
    13180, 13181, 13182, 13183, 13280, 13281, 13282, 13283, 13284, 13285,
    13286, 13287, 13288, 13289, 13290, 13291, 13292, 13293, 13294, 13295,
    13296, 13297, 13298, 13299, 13300, 13301, 13302, 13303, 13304, 13305,
    13306, 13307, 13308, 13309, 13310, 119648, 119649, 119650, 119651, 119652,
    119653, 119654, 119655, 119656, 119657, 119658, 119659, 119660, 119661, 119662,
    119663, 119664, 119665, 127568, 127569, 867, 868, 869, 870, 871,
    872, 873, 874, 875, 876, 877, 878, 879, 7418, 7674,
    66272, 66273, 66274, 66275, 66276, 66277, 66278, 66279, 66280, 66281,
    66282, 66283, 66284, 66285, 66286, 66287, 66288, 66289, 66290, 66291,
    66292, 66293, 66294, 66295, 66296, 66297, 66298, 66299, 1748, 64830,
    64831, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619,
    1620, 1621, 1648, 65010, 65021, 7381, 7382, 7384, 7393, 7402,
    7405, 7413, 7414, 43249, 12330, 12331, 12332, 12333, 43471, 65794,
    65847, 65848, 65849, 65850, 65851, 65852, 65853, 65854, 65855, 1156,
    1159, 11843, 42607, 1157, 1158, 1155, 7672, 7379, 7411, 7416,
    7417, 7401, 7383, 7385, 7388, 7389, 7392, 43251, 4347, 3046,
    3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3056,
    3057, 3058, 3059, 70401, 70403, 70459, 70460, 73680, 73681, 73683,
    2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799,
    2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671,
    42752, 42753, 42754, 42755, 42756, 42757, 42758, 42759, 12337, 12338,
    12339, 12340, 12341, 12441, 12442, 12443, 12444, 12448, 12540, 65392,
    65438, 65439, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309,
    3310, 3311, 8239, 68338, 6146, 6147, 6149, 1564, 1632, 1633,
    1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 2534, 2535,
    2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 4160, 4161,
    4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 65792, 65793,
    65799, 65800, 65801, 65802, 65803, 65804, 65805, 65806, 65807, 65808,
    65809, 65810, 65811, 65812, 65813, 65814, 65815, 65816, 65817, 65818,
    65819, 65820, 65821, 65822, 65823, 65824, 65825, 65826, 65827, 65828,
    65829, 65830, 65831, 65832, 65833, 65834, 65835, 65836, 65837, 65838,
    65839, 65840, 65841, 65842, 65843, 7412, 8432, 12348, 12349, 43310,
    7376, 7378, 5941, 5942, 2406, 2407, 2408, 2409, 2410, 2411,
    2412, 2413, 2414, 2415, 12291, 12307, 12316, 12317, 12318, 12319,
    12336, 12343, 65093, 65094, 1548, 1563, 12289, 12290, 12296, 12297,
    12298, 12299, 12300, 12301, 12302, 12303, 12304, 12305, 12308, 12309,
    12310, 12311, 12312, 12313, 12314, 12315, 12539, 65377, 65378, 65379,
    65380, 65381, 7386, 1567, 7410, 1600, 43062, 43063, 43064, 43065,
    2386, 2385, 43059, 43060, 43061, 43056, 43057, 43058, 2404, 2405
};
112
handleError(ErrorCode & status,const char * context)113 void handleError(ErrorCode& status, const char* context) {
114 if (status.isFailure()) {
115 std::cerr << "Error: " << context << ": " << status.errorName() << std::endl;
116 exit(status.reset());
117 }
118 }
119
120 class PropertyValueNameGetter : public ValueNameGetter {
121 public:
PropertyValueNameGetter(UProperty prop)122 PropertyValueNameGetter(UProperty prop) : property(prop) {}
123 ~PropertyValueNameGetter() override;
getName(uint32_t value)124 const char *getName(uint32_t value) override {
125 return u_getPropertyValueName(property, value, U_SHORT_PROPERTY_NAME);
126 }
127
128 private:
129 UProperty property;
130 };
131
~PropertyValueNameGetter()132 PropertyValueNameGetter::~PropertyValueNameGetter() {}
133
134 // Dump an aliases = [...] key for properties with aliases
dumpPropertyAliases(UProperty uproperty,FILE * f)135 void dumpPropertyAliases(UProperty uproperty, FILE* f) {
136 int i = U_LONG_PROPERTY_NAME + 1;
137
138 while(true) {
139 // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially,
140 // and returning null after that
141 const char* alias = u_getPropertyName(uproperty, (UPropertyNameChoice) i);
142 if (!alias) {
143 break;
144 }
145 if (i == U_LONG_PROPERTY_NAME + 1) {
146 fprintf(f, "aliases = [\"%s\"", alias);
147 } else {
148 fprintf(f, ", \"%s\"", alias);
149 }
150 i++;
151 }
152 if (i != U_LONG_PROPERTY_NAME + 1) {
153 fprintf(f, "]\n");
154 }
155 }
156
dumpBinaryProperty(UProperty uproperty,FILE * f)157 void dumpBinaryProperty(UProperty uproperty, FILE* f) {
158 IcuToolErrorCode status("icuexportdata: dumpBinaryProperty");
159 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
160 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
161 const USet* uset = u_getBinaryPropertySet(uproperty, status);
162 handleError(status, fullPropName);
163
164 fputs("[[binary_property]]\n", f);
165 fprintf(f, "long_name = \"%s\"\n", fullPropName);
166 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
167 fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
168 dumpPropertyAliases(uproperty, f);
169 usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML);
170 }
171
172 // If the value exists, dump an indented entry of the format
173 // `" {discr = <discriminant>, long = <longname>, short = <shortname>, aliases = [<aliases>]},"`
dumpValueEntry(UProperty uproperty,int v,bool is_mask,FILE * f)174 void dumpValueEntry(UProperty uproperty, int v, bool is_mask, FILE* f) {
175 const char* fullValueName = u_getPropertyValueName(uproperty, v, U_LONG_PROPERTY_NAME);
176 const char* shortValueName = u_getPropertyValueName(uproperty, v, U_SHORT_PROPERTY_NAME);
177 if (!fullValueName) {
178 return;
179 }
180 if (is_mask) {
181 fprintf(f, " {discr = 0x%X", v);
182 } else {
183 fprintf(f, " {discr = %i", v);
184 }
185 fprintf(f, ", long = \"%s\"", fullValueName);
186 if (shortValueName) {
187 fprintf(f, ", short = \"%s\"", shortValueName);
188 }
189 int i = U_LONG_PROPERTY_NAME + 1;
190 while(true) {
191 // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially,
192 // and returning null after that
193 const char* alias = u_getPropertyValueName(uproperty, v, (UPropertyNameChoice) i);
194 if (!alias) {
195 break;
196 }
197 if (i == U_LONG_PROPERTY_NAME + 1) {
198 fprintf(f, ", aliases = [\"%s\"", alias);
199 } else {
200 fprintf(f, ", \"%s\"", alias);
201 }
202 i++;
203 }
204 if (i != U_LONG_PROPERTY_NAME + 1) {
205 fprintf(f, "]");
206 }
207 fprintf(f, "},\n");
208 }
209
dumpEnumeratedProperty(UProperty uproperty,FILE * f)210 void dumpEnumeratedProperty(UProperty uproperty, FILE* f) {
211 IcuToolErrorCode status("icuexportdata: dumpEnumeratedProperty");
212 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
213 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
214 const UCPMap* umap = u_getIntPropertyMap(uproperty, status);
215 handleError(status, fullPropName);
216
217 fputs("[[enum_property]]\n", f);
218 fprintf(f, "long_name = \"%s\"\n", fullPropName);
219 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
220 fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
221 dumpPropertyAliases(uproperty, f);
222
223 int32_t minValue = u_getIntPropertyMinValue(uproperty);
224 U_ASSERT(minValue >= 0);
225 int32_t maxValue = u_getIntPropertyMaxValue(uproperty);
226 U_ASSERT(maxValue >= 0);
227
228 fprintf(f, "values = [\n");
229 for (int v = minValue; v <= maxValue; v++) {
230 dumpValueEntry(uproperty, v, false, f);
231 }
232 fprintf(f, "]\n");
233
234 PropertyValueNameGetter valueNameGetter(uproperty);
235 usrc_writeUCPMap(f, umap, &valueNameGetter, UPRV_TARGET_SYNTAX_TOML);
236 fputs("\n", f);
237
238
239 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
240 if (maxValue <= 0xff) {
241 width = UCPTRIE_VALUE_BITS_8;
242 } else if (maxValue <= 0xffff) {
243 width = UCPTRIE_VALUE_BITS_16;
244 }
245 LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(umap, status));
246 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
247 builder.getAlias(),
248 trieType,
249 width,
250 status));
251 handleError(status, fullPropName);
252
253 fputs("[enum_property.code_point_trie]\n", f);
254 usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
255 }
256
257 /*
258 * Export Bidi_Mirroring_Glyph values (code points) in a similar way to how enumerated
259 * properties are dumped to file.
260 * Note: the data will store 0 for code points without a value defined for
261 * Bidi_Mirroring_Glyph.
262 */
dumpBidiMirroringGlyph(FILE * f)263 void dumpBidiMirroringGlyph(FILE* f) {
264 UProperty uproperty = UCHAR_BIDI_MIRRORING_GLYPH;
265 IcuToolErrorCode status("icuexportdata: dumpBidiMirroringGlyph");
266 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
267 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
268 handleError(status, fullPropName);
269
270 // Store 21-bit code point as is
271 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
272
273 // note: unlike dumpEnumeratedProperty, which can get inversion map data using
274 // u_getIntPropertyMap(uproperty), the only reliable way to get Bidi_Mirroring_Glyph
275 // is to use u_charMirror(cp) over the code point space.
276 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
277 for(UChar32 c = UCHAR_MIN_VALUE; c <= UCHAR_MAX_VALUE; c++) {
278 UChar32 mirroringGlyph = u_charMirror(c);
279 // The trie builder code throws an error when it cannot compress the data sufficiently.
280 // Therefore, when the value is undefined for a code point, keep a 0 in the trie
281 // instead of the ICU API behavior of returning the code point value. Using 0
282 // results in a relatively significant space savings by not including redundant data.
283 if (c != mirroringGlyph) {
284 umutablecptrie_set(builder.getAlias(), c, mirroringGlyph, status);
285 }
286 }
287
288 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
289 builder.getAlias(),
290 trieType,
291 width,
292 status));
293 handleError(status, fullPropName);
294
295 // currently a trie and inversion map are the same (as relied upon in characterproperties.cpp)
296 const UCPMap* umap = reinterpret_cast<UCPMap *>(utrie.getAlias());
297
298 fputs("[[enum_property]]\n", f);
299 fprintf(f, "long_name = \"%s\"\n", fullPropName);
300 if (shortPropName) {
301 fprintf(f, "short_name = \"%s\"\n", shortPropName);
302 }
303 fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
304 dumpPropertyAliases(uproperty, f);
305
306 usrc_writeUCPMap(f, umap, nullptr, UPRV_TARGET_SYNTAX_TOML);
307 fputs("\n", f);
308
309 fputs("[enum_property.code_point_trie]\n", f);
310 usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
311 }
312
313 // After printing property value `v`, print `mask` if and only if `mask` comes immediately
314 // after the property in the listing
maybeDumpMaskValue(UProperty uproperty,uint32_t v,uint32_t mask,FILE * f)315 void maybeDumpMaskValue(UProperty uproperty, uint32_t v, uint32_t mask, FILE* f) {
316 if (U_MASK(v) < mask && U_MASK(v + 1) > mask)
317 dumpValueEntry(uproperty, mask, true, f);
318 }
319
dumpGeneralCategoryMask(FILE * f)320 void dumpGeneralCategoryMask(FILE* f) {
321 IcuToolErrorCode status("icuexportdata: dumpGeneralCategoryMask");
322 UProperty uproperty = UCHAR_GENERAL_CATEGORY_MASK;
323
324 fputs("[[mask_property]]\n", f);
325 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
326 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
327 fprintf(f, "long_name = \"%s\"\n", fullPropName);
328 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
329 fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
330 dumpPropertyAliases(uproperty, f);
331
332
333 fprintf(f, "mask_for = \"General_Category\"\n");
334 uint32_t minValue = u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY);
335 U_ASSERT(minValue >= 0);
336 uint32_t maxValue = u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY);
337 U_ASSERT(maxValue >= 0);
338
339 fprintf(f, "values = [\n");
340 for (uint32_t v = minValue; v <= maxValue; v++) {
341 dumpValueEntry(uproperty, U_MASK(v), true, f);
342
343 // We want to dump these masks "in order", which means they
344 // should come immediately after every property they contain
345 maybeDumpMaskValue(uproperty, v, U_GC_L_MASK, f);
346 maybeDumpMaskValue(uproperty, v, U_GC_LC_MASK, f);
347 maybeDumpMaskValue(uproperty, v, U_GC_M_MASK, f);
348 maybeDumpMaskValue(uproperty, v, U_GC_N_MASK, f);
349 maybeDumpMaskValue(uproperty, v, U_GC_Z_MASK, f);
350 maybeDumpMaskValue(uproperty, v, U_GC_C_MASK, f);
351 maybeDumpMaskValue(uproperty, v, U_GC_P_MASK, f);
352 maybeDumpMaskValue(uproperty, v, U_GC_S_MASK, f);
353 }
354 fprintf(f, "]\n");
355 }
356
dumpScriptExtensions(FILE * f)357 void dumpScriptExtensions(FILE* f) {
358 IcuToolErrorCode status("icuexportdata: dumpScriptExtensions");
359
360 fputs("[[script_extensions]]\n", f);
361 const char* scxFullPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_LONG_PROPERTY_NAME);
362 const char* scxShortPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_SHORT_PROPERTY_NAME);
363 fprintf(f, "long_name = \"%s\"\n", scxFullPropName);
364 if (scxShortPropName) fprintf(f, "short_name = \"%s\"\n", scxShortPropName);
365 fprintf(f, "uproperty_discr = 0x%X\n", UCHAR_SCRIPT_EXTENSIONS);
366 dumpPropertyAliases(UCHAR_SCRIPT_EXTENSIONS, f);
367
368 // We want to use 16 bits for our exported trie of sc/scx data because we
369 // need 12 bits to match the 12 bits of data stored for sc/scx in the trie
370 // in the uprops.icu data file.
371 UCPTrieValueWidth scWidth = UCPTRIE_VALUE_BITS_16;
372
373 // Create a mutable UCPTrie builder populated with Script property values data.
374 const UCPMap* scInvMap = u_getIntPropertyMap(UCHAR_SCRIPT, status);
375 handleError(status, scxFullPropName);
376 LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(scInvMap, status));
377 handleError(status, scxFullPropName);
378
379 // The values for the output scx companion array.
380 // Invariant is that all subvectors are distinct.
381 std::vector< std::vector<uint16_t> > outputDedupVec;
382
383 // The sc/scx companion array is an array of arrays (of script codes)
384 fputs("script_code_array = [\n", f);
385 for(const UChar32 cp : scxCodePoints) {
386 // Get the Script value
387 uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp);
388 // Get the Script_Extensions value (array of Script codes)
389 const int32_t SCX_ARRAY_CAPACITY = 32;
390 UScriptCode scxValArray[SCX_ARRAY_CAPACITY];
391 int32_t numScripts = uscript_getScriptExtensions(cp, scxValArray, SCX_ARRAY_CAPACITY, status);
392 handleError(status, scxFullPropName);
393
394 // Convert the scx array into a vector
395 std::vector<uint16_t> scxValVec;
396 for(int i = 0; i < numScripts; i++) {
397 scxValVec.push_back(scxValArray[i]);
398 }
399 // Ensure that it is sorted
400 std::sort(scxValVec.begin(), scxValVec.end());
401 // Copy the Script value into the first position of the scx array only
402 // if we have the "other" case (Script value is not Common nor Inherited).
403 // This offers faster access when users want only the Script value.
404 if (scVal != USCRIPT_COMMON && scVal != USCRIPT_INHERITED) {
405 scxValVec.insert(scxValVec.begin(), scVal);
406 }
407
408 // See if there is already an scx value array matching the newly built one.
409 // If there is, then use its index.
410 // If not, then append the new value array.
411 bool isScxValUnique = true;
412 size_t outputIndex = 0;
413 for (outputIndex = 0; outputIndex < outputDedupVec.size(); outputIndex++) {
414 if (outputDedupVec[outputIndex] == scxValVec) {
415 isScxValUnique = false;
416 break;
417 }
418 }
419
420 if (isScxValUnique) {
421 outputDedupVec.push_back(scxValVec);
422 usrc_writeArray(f, " [", scxValVec.data(), 16, scxValVec.size(), " ", "],\n");
423 }
424
425 // We must update the value in the UCPTrie for the code point to contain:
426 // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is
427 // the index into the companion array
428 // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether
429 // 3: other
430 // 2: Script=Inherited
431 // 1: Script=Common
432 // 0: Script=value in 9..0 (N/A because we are in this loop to create the companion array for non-0 cases)
433 uint16_t mask = 0;
434 if (scVal == USCRIPT_COMMON) {
435 mask = DATAEXPORT_SCRIPT_X_WITH_COMMON;
436 } else if (scVal == USCRIPT_INHERITED) {
437 mask = DATAEXPORT_SCRIPT_X_WITH_INHERITED;
438 } else {
439 mask = DATAEXPORT_SCRIPT_X_WITH_OTHER;
440 }
441
442 // The new trie value is the index into the new array with the high order bits set
443 uint32_t newScVal = outputIndex | mask;
444
445 // Update the code point in the mutable trie builder with the trie value
446 umutablecptrie_set(builder.getAlias(), cp, newScVal, status);
447 handleError(status, scxFullPropName);
448 }
449 fputs("]\n\n", f); // Print the TOML close delimiter for the outer array.
450
451 // Convert from mutable trie builder to immutable trie.
452 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
453 builder.getAlias(),
454 trieType,
455 scWidth,
456 status));
457 handleError(status, scxFullPropName);
458
459 fputs("[script_extensions.code_point_trie]\n", f);
460 usrc_writeUCPTrie(f, scxShortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
461 }
462
prepareOutputFile(const char * basename)463 FILE* prepareOutputFile(const char* basename) {
464 IcuToolErrorCode status("icuexportdata");
465 CharString outFileName;
466 if (destdir != nullptr && *destdir != 0) {
467 outFileName.append(destdir, status).ensureEndsWithFileSeparator(status);
468 }
469 outFileName.append(basename, status);
470 outFileName.append(".toml", status);
471 handleError(status, basename);
472
473 FILE* f = fopen(outFileName.data(), "w");
474 if (f == nullptr) {
475 std::cerr << "Unable to open file: " << outFileName.data() << std::endl;
476 exit(U_FILE_ACCESS_ERROR);
477 }
478 if (!QUIET) {
479 std::cout << "Writing to: " << outFileName.data() << std::endl;
480 }
481
482 if (haveCopyright) {
483 usrc_writeCopyrightHeader(f, "#", 2021);
484 }
485 usrc_writeFileNameGeneratedBy(f, "#", basename, "icuexportdata.cpp");
486
487 return f;
488 }
489
490 #if !UCONFIG_NO_NORMALIZATION
491
492 struct PendingDescriptor {
493 UChar32 scalar;
494 uint32_t descriptor;
495 UBool supplementary;
496 };
497
writeCanonicalCompositions(USet * backwardCombiningStarters)498 void writeCanonicalCompositions(USet* backwardCombiningStarters) {
499 IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions");
500 const char* basename = "compositions";
501 FILE* f = prepareOutputFile(basename);
502
503 LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status);
504
505 const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
506 UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
507
508 const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
509 for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
510 if (c >= 0xD800 && c < 0xE000) {
511 // Surrogate
512 continue;
513 }
514 UnicodeString decomposition;
515 if (!nfc->getRawDecomposition(c, decomposition)) {
516 continue;
517 }
518 int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
519 if (len != 2) {
520 continue;
521 }
522 UChar32 starter = utf32[0];
523 UChar32 second = utf32[1];
524 UChar32 composite = nfc->composePair(starter, second);
525 if (composite < 0) {
526 continue;
527 }
528 if (c != composite) {
529 status.set(U_INTERNAL_PROGRAM_ERROR);
530 handleError(status, basename);
531 }
532 if (!u_getCombiningClass(second)) {
533 uset_add(backwardCombiningStarters, second);
534 }
535 if (composite >= 0xAC00 && composite <= 0xD7A3) {
536 // Hangul syllable
537 continue;
538 }
539
540 UnicodeString backward;
541 backward.append(second);
542 backward.append(starter);
543 backwardBuilder->add(backward, int32_t(composite), status);
544 }
545 UnicodeString canonicalCompositionTrie;
546 backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status);
547
548 usrc_writeArray(f, "compositions = [\n ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), " ", "\n]\n");
549 fclose(f);
550 handleError(status, basename);
551 }
552
writeDecompositionTables(const char * basename,const uint16_t * ptr16,size_t len16,const uint32_t * ptr32,size_t len32)553 void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) {
554 FILE* f = prepareOutputFile(basename);
555 usrc_writeArray(f, "scalars16 = [\n ", ptr16, 16, len16, " ", "\n]\n");
556 usrc_writeArray(f, "scalars32 = [\n ", ptr32, 32, len32, " ", "\n]\n");
557 fclose(f);
558 }
559
writeDecompositionData(const char * basename,uint32_t baseSize16,uint32_t baseSize32,uint32_t supplementSize16,USet * uset,USet * reference,const std::vector<PendingDescriptor> & pendingTrieInsertions,char16_t passthroughCap)560 void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, char16_t passthroughCap) {
561 IcuToolErrorCode status("icuexportdata: writeDecompositionData");
562 FILE* f = prepareOutputFile(basename);
563
564 // Zero is a magic number that means the character decomposes to itself.
565 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
566
567 // Iterate backwards to insert lower code points in the trie first in case it matters
568 // for trie block allocation.
569 for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
570 const PendingDescriptor& pending = pendingTrieInsertions[i];
571 uint32_t additional = 0;
572 if (!(pending.descriptor & 0xFFFE0000)) {
573 uint32_t offset = pending.descriptor & 0xFFF;
574 if (!pending.supplementary) {
575 if (offset >= baseSize16) {
576 // This is a offset to supplementary 16-bit data. We have
577 // 16-bit base data and 32-bit base data before. However,
578 // the 16-bit base data length is already part of offset.
579 additional = baseSize32;
580 }
581 } else {
582 if (offset >= baseSize32) {
583 // This is an offset to supplementary 32-bit data. We have 16-bit
584 // base data, 32-bit base data, and 16-bit supplementary data before.
585 // However, the 32-bit base data length is already part
586 // of offset.
587 additional = baseSize16 + supplementSize16;
588 } else {
589 // This is an offset to 32-bit base data. We have 16-bit
590 // base data before.
591 additional = baseSize16;
592 }
593 }
594 if (offset + additional > 0xFFF) {
595 status.set(U_INTERNAL_PROGRAM_ERROR);
596 handleError(status, basename);
597 }
598 }
599 // It turns out it's better to swap the halves compared to the initial
600 // idea in order to put special marker values close to zero so that
601 // an important marker value becomes 1, so it's efficient to compare
602 // "1 or 0". Unfortunately, going through all the code to swap
603 // things is too error prone, so let's do the swapping here in one
604 // place.
605 uint32_t oldTrieValue = pending.descriptor + additional;
606 uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16);
607 umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status);
608 }
609 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
610 builder.getAlias(),
611 trieType,
612 UCPTRIE_VALUE_BITS_32,
613 status));
614 handleError(status, basename);
615
616 if (reference) {
617 if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
618 // NFD expectations don't hold. The set must not contain the half-width
619 // kana voicing marks and must contain iota subscript.
620 status.set(U_INTERNAL_PROGRAM_ERROR);
621 handleError(status, basename);
622 }
623
624 USet* halfWidthVoicing = uset_openEmpty();
625 uset_add(halfWidthVoicing, 0xFF9E);
626 uset_add(halfWidthVoicing, 0xFF9F);
627
628 USet* iotaSubscript = uset_openEmpty();
629 uset_add(iotaSubscript, 0x0345);
630
631 uint8_t flags = 0;
632
633 USet* halfWidthCheck = uset_cloneAsThawed(uset);
634 uset_removeAll(halfWidthCheck, reference);
635 if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
636 flags |= 1;
637 } else if (!uset_isEmpty(halfWidthCheck)) {
638 // The result was neither empty nor contained exactly
639 // the two half-width voicing marks. The ICU4X
640 // normalizer doesn't know how to deal with this case.
641 status.set(U_INTERNAL_PROGRAM_ERROR);
642 handleError(status, basename);
643 }
644 uset_close(halfWidthCheck);
645
646 USet* iotaCheck = uset_cloneAsThawed(reference);
647 uset_removeAll(iotaCheck, uset);
648 if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) {
649 // The result was neither empty nor contained exactly
650 // the iota subscript. The ICU4X normalizer doesn't
651 // know how to deal with this case.
652 status.set(U_INTERNAL_PROGRAM_ERROR);
653 handleError(status, basename);
654 }
655 uset_close(halfWidthCheck);
656
657 uset_close(iotaSubscript);
658 uset_close(halfWidthVoicing);
659
660 fprintf(f, "flags = 0x%X\n", flags);
661 fprintf(f, "cap = 0x%X\n", passthroughCap);
662 }
663 fprintf(f, "[trie]\n");
664 usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
665 fclose(f);
666 handleError(status, basename);
667 }
668
// Special marker for the NFKD form of U+FDFA
constexpr int32_t FDFA_MARKER = 3;

// Special marker for characters whose decomposition starts with a non-starter
// and the decomposition isn't the character itself.
constexpr int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2;

// Special marker for starters that decompose to themselves but that may
// combine backwards under canonical composition
constexpr int32_t BACKWARD_COMBINING_STARTER_MARKER = 1;

/// Marker that a complex decomposition isn't round-trippable
/// under re-composition.
constexpr uint32_t NON_ROUND_TRIP_MARKER = 1;
683
permissibleBmpPair(UBool knownToRoundTrip,UChar32 c,UChar32 second)684 UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) {
685 if (knownToRoundTrip) {
686 return true;
687 }
688 // Nuktas, Hebrew presentation forms and polytonic Greek with oxia
689 // are special-cased in ICU4X.
690 if (c >= 0xFB1D && c <= 0xFB4E) {
691 // Hebrew presentation forms
692 return true;
693 }
694 if (c >= 0x1F71 && c <= 0x1FFB) {
695 // Polytonic Greek with oxia
696 return true;
697 }
698 if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) {
699 // Nukta
700 return true;
701 }
702 // To avoid more branchiness, 4 characters that decompose to
703 // a BMP starter followed by a BMP non-starter are excluded
704 // from being encoded directly into the trie value and are
705 // handled as complex decompositions instead. These are:
706 // U+0F76 TIBETAN VOWEL SIGN VOCALIC R
707 // U+0F78 TIBETAN VOWEL SIGN VOCALIC L
708 // U+212B ANGSTROM SIGN
709 // U+2ADC FORKING
710 return false;
711 }
712
713 // Computes data for canonical decompositions
computeDecompositions(const char * basename,const USet * backwardCombiningStarters,std::vector<uint16_t> & storage16,std::vector<uint32_t> & storage32,USet * decompositionStartsWithNonStarter,USet * decompositionStartsWithBackwardCombiningStarter,std::vector<PendingDescriptor> & pendingTrieInsertions,UChar32 & decompositionPassthroughBound,UChar32 & compositionPassthroughBound)714 void computeDecompositions(const char* basename,
715 const USet* backwardCombiningStarters,
716 std::vector<uint16_t>& storage16,
717 std::vector<uint32_t>& storage32,
718 USet* decompositionStartsWithNonStarter,
719 USet* decompositionStartsWithBackwardCombiningStarter,
720 std::vector<PendingDescriptor>& pendingTrieInsertions,
721 UChar32& decompositionPassthroughBound,
722 UChar32& compositionPassthroughBound) {
723 IcuToolErrorCode status("icuexportdata: computeDecompositions");
724 const Normalizer2* mainNormalizer;
725 const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status);
726 const Normalizer2* nfcNormalizer = Normalizer2::getNFCInstance(status);
727 FILE* f = nullptr;
728 std::vector<uint32_t> nonRecursive32;
729 LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status));
730
731 if (uprv_strcmp(basename, "nfkd") == 0) {
732 mainNormalizer = Normalizer2::getNFKDInstance(status);
733 } else if (uprv_strcmp(basename, "uts46d") == 0) {
734 mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status);
735 } else {
736 mainNormalizer = nfdNormalizer;
737 f = prepareOutputFile("decompositionex");
738 }
739
740 // Max length as of Unicode 14 is 4 for NFD. For NFKD the max
741 // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB).
742 const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9;
743 const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8;
744 const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
745 UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
746 const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2;
747 UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE];
748
749 // Iterate over all scalar values excluding Hangul syllables.
750 //
751 // We go backwards in order to better find overlapping decompositions.
752 //
753 // As of Unicode 14:
754 // Iterate forward without overlap search:
755 // nfd: 16 size: 896, 32 size: 173
756 // nfkd: 16 size: 3854, 32 size: 179
757 //
758 // Iterate forward with overlap search:
759 // nfd: 16 size: 888, 32 size: 173
760 // nfkd: 16 size: 3266, 32 size: 179
761 //
762 // Iterate backward with overlap search:
763 // nfd: 16 size: 776, 32 size: 173
764 // nfkd: 16 size: 2941, 32 size: 179
765 //
766 // UChar32 is signed!
767 for (UChar32 c = 0x10FFFF; c >= 0; --c) {
768 if (c >= 0xAC00 && c <= 0xD7A3) {
769 // Hangul syllable
770 continue;
771 }
772 if (c >= 0xD800 && c < 0xE000) {
773 // Surrogate
774 continue;
775 }
776 UnicodeString src;
777 UnicodeString dst;
778 // True if we're building non-NFD or we're building NFD but
779 // the `c` round trips to NFC.
780 // False if we're building NFD and `c` does not round trip to NFC.
781 UBool nonNfdOrRoundTrips = true;
782 src.append(c);
783 if (mainNormalizer != nfdNormalizer) {
784 UnicodeString inter;
785 mainNormalizer->normalize(src, inter, status);
786 nfdNormalizer->normalize(inter, dst, status);
787 } else {
788 nfdNormalizer->normalize(src, dst, status);
789 UnicodeString nfc;
790 nfcNormalizer->normalize(dst, nfc, status);
791 nonNfdOrRoundTrips = (src == nfc);
792 }
793 int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
794 if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
795 // Characters that normalize to nothing or to U+FFFD (without the
796 // input being U+FFFD) in ICU4C's UTS 46 normalization normalize
797 // as in NFD in ICU4X's UTF 46 normalization in the interest
798 // of data size and ICU4X's normalizer being unable to handle
799 // normalizing to nothing.
800 // When UTS 46 is implemented on top of ICU4X, a preprocessing
801 // step is supposed to remove these characters before the
802 // normalization step.
803 if (uprv_strcmp(basename, "uts46d") != 0) {
804 status.set(U_INTERNAL_PROGRAM_ERROR);
805 handleError(status, basename);
806 }
807 nfdNormalizer->normalize(src, dst, status);
808 len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
809 if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
810 status.set(U_INTERNAL_PROGRAM_ERROR);
811 handleError(status, basename);
812 }
813 }
814 if (len > DECOMPOSITION_BUFFER_SIZE) {
815 status.set(U_INTERNAL_PROGRAM_ERROR);
816 handleError(status, basename);
817 }
818 uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]);
819 bool specialNonStarterDecomposition = false;
820 bool startsWithBackwardCombiningStarter = false;
821 if (firstCombiningClass) {
822 decompositionPassthroughBound = c;
823 compositionPassthroughBound = c;
824 uset_add(decompositionStartsWithNonStarter, c);
825 if (src != dst) {
826 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) {
827 specialNonStarterDecomposition = true;
828 } else {
829 // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
830 status.set(U_INTERNAL_PROGRAM_ERROR);
831 handleError(status, basename);
832 }
833 }
834 } else if (uset_contains(backwardCombiningStarters, utf32[0])) {
835 compositionPassthroughBound = c;
836 startsWithBackwardCombiningStarter = true;
837 uset_add(decompositionStartsWithBackwardCombiningStarter, c);
838 }
839 if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) {
840 status.set(U_INTERNAL_PROGRAM_ERROR);
841 handleError(status, basename);
842 }
843 if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) {
844 status.set(U_INTERNAL_PROGRAM_ERROR);
845 handleError(status, basename);
846 }
847 if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) {
848 status.set(U_INTERNAL_PROGRAM_ERROR);
849 handleError(status, basename);
850 }
851 if (mainNormalizer != nfdNormalizer) {
852 UnicodeString nfd;
853 nfdNormalizer->normalize(src, nfd, status);
854 if (dst == nfd) {
855 continue;
856 }
857 decompositionPassthroughBound = c;
858 compositionPassthroughBound = c;
859 } else if (firstCombiningClass) {
860 len = 1;
861 if (specialNonStarterDecomposition) {
862 utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value
863 } else {
864 // Use the surrogate range to store the canonical combining class
865 utf32[0] = 0xD800 | UChar32(firstCombiningClass);
866 }
867 } else {
868 if (src == dst) {
869 if (startsWithBackwardCombiningStarter) {
870 pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, false});
871 }
872 continue;
873 }
874 decompositionPassthroughBound = c;
875 // ICU4X hard-codes ANGSTROM SIGN
876 if (c != 0x212B) {
877 UnicodeString raw;
878 if (!nfdNormalizer->getRawDecomposition(c, raw)) {
879 // We're always supposed to have a non-recursive decomposition
880 // if we had a recursive one.
881 status.set(U_INTERNAL_PROGRAM_ERROR);
882 handleError(status, basename);
883 }
884 // In addition to actual difference, put the whole range that contains characters
885 // with oxia into the non-recursive trie in order to catch cases where characters
886 // with oxia have singleton decompositions to corresponding characters with tonos.
887 // This way, the run-time decision to fall through can be done on the range
888 // without checking for individual characters inside the range.
889 if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) {
890 int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status);
891 if (!rawLen) {
892 status.set(U_INTERNAL_PROGRAM_ERROR);
893 handleError(status, basename);
894 }
895 if (rawLen == 1) {
896 if (c >= 0xFFFF) {
897 status.set(U_INTERNAL_PROGRAM_ERROR);
898 handleError(status, basename);
899 }
900 umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, uint32_t(rawUtf32[0]), status);
901 } else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) {
902 if (!rawUtf32[0] || !rawUtf32[1]) {
903 status.set(U_INTERNAL_PROGRAM_ERROR);
904 handleError(status, basename);
905 }
906 // Swapped for consistency with the primary trie
907 uint32_t bmpPair = uint32_t(rawUtf32[1]) << 16 | uint32_t(rawUtf32[0]);
908 umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status);
909 } else {
910 // Let's add 1 to index to make it always non-zero to distinguish
911 // it from the default zero.
912 uint32_t index = nonRecursive32.size() + 1;
913 nonRecursive32.push_back(uint32_t(rawUtf32[0]));
914 nonRecursive32.push_back(uint32_t(rawUtf32[1]));
915 if (index > 0xFFFF) {
916 status.set(U_INTERNAL_PROGRAM_ERROR);
917 handleError(status, basename);
918 }
919 umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status);
920 }
921 }
922 }
923 }
924 if (!nonNfdOrRoundTrips) {
925 compositionPassthroughBound = c;
926 }
927 if (len == 1 && utf32[0] <= 0xFFFF) {
928 if (startsWithBackwardCombiningStarter) {
929 if (mainNormalizer == nfdNormalizer) {
930 // Not supposed to happen in NFD
931 status.set(U_INTERNAL_PROGRAM_ERROR);
932 handleError(status, basename);
933 } else if (!((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) {
934 // Other than conjoining jamo vowels and trails
935 // unsupported for non-NFD.
936 status.set(U_INTERNAL_PROGRAM_ERROR);
937 handleError(status, basename);
938 }
939 }
940 pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, false});
941 } else if (len == 2 &&
942 utf32[0] <= 0xFFFF &&
943 utf32[1] <= 0xFFFF &&
944 !u_getCombiningClass(utf32[0]) &&
945 u_getCombiningClass(utf32[1]) &&
946 permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) {
947 for (int32_t i = 0; i < len; ++i) {
948 if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
949 // Assert that iota subscript and half-width voicing marks never occur in these
950 // expansions in the normalization forms where they are special.
951 status.set(U_INTERNAL_PROGRAM_ERROR);
952 handleError(status, basename);
953 }
954 }
955 if (startsWithBackwardCombiningStarter) {
956 status.set(U_INTERNAL_PROGRAM_ERROR);
957 handleError(status, basename);
958 }
959 pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), false});
960 } else {
961 if (startsWithBackwardCombiningStarter) {
962 status.set(U_INTERNAL_PROGRAM_ERROR);
963 handleError(status, basename);
964 }
965
966 UBool supplementary = false;
967 UBool nonInitialStarter = false;
968 for (int32_t i = 0; i < len; ++i) {
969 if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
970 // Assert that iota subscript and half-width voicing marks never occur in these
971 // expansions in the normalization forms where they are special.
972 status.set(U_INTERNAL_PROGRAM_ERROR);
973 handleError(status, basename);
974 }
975
976 if (utf32[i] > 0xFFFF) {
977 supplementary = true;
978 }
979 if (utf32[i] == 0) {
980 status.set(U_INTERNAL_PROGRAM_ERROR);
981 handleError(status, basename);
982 }
983 if (i != 0 && !u_getCombiningClass(utf32[i])) {
984 nonInitialStarter = true;
985 }
986 }
987 if (!supplementary) {
988 if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
989 if (len == 18 && c == 0xFDFA) {
990 // Special marker for the one character whose decomposition
991 // is too long.
992 pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary});
993 continue;
994 } else {
995 status.set(U_INTERNAL_PROGRAM_ERROR);
996 handleError(status, basename);
997 }
998 }
999 } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
1000 status.set(U_INTERNAL_PROGRAM_ERROR);
1001 handleError(status, basename);
1002 }
1003 // Complex decomposition
1004 // Format for 16-bit value:
1005 // 15..13: length minus two for 16-bit case and length minus one for
1006 // the 32-bit case. Length 8 needs to fit in three bits in
1007 // the 16-bit case, and this way the value is future-proofed
1008 // up to 9 in the 16-bit case. Zero is unused and length one
1009 // in the 16-bit case goes directly into the trie.
1010 // 12: 1 if all trailing characters are guaranteed non-starters,
1011 // 0 if no guarantees about non-starterness.
1012 // Note: The bit choice is this way around to allow for
1013 // dynamically falling back to not having this but instead
1014 // having one more bit for length by merely choosing
1015 // different masks.
1016 // 11..0: Start offset in storage. The offset is to the logical
1017 // sequence of scalars16, scalars32, supplementary_scalars16,
1018 // supplementary_scalars32.
1019 uint32_t descriptor = uint32_t(!nonInitialStarter) << 12;
1020 if (!supplementary) {
1021 descriptor |= (uint32_t(len) - 2) << 13;
1022 } else {
1023 descriptor |= (uint32_t(len) - 1) << 13;
1024 }
1025 if (descriptor & 0xFFF) {
1026 status.set(U_INTERNAL_PROGRAM_ERROR);
1027 handleError(status, basename);
1028 }
1029 size_t index = 0;
1030 bool writeToStorage = false;
1031 // Sadly, C++ lacks break and continue by label, so using goto in the
1032 // inner loops to break or continue the outer loop.
1033 if (!supplementary) {
1034 outer16: for (;;) {
1035 if (index == storage16.size()) {
1036 writeToStorage = true;
1037 break;
1038 }
1039 if (storage16[index] == utf32[0]) {
1040 for (int32_t i = 1; i < len; ++i) {
1041 if (storage16[index + i] != uint32_t(utf32[i])) {
1042 ++index;
1043 // continue outer
1044 goto outer16;
1045 }
1046 }
1047 // break outer
1048 goto after;
1049 }
1050 ++index;
1051 }
1052 } else {
1053 outer32: for (;;) {
1054 if (index == storage32.size()) {
1055 writeToStorage = true;
1056 break;
1057 }
1058 if (storage32[index] == uint32_t(utf32[0])) {
1059 for (int32_t i = 1; i < len; ++i) {
1060 if (storage32[index + i] != uint32_t(utf32[i])) {
1061 ++index;
1062 // continue outer
1063 goto outer32;
1064 }
1065 }
1066 // break outer
1067 goto after;
1068 }
1069 ++index;
1070 }
1071 }
1072 after:
1073 if (index > 0xFFF) {
1074 status.set(U_INTERNAL_PROGRAM_ERROR);
1075 handleError(status, basename);
1076 }
1077 descriptor |= uint32_t(index);
1078 if (!descriptor || descriptor > 0xFFFF) {
1079 // > 0xFFFF should never happen if the code above is correct.
1080 // == 0 should not happen due to the nature of the data.
1081 status.set(U_INTERNAL_PROGRAM_ERROR);
1082 handleError(status, basename);
1083 }
1084 if (writeToStorage) {
1085 if (!supplementary) {
1086 for (int32_t i = 0; i < len; ++i) {
1087 storage16.push_back(uint16_t(utf32[i]));
1088 }
1089 } else {
1090 for (int32_t i = 0; i < len; ++i) {
1091 storage32.push_back(uint32_t(utf32[i]));
1092 }
1093 }
1094 }
1095
1096 uint32_t nonRoundTripMarker = 0;
1097 if (!nonNfdOrRoundTrips) {
1098 nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16);
1099 }
1100 pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker, supplementary});
1101 }
1102 }
1103 if (storage16.size() + storage32.size() > 0xFFF) {
1104 status.set(U_INTERNAL_PROGRAM_ERROR);
1105 }
1106 if (f) {
1107 usrc_writeArray(f, "scalars32 = [\n ", nonRecursive32.data(), 32, nonRecursive32.size(), " ", "\n]\n");
1108
1109 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1110 nonRecursiveBuilder.getAlias(),
1111 trieType,
1112 UCPTRIE_VALUE_BITS_32,
1113 status));
1114 handleError(status, basename);
1115
1116 fprintf(f, "[trie]\n");
1117 usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1118
1119 fclose(f);
1120 }
1121 handleError(status, basename);
1122 }
1123
1124 #endif // !UCONFIG_NO_NORMALIZATION
1125
// Indices into the options[] array defined below. The enumerators are listed
// in the same order as the entries of options[], so each OPT_* value is the
// array index of the matching UOPTION_* entry. OPT_COUNT, being last, equals
// the number of options.
enum {
    OPT_HELP_H,
    OPT_HELP_QUESTION_MARK,
    OPT_MODE,
    OPT_TRIE_TYPE,
    OPT_VERSION,
    OPT_DESTDIR,
    OPT_ALL,
    OPT_INDEX,
    OPT_COPYRIGHT,
    OPT_VERBOSE,
    OPT_QUIET,

    OPT_COUNT
};

// Tool-specific option definitions. "mode" has the short form 'm' and takes
// an argument. The other three pass '\1' in the short-option position; the
// usage text lists them with a long form only.
#define UOPTION_MODE UOPTION_DEF("mode", 'm', UOPT_REQUIRES_ARG)
#define UOPTION_TRIE_TYPE UOPTION_DEF("trie-type", '\1', UOPT_REQUIRES_ARG)
#define UOPTION_ALL UOPTION_DEF("all", '\1', UOPT_NO_ARG)
#define UOPTION_INDEX UOPTION_DEF("index", '\1', UOPT_NO_ARG)

// Command-line option table handed to u_parseArgs() in main(); indexed with
// the OPT_* enumerators above, so the order here must stay in sync with them.
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_MODE,
    UOPTION_TRIE_TYPE,
    UOPTION_VERSION,
    UOPTION_DESTDIR,
    UOPTION_ALL,
    UOPTION_INDEX,
    UOPTION_COPYRIGHT,
    UOPTION_VERBOSE,
    UOPTION_QUIET,
};
1160
// Writes the tool's usage text to `stdfile`.
//
// stdfile: stream that receives the text (main() passes stderr or stdout).
// program: name substituted into the "usage:" line.
void printHelp(FILE* stdfile, const char* program) {
    // Only the first line depends on the program name.
    fprintf(stdfile, "usage: %s -m mode [-options] [--all | properties...]\n", program);

    // The remainder is fixed text without any format specifiers.
    static const char* const kOptionLines[] = {
        "\tdump Unicode property data to .toml files\n",
        "options:\n",
        "\t-h or -? or --help this usage text\n",
        "\t-V or --version show a version message\n",
        "\t-m or --mode mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n",
        "\t --trie-type set the trie type (small or fast, default small)\n",
        "\t-d or --destdir destination directory, followed by the path\n",
        "\t --all write out all properties known to icuexportdata\n",
        "\t --index write an _index.toml summarizing all data exported\n",
        "\t-c or --copyright include a copyright notice\n",
        "\t-v or --verbose Turn on verbose output\n",
        "\t-q or --quiet do not display warnings and progress\n",
    };
    for (const char* line : kOptionLines) {
        fputs(line, stdfile);
    }
}
1178
exportUprops(int argc,char * argv[])1179 int exportUprops(int argc, char* argv[]) {
1180 // Load list of Unicode properties
1181 std::vector<const char*> propNames;
1182 for (int i=1; i<argc; i++) {
1183 propNames.push_back(argv[i]);
1184 }
1185 if (options[OPT_ALL].doesOccur) {
1186 int i = UCHAR_BINARY_START;
1187 while (true) {
1188 if (i == UCHAR_BINARY_LIMIT) {
1189 i = UCHAR_INT_START;
1190 }
1191 if (i == UCHAR_INT_LIMIT) {
1192 i = UCHAR_GENERAL_CATEGORY_MASK;
1193 }
1194 if (i == UCHAR_GENERAL_CATEGORY_MASK + 1) {
1195 i = UCHAR_BIDI_MIRRORING_GLYPH;
1196 }
1197 if (i == UCHAR_BIDI_MIRRORING_GLYPH + 1) {
1198 i = UCHAR_SCRIPT_EXTENSIONS;
1199 }
1200 if (i == UCHAR_SCRIPT_EXTENSIONS + 1) {
1201 break;
1202 }
1203 UProperty uprop = static_cast<UProperty>(i);
1204 const char* propName = u_getPropertyName(uprop, U_SHORT_PROPERTY_NAME);
1205 if (propName == nullptr) {
1206 propName = u_getPropertyName(uprop, U_LONG_PROPERTY_NAME);
1207 if (propName != nullptr && VERBOSE) {
1208 std::cerr << "Note: falling back to long name for: " << propName << std::endl;
1209 }
1210 }
1211 if (propName != nullptr) {
1212 propNames.push_back(propName);
1213 } else {
1214 std::cerr << "Warning: Could not find name for: " << uprop << std::endl;
1215 }
1216 i++;
1217 }
1218 }
1219
1220 if (propNames.empty()
1221 || options[OPT_HELP_H].doesOccur
1222 || options[OPT_HELP_QUESTION_MARK].doesOccur
1223 || !options[OPT_MODE].doesOccur) {
1224 FILE *stdfile=argc<0 ? stderr : stdout;
1225 fprintf(stdfile,
1226 "usage: %s -m uprops [-options] [--all | properties...]\n"
1227 "\tdump Unicode property data to .toml files\n"
1228 "options:\n"
1229 "\t-h or -? or --help this usage text\n"
1230 "\t-V or --version show a version message\n"
1231 "\t-m or --mode mode: currently only 'uprops', but more may be added\n"
1232 "\t --trie-type set the trie type (small or fast, default small)\n"
1233 "\t-d or --destdir destination directory, followed by the path\n"
1234 "\t --all write out all properties known to icuexportdata\n"
1235 "\t --index write an _index.toml summarizing all data exported\n"
1236 "\t-c or --copyright include a copyright notice\n"
1237 "\t-v or --verbose Turn on verbose output\n"
1238 "\t-q or --quiet do not display warnings and progress\n",
1239 argv[0]);
1240 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1241 }
1242
1243 const char* mode = options[OPT_MODE].value;
1244 if (uprv_strcmp(mode, "uprops") != 0) {
1245 fprintf(stderr, "Invalid option for --mode (must be uprops)\n");
1246 return U_ILLEGAL_ARGUMENT_ERROR;
1247 }
1248
1249 if (options[OPT_TRIE_TYPE].doesOccur) {
1250 if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1251 trieType = UCPTRIE_TYPE_FAST;
1252 } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1253 trieType = UCPTRIE_TYPE_SMALL;
1254 } else {
1255 fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1256 return U_ILLEGAL_ARGUMENT_ERROR;
1257 }
1258 }
1259
1260 for (const char* propName : propNames) {
1261 UProperty propEnum = u_getPropertyEnum(propName);
1262 if (propEnum == UCHAR_INVALID_CODE) {
1263 std::cerr << "Error: Invalid property alias: " << propName << std::endl;
1264 return U_ILLEGAL_ARGUMENT_ERROR;
1265 }
1266
1267 FILE* f = prepareOutputFile(propName);
1268
1269 UVersionInfo versionInfo;
1270 u_getUnicodeVersion(versionInfo);
1271 char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1272 u_versionToString(versionInfo, uvbuf);
1273 fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1274 U_ICU_VERSION,
1275 uvbuf);
1276
1277 if (propEnum < UCHAR_BINARY_LIMIT) {
1278 dumpBinaryProperty(propEnum, f);
1279 } else if (UCHAR_INT_START <= propEnum && propEnum <= UCHAR_INT_LIMIT) {
1280 dumpEnumeratedProperty(propEnum, f);
1281 } else if (propEnum == UCHAR_GENERAL_CATEGORY_MASK) {
1282 dumpGeneralCategoryMask(f);
1283 } else if (propEnum == UCHAR_BIDI_MIRRORING_GLYPH) {
1284 dumpBidiMirroringGlyph(f);
1285 } else if (propEnum == UCHAR_SCRIPT_EXTENSIONS) {
1286 dumpScriptExtensions(f);
1287 } else {
1288 std::cerr << "Don't know how to write property: " << propEnum << std::endl;
1289 return U_INTERNAL_PROGRAM_ERROR;
1290 }
1291
1292 fclose(f);
1293 }
1294
1295 if (options[OPT_INDEX].doesOccur) {
1296 FILE* f = prepareOutputFile("_index");
1297 fprintf(f, "index = [\n");
1298 for (const char* propName : propNames) {
1299 // At this point, propName is a valid property name, so it should be alphanum ASCII
1300 fprintf(f, " { filename=\"%s.toml\" },\n", propName);
1301 }
1302 fprintf(f, "]\n");
1303 fclose(f);
1304 }
1305
1306 return 0;
1307 }
1308
// Context object passed through utrie2_enum() to addRangeToUCPTrie().
struct AddRangeHelper {
    // Trie builder that receives the enumerated ranges. exportCase() stores an
    // alias here; this struct does not close it.
    UMutableCPTrie* ucptrie;
};
1312
1313 static UBool U_CALLCONV
addRangeToUCPTrie(const void * context,UChar32 start,UChar32 end,uint32_t value)1314 addRangeToUCPTrie(const void* context, UChar32 start, UChar32 end, uint32_t value) {
1315 IcuToolErrorCode status("addRangeToUCPTrie");
1316 UMutableCPTrie* ucptrie = ((const AddRangeHelper*) context)->ucptrie;
1317 umutablecptrie_setRange(ucptrie, start, end, value, status);
1318 handleError(status, "setRange");
1319
1320 return true;
1321 }
1322
exportCase(int argc,char * argv[])1323 int exportCase(int argc, char* argv[]) {
1324 if (argc > 1) {
1325 fprintf(stderr, "ucase mode does not expect additional arguments\n");
1326 return U_ILLEGAL_ARGUMENT_ERROR;
1327 }
1328 (void) argv; // Suppress unused variable warning
1329
1330 IcuToolErrorCode status("icuexportdata");
1331 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
1332 handleError(status, "exportCase");
1333
1334 int32_t exceptionsLength, unfoldLength;
1335 const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength);
1336 const UTrie2* caseTrie = &caseProps->trie;
1337
1338 AddRangeHelper helper = { builder.getAlias() };
1339 utrie2_enum(caseTrie, nullptr, addRangeToUCPTrie, &helper);
1340
1341 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16;
1342 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1343 builder.getAlias(),
1344 trieType,
1345 width,
1346 status));
1347 handleError(status, "exportCase");
1348
1349 FILE* f = prepareOutputFile("ucase");
1350
1351 UVersionInfo versionInfo;
1352 u_getUnicodeVersion(versionInfo);
1353 char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1354 u_versionToString(versionInfo, uvbuf);
1355 fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1356 U_ICU_VERSION,
1357 uvbuf);
1358
1359 fputs("[ucase.code_point_trie]\n", f);
1360 usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1361 fputs("\n", f);
1362
1363 const char* indent = " ";
1364 const char* suffix = "\n]\n";
1365
1366 fputs("[ucase.exceptions]\n", f);
1367 const char* exceptionsPrefix = "exceptions = [\n ";
1368 int32_t exceptionsWidth = 16;
1369 usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth,
1370 exceptionsLength, indent, suffix);
1371 fputs("\n", f);
1372
1373 fputs("[ucase.unfold]\n", f);
1374 const char* unfoldPrefix = "unfold = [\n ";
1375 int32_t unfoldWidth = 16;
1376 usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth,
1377 unfoldLength, indent, suffix);
1378
1379 return 0;
1380 }
1381
1382 #if !UCONFIG_NO_NORMALIZATION
1383
exportNorm()1384 int exportNorm() {
1385 IcuToolErrorCode status("icuexportdata: exportNorm");
1386 USet* backwardCombiningStarters = uset_openEmpty();
1387 writeCanonicalCompositions(backwardCombiningStarters);
1388
1389 std::vector<uint16_t> storage16;
1390 std::vector<uint32_t> storage32;
1391
1392 // Note: the USets are not exported. They are only used to check that a new
1393 // Unicode version doesn't violate expectations that are hard-coded in ICU4X.
1394 USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
1395 USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1396 std::vector<PendingDescriptor> nfdPendingTrieInsertions;
1397 UChar32 nfdBound = 0x10FFFF;
1398 UChar32 nfcBound = 0x10FFFF;
1399 computeDecompositions("nfd",
1400 backwardCombiningStarters,
1401 storage16,
1402 storage32,
1403 nfdDecompositionStartsWithNonStarter,
1404 nfdDecompositionStartsWithBackwardCombiningStarter,
1405 nfdPendingTrieInsertions,
1406 nfdBound,
1407 nfcBound);
1408 if (!(nfdBound == 0xC0 && nfcBound == 0x300)) {
1409 // Unexpected bounds for NFD/NFC.
1410 status.set(U_INTERNAL_PROGRAM_ERROR);
1411 handleError(status, "exportNorm");
1412 }
1413
1414 uint32_t baseSize16 = storage16.size();
1415 uint32_t baseSize32 = storage32.size();
1416
1417 USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty();
1418 USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1419 std::vector<PendingDescriptor> nfkdPendingTrieInsertions;
1420 UChar32 nfkdBound = 0x10FFFF;
1421 UChar32 nfkcBound = 0x10FFFF;
1422 computeDecompositions("nfkd",
1423 backwardCombiningStarters,
1424 storage16,
1425 storage32,
1426 nfkdDecompositionStartsWithNonStarter,
1427 nfkdDecompositionStartsWithBackwardCombiningStarter,
1428 nfkdPendingTrieInsertions,
1429 nfkdBound,
1430 nfkcBound);
1431 if (!(nfkdBound <= 0xC0 && nfkcBound <= 0x300)) {
1432 status.set(U_INTERNAL_PROGRAM_ERROR);
1433 handleError(status, "exportNorm");
1434 }
1435 if (nfkcBound > 0xC0) {
1436 if (nfkdBound != 0xC0) {
1437 status.set(U_INTERNAL_PROGRAM_ERROR);
1438 handleError(status, "exportNorm");
1439 }
1440 } else {
1441 if (nfkdBound != nfkcBound) {
1442 status.set(U_INTERNAL_PROGRAM_ERROR);
1443 handleError(status, "exportNorm");
1444 }
1445 }
1446
1447 USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty();
1448 USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1449 std::vector<PendingDescriptor> uts46PendingTrieInsertions;
1450 UChar32 uts46dBound = 0x10FFFF;
1451 UChar32 uts46Bound = 0x10FFFF;
1452 computeDecompositions("uts46d",
1453 backwardCombiningStarters,
1454 storage16,
1455 storage32,
1456 uts46DecompositionStartsWithNonStarter,
1457 uts46DecompositionStartsWithBackwardCombiningStarter,
1458 uts46PendingTrieInsertions,
1459 uts46dBound,
1460 uts46Bound);
1461 if (!(uts46dBound <= 0xC0 && uts46Bound <= 0x300)) {
1462 status.set(U_INTERNAL_PROGRAM_ERROR);
1463 handleError(status, "exportNorm");
1464 }
1465 if (uts46Bound > 0xC0) {
1466 if (uts46dBound != 0xC0) {
1467 status.set(U_INTERNAL_PROGRAM_ERROR);
1468 handleError(status, "exportNorm");
1469 }
1470 } else {
1471 if (uts46dBound != uts46Bound) {
1472 status.set(U_INTERNAL_PROGRAM_ERROR);
1473 handleError(status, "exportNorm");
1474 }
1475 }
1476
1477 uint32_t supplementSize16 = storage16.size() - baseSize16;
1478 uint32_t supplementSize32 = storage32.size() - baseSize32;
1479
1480 writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, char16_t(nfcBound));
1481 writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, char16_t(nfkcBound));
1482 writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, char16_t(uts46Bound));
1483
1484 writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32);
1485 writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32);
1486
1487 uset_close(nfdDecompositionStartsWithNonStarter);
1488 uset_close(nfkdDecompositionStartsWithNonStarter);
1489 uset_close(uts46DecompositionStartsWithNonStarter);
1490
1491 uset_close(nfdDecompositionStartsWithBackwardCombiningStarter);
1492 uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter);
1493 uset_close(uts46DecompositionStartsWithBackwardCombiningStarter);
1494
1495 uset_close(backwardCombiningStarters);
1496 handleError(status, "exportNorm");
1497 return 0;
1498 }
1499
1500 #endif // !UCONFIG_NO_NORMALIZATION
1501
main(int argc,char * argv[])1502 int main(int argc, char* argv[]) {
1503 U_MAIN_INIT_ARGS(argc, argv);
1504
1505 /* preset then read command line options */
1506 options[OPT_DESTDIR].value=u_getDataDirectory();
1507 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
1508
1509 if(options[OPT_VERSION].doesOccur) {
1510 printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n",
1511 U_ICU_DATA_VERSION);
1512 printf("%s\n", U_COPYRIGHT_STRING);
1513 exit(0);
1514 }
1515
1516 /* error handling, printing usage message */
1517 if(argc<0) {
1518 fprintf(stderr,
1519 "error in command line argument \"%s\"\n",
1520 argv[-argc]);
1521 }
1522
1523 if (argc < 0
1524 || options[OPT_HELP_H].doesOccur
1525 || options[OPT_HELP_QUESTION_MARK].doesOccur
1526 || !options[OPT_MODE].doesOccur) {
1527 FILE *stdfile=argc<0 ? stderr : stdout;
1528 printHelp(stdfile, argv[0]);
1529 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1530 }
1531
1532 /* get the options values */
1533 haveCopyright = options[OPT_COPYRIGHT].doesOccur;
1534 destdir = options[OPT_DESTDIR].value;
1535 VERBOSE = options[OPT_VERBOSE].doesOccur;
1536 QUIET = options[OPT_QUIET].doesOccur;
1537
1538 if (options[OPT_TRIE_TYPE].doesOccur) {
1539 if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1540 trieType = UCPTRIE_TYPE_FAST;
1541 } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1542 trieType = UCPTRIE_TYPE_SMALL;
1543 } else {
1544 fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1545 return U_ILLEGAL_ARGUMENT_ERROR;
1546 }
1547 }
1548
1549 const char* mode = options[OPT_MODE].value;
1550 if (uprv_strcmp(mode, "norm") == 0) {
1551 #if !UCONFIG_NO_NORMALIZATION
1552 return exportNorm();
1553 #else
1554 fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n");
1555 return U_ILLEGAL_ARGUMENT_ERROR;
1556 #endif
1557 }
1558 if (uprv_strcmp(mode, "uprops") == 0) {
1559 return exportUprops(argc, argv);
1560 } else if (uprv_strcmp(mode, "ucase") == 0) {
1561 return exportCase(argc, argv);
1562 }
1563
1564 fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n");
1565 return U_ILLEGAL_ARGUMENT_ERROR;
1566 }
1567