• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include <cstddef>
5 #include <cstdint>
6 #include <cstdio>
7 #include <iostream>
8 #include <unicode/localpointer.h>
9 #include <unicode/umachine.h>
10 #include <unicode/unistr.h>
11 #include <unicode/urename.h>
12 #include <unicode/uset.h>
13 #include <vector>
14 #include <algorithm>
15 #include "toolutil.h"
16 #include "uoptions.h"
17 #include "cmemory.h"
18 #include "charstr.h"
19 #include "cstring.h"
20 #include "unicode/uchar.h"
21 #include "unicode/errorcode.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uscript.h"
24 #include "unicode/putil.h"
25 #include "unicode/umutablecptrie.h"
26 #include "unicode/ucharstriebuilder.h"
27 #include "ucase.h"
28 #include "unicode/normalizer2.h"
29 #include "normalizer2impl.h"
30 #include "writesrc.h"
31 
32 U_NAMESPACE_USE
33 
34 /*
35  * Global - verbosity
36  */
37 UBool VERBOSE = false;
38 UBool QUIET = false;
39 
40 UBool haveCopyright = true;
41 UCPTrieType trieType = UCPTRIE_TYPE_SMALL;
42 const char* destdir = "";
43 
44 // Mask constants for modified values in the Script CodePointTrie, values are logically 12-bits.
45 int16_t DATAEXPORT_SCRIPT_X_WITH_COMMON    = 0x0400;
46 int16_t DATAEXPORT_SCRIPT_X_WITH_INHERITED = 0x0800;
47 int16_t DATAEXPORT_SCRIPT_X_WITH_OTHER     = 0x0c00;
48 
49 // TODO(ICU-21821): Replace this with a call to a library function
50 int32_t scxCodePoints[] = {
51       7415, 7377, 7380, 7387, 7390, 7391, 7394, 7395, 7396, 7397,
52       7398, 7399, 7400, 7403, 7404, 7406, 7407, 7408, 7409, 113824,
53       113825, 113826, 113827, 834, 837, 7616, 7617, 12294, 12350, 12351,
54       12688, 12689, 12690, 12691, 12692, 12693, 12694, 12695, 12696, 12697,
55       12698, 12699, 12700, 12701, 12702, 12703, 12736, 12737, 12738, 12739,
56       12740, 12741, 12742, 12743, 12744, 12745, 12746, 12747, 12748, 12749,
57       12750, 12751, 12752, 12753, 12754, 12755, 12756, 12757, 12758, 12759,
58       12760, 12761, 12762, 12763, 12764, 12765, 12766, 12767, 12768, 12769,
59       12770, 12771, 12832, 12833, 12834, 12835, 12836, 12837, 12838, 12839,
60       12840, 12841, 12842, 12843, 12844, 12845, 12846, 12847, 12848, 12849,
61       12850, 12851, 12852, 12853, 12854, 12855, 12856, 12857, 12858, 12859,
62       12860, 12861, 12862, 12863, 12864, 12865, 12866, 12867, 12868, 12869,
63       12870, 12871, 12928, 12929, 12930, 12931, 12932, 12933, 12934, 12935,
64       12936, 12937, 12938, 12939, 12940, 12941, 12942, 12943, 12944, 12945,
65       12946, 12947, 12948, 12949, 12950, 12951, 12952, 12953, 12954, 12955,
66       12956, 12957, 12958, 12959, 12960, 12961, 12962, 12963, 12964, 12965,
67       12966, 12967, 12968, 12969, 12970, 12971, 12972, 12973, 12974, 12975,
68       12976, 12992, 12993, 12994, 12995, 12996, 12997, 12998, 12999, 13000,
69       13001, 13002, 13003, 13055, 13144, 13145, 13146, 13147, 13148, 13149,
70       13150, 13151, 13152, 13153, 13154, 13155, 13156, 13157, 13158, 13159,
71       13160, 13161, 13162, 13163, 13164, 13165, 13166, 13167, 13168, 13179,
72       13180, 13181, 13182, 13183, 13280, 13281, 13282, 13283, 13284, 13285,
73       13286, 13287, 13288, 13289, 13290, 13291, 13292, 13293, 13294, 13295,
74       13296, 13297, 13298, 13299, 13300, 13301, 13302, 13303, 13304, 13305,
75       13306, 13307, 13308, 13309, 13310, 119648, 119649, 119650, 119651, 119652,
76       119653, 119654, 119655, 119656, 119657, 119658, 119659, 119660, 119661, 119662,
77       119663, 119664, 119665, 127568, 127569, 867, 868, 869, 870, 871,
78       872, 873, 874, 875, 876, 877, 878, 879, 7418, 7674,
79       66272, 66273, 66274, 66275, 66276, 66277, 66278, 66279, 66280, 66281,
80       66282, 66283, 66284, 66285, 66286, 66287, 66288, 66289, 66290, 66291,
81       66292, 66293, 66294, 66295, 66296, 66297, 66298, 66299, 1748, 64830,
82       64831, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619,
83       1620, 1621, 1648, 65010, 65021, 7381, 7382, 7384, 7393, 7402,
84       7405, 7413, 7414, 43249, 12330, 12331, 12332, 12333, 43471, 65794,
85       65847, 65848, 65849, 65850, 65851, 65852, 65853, 65854, 65855, 1156,
86       1159, 11843, 42607, 1157, 1158, 1155, 7672, 7379, 7411, 7416,
87       7417, 7401, 7383, 7385, 7388, 7389, 7392, 43251, 4347, 3046,
88       3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3056,
89       3057, 3058, 3059, 70401, 70403, 70459, 70460, 73680, 73681, 73683,
90       2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799,
91       2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671,
92       42752, 42753, 42754, 42755, 42756, 42757, 42758, 42759, 12337, 12338,
93       12339, 12340, 12341, 12441, 12442, 12443, 12444, 12448, 12540, 65392,
94       65438, 65439, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309,
95       3310, 3311, 8239, 68338, 6146, 6147, 6149, 1564, 1632, 1633,
96       1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 2534, 2535,
97       2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 4160, 4161,
98       4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 65792, 65793,
99       65799, 65800, 65801, 65802, 65803, 65804, 65805, 65806, 65807, 65808,
100       65809, 65810, 65811, 65812, 65813, 65814, 65815, 65816, 65817, 65818,
101       65819, 65820, 65821, 65822, 65823, 65824, 65825, 65826, 65827, 65828,
102       65829, 65830, 65831, 65832, 65833, 65834, 65835, 65836, 65837, 65838,
103       65839, 65840, 65841, 65842, 65843, 7412, 8432, 12348, 12349, 43310,
104       7376, 7378, 5941, 5942, 2406, 2407, 2408, 2409, 2410, 2411,
105       2412, 2413, 2414, 2415, 12291, 12307, 12316, 12317, 12318, 12319,
106       12336, 12343, 65093, 65094, 1548, 1563, 12289, 12290, 12296, 12297,
107       12298, 12299, 12300, 12301, 12302, 12303, 12304, 12305, 12308, 12309,
108       12310, 12311, 12312, 12313, 12314, 12315, 12539, 65377, 65378, 65379,
109       65380, 65381, 7386, 1567, 7410, 1600, 43062, 43063, 43064, 43065,
110       2386, 2385, 43059, 43060, 43061, 43056, 43057, 43058, 2404, 2405
111     };
112 
handleError(ErrorCode & status,const char * context)113 void handleError(ErrorCode& status, const char* context) {
114     if (status.isFailure()) {
115         std::cerr << "Error: " << context << ": " << status.errorName() << std::endl;
116         exit(status.reset());
117     }
118 }
119 
120 class PropertyValueNameGetter : public ValueNameGetter {
121 public:
PropertyValueNameGetter(UProperty prop)122     PropertyValueNameGetter(UProperty prop) : property(prop) {}
123     ~PropertyValueNameGetter() override;
getName(uint32_t value)124     const char *getName(uint32_t value) override {
125         return u_getPropertyValueName(property, value, U_SHORT_PROPERTY_NAME);
126     }
127 
128 private:
129     UProperty property;
130 };
131 
~PropertyValueNameGetter()132 PropertyValueNameGetter::~PropertyValueNameGetter() {}
133 
134 // Dump an aliases = [...] key for properties with aliases
dumpPropertyAliases(UProperty uproperty,FILE * f)135 void dumpPropertyAliases(UProperty uproperty, FILE* f) {
136     int i = U_LONG_PROPERTY_NAME + 1;
137 
138     while(true) {
139         // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially,
140         // and returning null after that
141         const char* alias = u_getPropertyName(uproperty, (UPropertyNameChoice) i);
142         if (!alias) {
143             break;
144         }
145         if (i == U_LONG_PROPERTY_NAME + 1) {
146             fprintf(f, "aliases = [\"%s\"", alias);
147         } else {
148             fprintf(f, ", \"%s\"", alias);
149         }
150         i++;
151     }
152     if (i != U_LONG_PROPERTY_NAME + 1) {
153         fprintf(f, "]\n");
154     }
155 }
156 
dumpBinaryProperty(UProperty uproperty,FILE * f)157 void dumpBinaryProperty(UProperty uproperty, FILE* f) {
158     IcuToolErrorCode status("icuexportdata: dumpBinaryProperty");
159     const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
160     const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
161     const USet* uset = u_getBinaryPropertySet(uproperty, status);
162     handleError(status, fullPropName);
163 
164     fputs("[[binary_property]]\n", f);
165     fprintf(f, "long_name = \"%s\"\n", fullPropName);
166     if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
167     fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
168     dumpPropertyAliases(uproperty, f);
169     usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML);
170 }
171 
172 // If the value exists, dump an indented entry of the format
173 // `"  {discr = <discriminant>, long = <longname>, short = <shortname>, aliases = [<aliases>]},"`
dumpValueEntry(UProperty uproperty,int v,bool is_mask,FILE * f)174 void dumpValueEntry(UProperty uproperty, int v, bool is_mask, FILE* f) {
175     const char* fullValueName = u_getPropertyValueName(uproperty, v, U_LONG_PROPERTY_NAME);
176     const char* shortValueName = u_getPropertyValueName(uproperty, v, U_SHORT_PROPERTY_NAME);
177     if (!fullValueName) {
178         return;
179     }
180     if (is_mask) {
181         fprintf(f, "  {discr = 0x%X", v);
182     } else {
183         fprintf(f, "  {discr = %i", v);
184     }
185     fprintf(f, ", long = \"%s\"", fullValueName);
186     if (shortValueName) {
187         fprintf(f, ", short = \"%s\"", shortValueName);
188     }
189     int i = U_LONG_PROPERTY_NAME + 1;
190     while(true) {
191         // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially,
192         // and returning null after that
193         const char* alias = u_getPropertyValueName(uproperty, v, (UPropertyNameChoice) i);
194         if (!alias) {
195             break;
196         }
197         if (i == U_LONG_PROPERTY_NAME + 1) {
198             fprintf(f, ", aliases = [\"%s\"", alias);
199         } else {
200             fprintf(f, ", \"%s\"", alias);
201         }
202         i++;
203     }
204     if (i != U_LONG_PROPERTY_NAME + 1) {
205         fprintf(f, "]");
206     }
207     fprintf(f, "},\n");
208 }
209 
dumpEnumeratedProperty(UProperty uproperty,FILE * f)210 void dumpEnumeratedProperty(UProperty uproperty, FILE* f) {
211     IcuToolErrorCode status("icuexportdata: dumpEnumeratedProperty");
212     const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
213     const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
214     const UCPMap* umap = u_getIntPropertyMap(uproperty, status);
215     handleError(status, fullPropName);
216 
217     fputs("[[enum_property]]\n", f);
218     fprintf(f, "long_name = \"%s\"\n", fullPropName);
219     if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
220     fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
221     dumpPropertyAliases(uproperty, f);
222 
223     int32_t minValue = u_getIntPropertyMinValue(uproperty);
224     U_ASSERT(minValue >= 0);
225     int32_t maxValue = u_getIntPropertyMaxValue(uproperty);
226     U_ASSERT(maxValue >= 0);
227 
228     fprintf(f, "values = [\n");
229     for (int v = minValue; v <= maxValue; v++) {
230         dumpValueEntry(uproperty, v, false, f);
231     }
232     fprintf(f, "]\n");
233 
234     PropertyValueNameGetter valueNameGetter(uproperty);
235     usrc_writeUCPMap(f, umap, &valueNameGetter, UPRV_TARGET_SYNTAX_TOML);
236     fputs("\n", f);
237 
238 
239     UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
240     if (maxValue <= 0xff) {
241         width = UCPTRIE_VALUE_BITS_8;
242     } else if (maxValue <= 0xffff) {
243         width = UCPTRIE_VALUE_BITS_16;
244     }
245     LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(umap, status));
246     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
247         builder.getAlias(),
248         trieType,
249         width,
250         status));
251     handleError(status, fullPropName);
252 
253     fputs("[enum_property.code_point_trie]\n", f);
254     usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
255 }
256 
257 /*
258 * Export Bidi_Mirroring_Glyph values (code points) in a similar way to how enumerated
259 * properties are dumped to file.
260 * Note: the data will store 0 for code points without a value defined for
261 * Bidi_Mirroring_Glyph.
262 */
dumpBidiMirroringGlyph(FILE * f)263 void dumpBidiMirroringGlyph(FILE* f) {
264     UProperty uproperty = UCHAR_BIDI_MIRRORING_GLYPH;
265     IcuToolErrorCode status("icuexportdata: dumpBidiMirroringGlyph");
266     const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
267     const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
268     handleError(status, fullPropName);
269 
270     // Store 21-bit code point as is
271     UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
272 
273     // note: unlike dumpEnumeratedProperty, which can get inversion map data using
274     // u_getIntPropertyMap(uproperty), the only reliable way to get Bidi_Mirroring_Glyph
275     // is to use u_charMirror(cp) over the code point space.
276     LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
277     for(UChar32 c = UCHAR_MIN_VALUE; c <= UCHAR_MAX_VALUE; c++) {
278         UChar32 mirroringGlyph = u_charMirror(c);
279         // The trie builder code throws an error when it cannot compress the data sufficiently.
280         // Therefore, when the value is undefined for a code point, keep a 0 in the trie
281         // instead of the ICU API behavior of returning the code point value. Using 0
282         // results in a relatively significant space savings by not including redundant data.
283         if (c != mirroringGlyph) {
284             umutablecptrie_set(builder.getAlias(), c, mirroringGlyph, status);
285         }
286     }
287 
288     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
289         builder.getAlias(),
290         trieType,
291         width,
292         status));
293     handleError(status, fullPropName);
294 
295     // currently a trie and inversion map are the same (as relied upon in characterproperties.cpp)
296     const UCPMap* umap = reinterpret_cast<UCPMap *>(utrie.getAlias());
297 
298     fputs("[[enum_property]]\n", f);
299     fprintf(f, "long_name = \"%s\"\n", fullPropName);
300     if (shortPropName) {
301         fprintf(f, "short_name = \"%s\"\n", shortPropName);
302     }
303     fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
304     dumpPropertyAliases(uproperty, f);
305 
306     usrc_writeUCPMap(f, umap, nullptr, UPRV_TARGET_SYNTAX_TOML);
307     fputs("\n", f);
308 
309     fputs("[enum_property.code_point_trie]\n", f);
310     usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
311 }
312 
313 // After printing property value `v`, print `mask` if and only if `mask` comes immediately
314 // after the property in the listing
maybeDumpMaskValue(UProperty uproperty,uint32_t v,uint32_t mask,FILE * f)315 void maybeDumpMaskValue(UProperty uproperty, uint32_t v, uint32_t mask, FILE* f) {
316     if (U_MASK(v) < mask && U_MASK(v + 1) > mask)
317         dumpValueEntry(uproperty, mask, true, f);
318 }
319 
dumpGeneralCategoryMask(FILE * f)320 void dumpGeneralCategoryMask(FILE* f) {
321     IcuToolErrorCode status("icuexportdata: dumpGeneralCategoryMask");
322     UProperty uproperty = UCHAR_GENERAL_CATEGORY_MASK;
323 
324     fputs("[[mask_property]]\n", f);
325     const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
326     const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
327     fprintf(f, "long_name = \"%s\"\n", fullPropName);
328     if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
329     fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
330     dumpPropertyAliases(uproperty, f);
331 
332 
333     fprintf(f, "mask_for = \"General_Category\"\n");
334     uint32_t minValue = u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY);
335     U_ASSERT(minValue >= 0);
336     uint32_t maxValue = u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY);
337     U_ASSERT(maxValue >= 0);
338 
339     fprintf(f, "values = [\n");
340     for (uint32_t v = minValue; v <= maxValue; v++) {
341         dumpValueEntry(uproperty, U_MASK(v), true, f);
342 
343         // We want to dump these masks "in order", which means they
344         // should come immediately after every property they contain
345         maybeDumpMaskValue(uproperty, v, U_GC_L_MASK, f);
346         maybeDumpMaskValue(uproperty, v, U_GC_LC_MASK, f);
347         maybeDumpMaskValue(uproperty, v, U_GC_M_MASK, f);
348         maybeDumpMaskValue(uproperty, v, U_GC_N_MASK, f);
349         maybeDumpMaskValue(uproperty, v, U_GC_Z_MASK, f);
350         maybeDumpMaskValue(uproperty, v, U_GC_C_MASK, f);
351         maybeDumpMaskValue(uproperty, v, U_GC_P_MASK, f);
352         maybeDumpMaskValue(uproperty, v, U_GC_S_MASK, f);
353     }
354     fprintf(f, "]\n");
355 }
356 
dumpScriptExtensions(FILE * f)357 void dumpScriptExtensions(FILE* f) {
358     IcuToolErrorCode status("icuexportdata: dumpScriptExtensions");
359 
360     fputs("[[script_extensions]]\n", f);
361     const char* scxFullPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_LONG_PROPERTY_NAME);
362     const char* scxShortPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_SHORT_PROPERTY_NAME);
363     fprintf(f, "long_name = \"%s\"\n", scxFullPropName);
364     if (scxShortPropName) fprintf(f, "short_name = \"%s\"\n", scxShortPropName);
365     fprintf(f, "uproperty_discr = 0x%X\n", UCHAR_SCRIPT_EXTENSIONS);
366     dumpPropertyAliases(UCHAR_SCRIPT_EXTENSIONS, f);
367 
368     // We want to use 16 bits for our exported trie of sc/scx data because we
369     // need 12 bits to match the 12 bits of data stored for sc/scx in the trie
370     // in the uprops.icu data file.
371     UCPTrieValueWidth scWidth = UCPTRIE_VALUE_BITS_16;
372 
373     // Create a mutable UCPTrie builder populated with Script property values data.
374     const UCPMap* scInvMap = u_getIntPropertyMap(UCHAR_SCRIPT, status);
375     handleError(status, scxFullPropName);
376     LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(scInvMap, status));
377     handleError(status, scxFullPropName);
378 
379     // The values for the output scx companion array.
380     // Invariant is that all subvectors are distinct.
381     std::vector< std::vector<uint16_t> > outputDedupVec;
382 
383     // The sc/scx companion array is an array of arrays (of script codes)
384     fputs("script_code_array = [\n", f);
385     for(const UChar32 cp : scxCodePoints) {
386         // Get the Script value
387         uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp);
388         // Get the Script_Extensions value (array of Script codes)
389         const int32_t SCX_ARRAY_CAPACITY = 32;
390         UScriptCode scxValArray[SCX_ARRAY_CAPACITY];
391         int32_t numScripts = uscript_getScriptExtensions(cp, scxValArray, SCX_ARRAY_CAPACITY, status);
392         handleError(status, scxFullPropName);
393 
394         // Convert the scx array into a vector
395         std::vector<uint16_t> scxValVec;
396         for(int i = 0; i < numScripts; i++) {
397             scxValVec.push_back(scxValArray[i]);
398         }
399         // Ensure that it is sorted
400         std::sort(scxValVec.begin(), scxValVec.end());
401         // Copy the Script value into the first position of the scx array only
402         // if we have the "other" case (Script value is not Common nor Inherited).
403         // This offers faster access when users want only the Script value.
404         if (scVal != USCRIPT_COMMON && scVal != USCRIPT_INHERITED) {
405             scxValVec.insert(scxValVec.begin(), scVal);
406         }
407 
408         // See if there is already an scx value array matching the newly built one.
409         // If there is, then use its index.
410         // If not, then append the new value array.
411         bool isScxValUnique = true;
412         size_t outputIndex = 0;
413         for (outputIndex = 0; outputIndex < outputDedupVec.size(); outputIndex++) {
414             if (outputDedupVec[outputIndex] == scxValVec) {
415                 isScxValUnique = false;
416                 break;
417             }
418         }
419 
420         if (isScxValUnique) {
421             outputDedupVec.push_back(scxValVec);
422             usrc_writeArray(f, "  [", scxValVec.data(), 16, scxValVec.size(), "    ", "],\n");
423         }
424 
425         // We must update the value in the UCPTrie for the code point to contain:
426         // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is
427         //   the index into the companion array
428         // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether
429         //   3: other
430         //   2: Script=Inherited
431         //   1: Script=Common
432         //   0: Script=value in 9..0 (N/A because we are in this loop to create the companion array for non-0 cases)
433         uint16_t mask = 0;
434         if (scVal == USCRIPT_COMMON) {
435             mask = DATAEXPORT_SCRIPT_X_WITH_COMMON;
436         } else if (scVal == USCRIPT_INHERITED) {
437             mask = DATAEXPORT_SCRIPT_X_WITH_INHERITED;
438         } else {
439             mask = DATAEXPORT_SCRIPT_X_WITH_OTHER;
440         }
441 
442         // The new trie value is the index into the new array with the high order bits set
443         uint32_t newScVal = outputIndex | mask;
444 
445         // Update the code point in the mutable trie builder with the trie value
446         umutablecptrie_set(builder.getAlias(), cp, newScVal, status);
447         handleError(status, scxFullPropName);
448     }
449     fputs("]\n\n", f);  // Print the TOML close delimiter for the outer array.
450 
451     // Convert from mutable trie builder to immutable trie.
452     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
453         builder.getAlias(),
454         trieType,
455         scWidth,
456         status));
457     handleError(status, scxFullPropName);
458 
459     fputs("[script_extensions.code_point_trie]\n", f);
460     usrc_writeUCPTrie(f, scxShortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
461 }
462 
prepareOutputFile(const char * basename)463 FILE* prepareOutputFile(const char* basename) {
464     IcuToolErrorCode status("icuexportdata");
465     CharString outFileName;
466     if (destdir != nullptr && *destdir != 0) {
467         outFileName.append(destdir, status).ensureEndsWithFileSeparator(status);
468     }
469     outFileName.append(basename, status);
470     outFileName.append(".toml", status);
471     handleError(status, basename);
472 
473     FILE* f = fopen(outFileName.data(), "w");
474     if (f == nullptr) {
475         std::cerr << "Unable to open file: " << outFileName.data() << std::endl;
476         exit(U_FILE_ACCESS_ERROR);
477     }
478     if (!QUIET) {
479         std::cout << "Writing to: " << outFileName.data() << std::endl;
480     }
481 
482     if (haveCopyright) {
483         usrc_writeCopyrightHeader(f, "#", 2021);
484     }
485     usrc_writeFileNameGeneratedBy(f, "#", basename, "icuexportdata.cpp");
486 
487     return f;
488 }
489 
490 #if !UCONFIG_NO_NORMALIZATION
491 
492 struct PendingDescriptor {
493     UChar32 scalar;
494     uint32_t descriptor;
495     UBool supplementary;
496 };
497 
writeCanonicalCompositions(USet * backwardCombiningStarters)498 void writeCanonicalCompositions(USet* backwardCombiningStarters) {
499     IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions");
500     const char* basename = "compositions";
501     FILE* f = prepareOutputFile(basename);
502 
503     LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status);
504 
505     const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
506     UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
507 
508     const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
509     for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
510         if (c >= 0xD800 && c < 0xE000) {
511             // Surrogate
512             continue;
513         }
514         UnicodeString decomposition;
515         if (!nfc->getRawDecomposition(c, decomposition)) {
516             continue;
517         }
518         int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
519         if (len != 2) {
520             continue;
521         }
522         UChar32 starter = utf32[0];
523         UChar32 second = utf32[1];
524         UChar32 composite = nfc->composePair(starter, second);
525         if (composite < 0) {
526             continue;
527         }
528         if (c != composite) {
529             status.set(U_INTERNAL_PROGRAM_ERROR);
530             handleError(status, basename);
531         }
532         if (!u_getCombiningClass(second)) {
533             uset_add(backwardCombiningStarters, second);
534         }
535         if (composite >= 0xAC00 && composite <= 0xD7A3) {
536             // Hangul syllable
537             continue;
538         }
539 
540         UnicodeString backward;
541         backward.append(second);
542         backward.append(starter);
543         backwardBuilder->add(backward, int32_t(composite), status);
544     }
545     UnicodeString canonicalCompositionTrie;
546     backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status);
547 
548     usrc_writeArray(f, "compositions = [\n  ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), "  ", "\n]\n");
549     fclose(f);
550     handleError(status, basename);
551 }
552 
writeDecompositionTables(const char * basename,const uint16_t * ptr16,size_t len16,const uint32_t * ptr32,size_t len32)553 void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) {
554     FILE* f = prepareOutputFile(basename);
555     usrc_writeArray(f, "scalars16 = [\n  ", ptr16, 16, len16, "  ", "\n]\n");
556     usrc_writeArray(f, "scalars32 = [\n  ", ptr32, 32, len32, "  ", "\n]\n");
557     fclose(f);
558 }
559 
writeDecompositionData(const char * basename,uint32_t baseSize16,uint32_t baseSize32,uint32_t supplementSize16,USet * uset,USet * reference,const std::vector<PendingDescriptor> & pendingTrieInsertions,char16_t passthroughCap)560 void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, char16_t passthroughCap) {
561     IcuToolErrorCode status("icuexportdata: writeDecompositionData");
562     FILE* f = prepareOutputFile(basename);
563 
564     // Zero is a magic number that means the character decomposes to itself.
565     LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
566 
567     // Iterate backwards to insert lower code points in the trie first in case it matters
568     // for trie block allocation.
569     for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
570         const PendingDescriptor& pending = pendingTrieInsertions[i];
571         uint32_t additional = 0;
572         if (!(pending.descriptor & 0xFFFE0000)) {
573             uint32_t offset = pending.descriptor & 0xFFF;
574             if (!pending.supplementary) {
575                 if (offset >= baseSize16) {
576                     // This is a offset to supplementary 16-bit data. We have
577                     // 16-bit base data and 32-bit base data before. However,
578                     // the 16-bit base data length is already part of offset.
579                     additional = baseSize32;
580                 }
581             } else {
582                 if (offset >= baseSize32) {
583                     // This is an offset to supplementary 32-bit data. We have 16-bit
584                     // base data, 32-bit base data, and 16-bit supplementary data before.
585                     // However, the 32-bit base data length is already part
586                     // of offset.
587                     additional = baseSize16 + supplementSize16;
588                 } else {
589                     // This is an offset to 32-bit base data. We have 16-bit
590                     // base data before.
591                     additional = baseSize16;
592                 }
593             }
594             if (offset + additional > 0xFFF) {
595                 status.set(U_INTERNAL_PROGRAM_ERROR);
596                 handleError(status, basename);
597             }
598         }
599         // It turns out it's better to swap the halves compared to the initial
600         // idea in order to put special marker values close to zero so that
601         // an important marker value becomes 1, so it's efficient to compare
602         // "1 or 0". Unfortunately, going through all the code to swap
603         // things is too error prone, so let's do the swapping here in one
604         // place.
605         uint32_t oldTrieValue = pending.descriptor + additional;
606         uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16);
607         umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status);
608     }
609     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
610         builder.getAlias(),
611         trieType,
612         UCPTRIE_VALUE_BITS_32,
613         status));
614     handleError(status, basename);
615 
616     if (reference) {
617         if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
618             // NFD expectations don't hold. The set must not contain the half-width
619             // kana voicing marks and must contain iota subscript.
620             status.set(U_INTERNAL_PROGRAM_ERROR);
621             handleError(status, basename);
622         }
623 
624         USet* halfWidthVoicing = uset_openEmpty();
625         uset_add(halfWidthVoicing, 0xFF9E);
626         uset_add(halfWidthVoicing, 0xFF9F);
627 
628         USet* iotaSubscript = uset_openEmpty();
629         uset_add(iotaSubscript, 0x0345);
630 
631         uint8_t flags = 0;
632 
633         USet* halfWidthCheck = uset_cloneAsThawed(uset);
634         uset_removeAll(halfWidthCheck, reference);
635         if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
636             flags |= 1;
637         } else if (!uset_isEmpty(halfWidthCheck)) {
638             // The result was neither empty nor contained exactly
639             // the two half-width voicing marks. The ICU4X
640             // normalizer doesn't know how to deal with this case.
641             status.set(U_INTERNAL_PROGRAM_ERROR);
642             handleError(status, basename);
643         }
644         uset_close(halfWidthCheck);
645 
646         USet* iotaCheck = uset_cloneAsThawed(reference);
647         uset_removeAll(iotaCheck, uset);
648         if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) {
649             // The result was neither empty nor contained exactly
650             // the iota subscript. The ICU4X normalizer doesn't
651             // know how to deal with this case.
652             status.set(U_INTERNAL_PROGRAM_ERROR);
653             handleError(status, basename);
654         }
655         uset_close(halfWidthCheck);
656 
657         uset_close(iotaSubscript);
658         uset_close(halfWidthVoicing);
659 
660         fprintf(f, "flags = 0x%X\n", flags);
661         fprintf(f, "cap = 0x%X\n", passthroughCap);
662     }
663     fprintf(f, "[trie]\n");
664     usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
665     fclose(f);
666     handleError(status, basename);
667 }
668 
// Marker values used by computeDecompositions() below, listed in ascending
// order.

// Marks a starter that decomposes to itself but that may combine backwards
// under canonical composition.
const int32_t BACKWARD_COMBINING_STARTER_MARKER = 1;

// Marks a character whose decomposition starts with a non-starter and is
// not the character itself.
const int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2;

// Marks the NFKD form of U+FDFA.
const int32_t FDFA_MARKER = 3;

// Marks a complex decomposition that is not round-trippable under
// re-composition.
const uint32_t NON_ROUND_TRIP_MARKER = 1;
683 
permissibleBmpPair(UBool knownToRoundTrip,UChar32 c,UChar32 second)684 UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) {
685     if (knownToRoundTrip) {
686         return true;
687     }
688     // Nuktas, Hebrew presentation forms and polytonic Greek with oxia
689     // are special-cased in ICU4X.
690     if (c >= 0xFB1D && c <= 0xFB4E) {
691         // Hebrew presentation forms
692         return true;
693     }
694     if (c >= 0x1F71 && c <= 0x1FFB) {
695         // Polytonic Greek with oxia
696         return true;
697     }
698     if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) {
699         // Nukta
700         return true;
701     }
702     // To avoid more branchiness, 4 characters that decompose to
703     // a BMP starter followed by a BMP non-starter are excluded
704     // from being encoded directly into the trie value and are
705     // handled as complex decompositions instead. These are:
706     // U+0F76 TIBETAN VOWEL SIGN VOCALIC R
707     // U+0F78 TIBETAN VOWEL SIGN VOCALIC L
708     // U+212B ANGSTROM SIGN
709     // U+2ADC FORKING
710     return false;
711 }
712 
713 // Computes data for canonical decompositions
computeDecompositions(const char * basename,const USet * backwardCombiningStarters,std::vector<uint16_t> & storage16,std::vector<uint32_t> & storage32,USet * decompositionStartsWithNonStarter,USet * decompositionStartsWithBackwardCombiningStarter,std::vector<PendingDescriptor> & pendingTrieInsertions,UChar32 & decompositionPassthroughBound,UChar32 & compositionPassthroughBound)714 void computeDecompositions(const char* basename,
715                            const USet* backwardCombiningStarters,
716                            std::vector<uint16_t>& storage16,
717                            std::vector<uint32_t>& storage32,
718                            USet* decompositionStartsWithNonStarter,
719                            USet* decompositionStartsWithBackwardCombiningStarter,
720                            std::vector<PendingDescriptor>& pendingTrieInsertions,
721                            UChar32& decompositionPassthroughBound,
722                            UChar32& compositionPassthroughBound) {
723     IcuToolErrorCode status("icuexportdata: computeDecompositions");
724     const Normalizer2* mainNormalizer;
725     const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status);
726     const Normalizer2* nfcNormalizer = Normalizer2::getNFCInstance(status);
727     FILE* f = nullptr;
728     std::vector<uint32_t> nonRecursive32;
729     LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status));
730 
731     if (uprv_strcmp(basename, "nfkd") == 0) {
732         mainNormalizer = Normalizer2::getNFKDInstance(status);
733     } else if (uprv_strcmp(basename, "uts46d") == 0) {
734         mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status);
735     } else {
736         mainNormalizer = nfdNormalizer;
737         f = prepareOutputFile("decompositionex");
738     }
739 
740     // Max length as of Unicode 14 is 4 for NFD. For NFKD the max
741     // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB).
742     const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9;
743     const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8;
744     const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
745     UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
746     const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2;
747     UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE];
748 
749     // Iterate over all scalar values excluding Hangul syllables.
750     //
751     // We go backwards in order to better find overlapping decompositions.
752     //
753     // As of Unicode 14:
754     // Iterate forward without overlap search:
755     // nfd: 16 size: 896, 32 size: 173
756     // nfkd: 16 size: 3854, 32 size: 179
757     //
758     // Iterate forward with overlap search:
759     // nfd: 16 size: 888, 32 size: 173
760     // nfkd: 16 size: 3266, 32 size: 179
761     //
762     // Iterate backward with overlap search:
763     // nfd: 16 size: 776, 32 size: 173
764     // nfkd: 16 size: 2941, 32 size: 179
765     //
766     // UChar32 is signed!
767     for (UChar32 c = 0x10FFFF; c >= 0; --c) {
768         if (c >= 0xAC00 && c <= 0xD7A3) {
769             // Hangul syllable
770             continue;
771         }
772         if (c >= 0xD800 && c < 0xE000) {
773             // Surrogate
774             continue;
775         }
776         UnicodeString src;
777         UnicodeString dst;
778         // True if we're building non-NFD or we're building NFD but
779         // the `c` round trips to NFC.
780         // False if we're building NFD and `c` does not round trip to NFC.
781         UBool nonNfdOrRoundTrips = true;
782         src.append(c);
783         if (mainNormalizer != nfdNormalizer) {
784             UnicodeString inter;
785             mainNormalizer->normalize(src, inter, status);
786             nfdNormalizer->normalize(inter, dst, status);
787         } else {
788             nfdNormalizer->normalize(src, dst, status);
789             UnicodeString nfc;
790             nfcNormalizer->normalize(dst, nfc, status);
791             nonNfdOrRoundTrips = (src == nfc);
792         }
793         int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
794         if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
795             // Characters that normalize to nothing or to U+FFFD (without the
796             // input being U+FFFD) in ICU4C's UTS 46 normalization normalize
797             // as in NFD in ICU4X's UTF 46 normalization in the interest
798             // of data size and ICU4X's normalizer being unable to handle
799             // normalizing to nothing.
800             // When UTS 46 is implemented on top of ICU4X, a preprocessing
801             // step is supposed to remove these characters before the
802             // normalization step.
803             if (uprv_strcmp(basename, "uts46d") != 0) {
804                 status.set(U_INTERNAL_PROGRAM_ERROR);
805                 handleError(status, basename);
806             }
807             nfdNormalizer->normalize(src, dst, status);
808             len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
809             if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
810                 status.set(U_INTERNAL_PROGRAM_ERROR);
811                 handleError(status, basename);
812             }
813         }
814         if (len > DECOMPOSITION_BUFFER_SIZE) {
815             status.set(U_INTERNAL_PROGRAM_ERROR);
816             handleError(status, basename);
817         }
818         uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]);
819         bool specialNonStarterDecomposition = false;
820         bool startsWithBackwardCombiningStarter = false;
821         if (firstCombiningClass) {
822             decompositionPassthroughBound = c;
823             compositionPassthroughBound = c;
824             uset_add(decompositionStartsWithNonStarter, c);
825             if (src != dst) {
826                 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) {
827                     specialNonStarterDecomposition = true;
828                 } else {
829                     // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
830                     status.set(U_INTERNAL_PROGRAM_ERROR);
831                     handleError(status, basename);
832                 }
833             }
834         } else if (uset_contains(backwardCombiningStarters, utf32[0])) {
835             compositionPassthroughBound = c;
836             startsWithBackwardCombiningStarter = true;
837             uset_add(decompositionStartsWithBackwardCombiningStarter, c);
838         }
839         if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) {
840             status.set(U_INTERNAL_PROGRAM_ERROR);
841             handleError(status, basename);
842         }
843         if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) {
844             status.set(U_INTERNAL_PROGRAM_ERROR);
845             handleError(status, basename);
846         }
847         if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) {
848             status.set(U_INTERNAL_PROGRAM_ERROR);
849             handleError(status, basename);
850         }
851         if (mainNormalizer != nfdNormalizer) {
852             UnicodeString nfd;
853             nfdNormalizer->normalize(src, nfd, status);
854             if (dst == nfd) {
855                 continue;
856             }
857             decompositionPassthroughBound = c;
858             compositionPassthroughBound = c;
859         } else if (firstCombiningClass) {
860             len = 1;
861             if (specialNonStarterDecomposition) {
862                 utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value
863             } else {
864                 // Use the surrogate range to store the canonical combining class
865                 utf32[0] = 0xD800 | UChar32(firstCombiningClass);
866             }
867         } else {
868             if (src == dst) {
869                 if (startsWithBackwardCombiningStarter) {
870                     pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, false});
871                 }
872                 continue;
873             }
874             decompositionPassthroughBound = c;
875             // ICU4X hard-codes ANGSTROM SIGN
876             if (c != 0x212B) {
877                 UnicodeString raw;
878                 if (!nfdNormalizer->getRawDecomposition(c, raw)) {
879                     // We're always supposed to have a non-recursive decomposition
880                     // if we had a recursive one.
881                     status.set(U_INTERNAL_PROGRAM_ERROR);
882                     handleError(status, basename);
883                 }
884                 // In addition to actual difference, put the whole range that contains characters
885                 // with oxia into the non-recursive trie in order to catch cases where characters
886                 // with oxia have singleton decompositions to corresponding characters with tonos.
887                 // This way, the run-time decision to fall through can be done on the range
888                 // without checking for individual characters inside the range.
889                 if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) {
890                     int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status);
891                     if (!rawLen) {
892                         status.set(U_INTERNAL_PROGRAM_ERROR);
893                         handleError(status, basename);
894                     }
895                     if (rawLen == 1) {
896                         if (c >= 0xFFFF) {
897                             status.set(U_INTERNAL_PROGRAM_ERROR);
898                             handleError(status, basename);
899                         }
900                         umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, uint32_t(rawUtf32[0]), status);
901                     } else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) {
902                         if (!rawUtf32[0] || !rawUtf32[1]) {
903                             status.set(U_INTERNAL_PROGRAM_ERROR);
904                             handleError(status, basename);
905                         }
906                         // Swapped for consistency with the primary trie
907                         uint32_t bmpPair = uint32_t(rawUtf32[1]) << 16 | uint32_t(rawUtf32[0]);
908                         umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status);
909                     } else {
910                         // Let's add 1 to index to make it always non-zero to distinguish
911                         // it from the default zero.
912                         uint32_t index = nonRecursive32.size() + 1;
913                         nonRecursive32.push_back(uint32_t(rawUtf32[0]));
914                         nonRecursive32.push_back(uint32_t(rawUtf32[1]));
915                         if (index > 0xFFFF) {
916                             status.set(U_INTERNAL_PROGRAM_ERROR);
917                             handleError(status, basename);
918                         }
919                         umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status);
920                     }
921                 }
922             }
923         }
924         if (!nonNfdOrRoundTrips) {
925             compositionPassthroughBound = c;
926         }
927         if (len == 1 && utf32[0] <= 0xFFFF) {
928             if (startsWithBackwardCombiningStarter) {
929                 if (mainNormalizer == nfdNormalizer) {
930                     // Not supposed to happen in NFD
931                     status.set(U_INTERNAL_PROGRAM_ERROR);
932                     handleError(status, basename);
933                 } else if (!((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) {
934                     // Other than conjoining jamo vowels and trails
935                     // unsupported for non-NFD.
936                     status.set(U_INTERNAL_PROGRAM_ERROR);
937                     handleError(status, basename);
938                 }
939             }
940             pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, false});
941         } else if (len == 2 &&
942                    utf32[0] <= 0xFFFF &&
943                    utf32[1] <= 0xFFFF &&
944                    !u_getCombiningClass(utf32[0]) &&
945                    u_getCombiningClass(utf32[1]) &&
946                    permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) {
947             for (int32_t i = 0; i < len; ++i) {
948                 if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
949                     // Assert that iota subscript and half-width voicing marks never occur in these
950                     // expansions in the normalization forms where they are special.
951                     status.set(U_INTERNAL_PROGRAM_ERROR);
952                     handleError(status, basename);
953                 }
954             }
955             if (startsWithBackwardCombiningStarter) {
956                 status.set(U_INTERNAL_PROGRAM_ERROR);
957                 handleError(status, basename);
958             }
959             pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), false});
960         } else {
961             if (startsWithBackwardCombiningStarter) {
962                 status.set(U_INTERNAL_PROGRAM_ERROR);
963                 handleError(status, basename);
964             }
965 
966             UBool supplementary = false;
967             UBool nonInitialStarter = false;
968             for (int32_t i = 0; i < len; ++i) {
969                 if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
970                     // Assert that iota subscript and half-width voicing marks never occur in these
971                     // expansions in the normalization forms where they are special.
972                     status.set(U_INTERNAL_PROGRAM_ERROR);
973                     handleError(status, basename);
974                 }
975 
976                 if (utf32[i] > 0xFFFF) {
977                     supplementary = true;
978                 }
979                 if (utf32[i] == 0) {
980                     status.set(U_INTERNAL_PROGRAM_ERROR);
981                     handleError(status, basename);
982                 }
983                 if (i != 0 && !u_getCombiningClass(utf32[i])) {
984                     nonInitialStarter = true;
985                 }
986             }
987             if (!supplementary) {
988                 if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
989                     if (len == 18 && c == 0xFDFA) {
990                         // Special marker for the one character whose decomposition
991                         // is too long.
992                         pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary});
993                         continue;
994                     } else {
995                         status.set(U_INTERNAL_PROGRAM_ERROR);
996                         handleError(status, basename);
997                     }
998                 }
999             } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
1000                 status.set(U_INTERNAL_PROGRAM_ERROR);
1001                 handleError(status, basename);
1002             }
1003             // Complex decomposition
1004             // Format for 16-bit value:
1005             // 15..13: length minus two for 16-bit case and length minus one for
1006             //         the 32-bit case. Length 8 needs to fit in three bits in
1007             //         the 16-bit case, and this way the value is future-proofed
1008             //         up to 9 in the 16-bit case. Zero is unused and length one
1009             //         in the 16-bit case goes directly into the trie.
1010             //     12: 1 if all trailing characters are guaranteed non-starters,
1011             //         0 if no guarantees about non-starterness.
1012             //         Note: The bit choice is this way around to allow for
1013             //         dynamically falling back to not having this but instead
1014             //         having one more bit for length by merely choosing
1015             //         different masks.
1016             //  11..0: Start offset in storage. The offset is to the logical
1017             //         sequence of scalars16, scalars32, supplementary_scalars16,
1018             //         supplementary_scalars32.
1019             uint32_t descriptor = uint32_t(!nonInitialStarter) << 12;
1020             if (!supplementary) {
1021                 descriptor |= (uint32_t(len) - 2) << 13;
1022             } else {
1023                 descriptor |= (uint32_t(len) - 1) << 13;
1024             }
1025             if (descriptor & 0xFFF) {
1026                 status.set(U_INTERNAL_PROGRAM_ERROR);
1027                 handleError(status, basename);
1028             }
1029             size_t index = 0;
1030             bool writeToStorage = false;
1031             // Sadly, C++ lacks break and continue by label, so using goto in the
1032             // inner loops to break or continue the outer loop.
1033             if (!supplementary) {
1034                 outer16: for (;;) {
1035                     if (index == storage16.size()) {
1036                         writeToStorage = true;
1037                         break;
1038                     }
1039                     if (storage16[index] == utf32[0]) {
1040                         for (int32_t i = 1; i < len; ++i) {
1041                             if (storage16[index + i] != uint32_t(utf32[i])) {
1042                                 ++index;
1043                                 // continue outer
1044                                 goto outer16;
1045                             }
1046                         }
1047                         // break outer
1048                         goto after;
1049                     }
1050                     ++index;
1051                 }
1052             } else {
1053                 outer32: for (;;) {
1054                     if (index == storage32.size()) {
1055                         writeToStorage = true;
1056                         break;
1057                     }
1058                     if (storage32[index] == uint32_t(utf32[0])) {
1059                         for (int32_t i = 1; i < len; ++i) {
1060                             if (storage32[index + i] != uint32_t(utf32[i])) {
1061                                 ++index;
1062                                 // continue outer
1063                                 goto outer32;
1064                             }
1065                         }
1066                         // break outer
1067                         goto after;
1068                     }
1069                     ++index;
1070                 }
1071             }
1072             after:
1073             if (index > 0xFFF) {
1074                 status.set(U_INTERNAL_PROGRAM_ERROR);
1075                 handleError(status, basename);
1076             }
1077             descriptor |= uint32_t(index);
1078             if (!descriptor || descriptor > 0xFFFF) {
1079                 // > 0xFFFF should never happen if the code above is correct.
1080                 // == 0 should not happen due to the nature of the data.
1081                 status.set(U_INTERNAL_PROGRAM_ERROR);
1082                 handleError(status, basename);
1083             }
1084             if (writeToStorage) {
1085                 if (!supplementary) {
1086                     for (int32_t i = 0; i < len; ++i) {
1087                         storage16.push_back(uint16_t(utf32[i]));
1088                     }
1089                 } else {
1090                     for (int32_t i = 0; i < len; ++i) {
1091                         storage32.push_back(uint32_t(utf32[i]));
1092                     }
1093                 }
1094             }
1095 
1096             uint32_t nonRoundTripMarker = 0;
1097             if (!nonNfdOrRoundTrips) {
1098                 nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16);
1099             }
1100             pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker, supplementary});
1101         }
1102     }
1103     if (storage16.size() + storage32.size() > 0xFFF) {
1104         status.set(U_INTERNAL_PROGRAM_ERROR);
1105     }
1106     if (f) {
1107         usrc_writeArray(f, "scalars32 = [\n  ", nonRecursive32.data(), 32, nonRecursive32.size(), "  ", "\n]\n");
1108 
1109         LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1110             nonRecursiveBuilder.getAlias(),
1111             trieType,
1112             UCPTRIE_VALUE_BITS_32,
1113             status));
1114         handleError(status, basename);
1115 
1116         fprintf(f, "[trie]\n");
1117         usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1118 
1119         fclose(f);
1120     }
1121     handleError(status, basename);
1122 }
1123 
1124 #endif // !UCONFIG_NO_NORMALIZATION
1125 
// Indices into the options[] table. The enumerators are listed in the same
// order as the entries of that table.
enum {
    OPT_HELP_H = 0,
    OPT_HELP_QUESTION_MARK,
    OPT_MODE,
    OPT_TRIE_TYPE,
    OPT_VERSION,
    OPT_DESTDIR,
    OPT_ALL,
    OPT_INDEX,
    OPT_COPYRIGHT,
    OPT_VERBOSE,
    OPT_QUIET,

    // Number of options above; has to stay last.
    OPT_COUNT
};
1141 
1142 #define UOPTION_MODE UOPTION_DEF("mode", 'm', UOPT_REQUIRES_ARG)
1143 #define UOPTION_TRIE_TYPE UOPTION_DEF("trie-type", '\1', UOPT_REQUIRES_ARG)
1144 #define UOPTION_ALL UOPTION_DEF("all", '\1', UOPT_NO_ARG)
1145 #define UOPTION_INDEX UOPTION_DEF("index", '\1', UOPT_NO_ARG)
1146 
1147 static UOption options[]={
1148     UOPTION_HELP_H,
1149     UOPTION_HELP_QUESTION_MARK,
1150     UOPTION_MODE,
1151     UOPTION_TRIE_TYPE,
1152     UOPTION_VERSION,
1153     UOPTION_DESTDIR,
1154     UOPTION_ALL,
1155     UOPTION_INDEX,
1156     UOPTION_COPYRIGHT,
1157     UOPTION_VERBOSE,
1158     UOPTION_QUIET,
1159 };
1160 
printHelp(FILE * stdfile,const char * program)1161 void printHelp(FILE* stdfile, const char* program) {
1162   fprintf(stdfile,
1163           "usage: %s -m mode [-options] [--all | properties...]\n"
1164           "\tdump Unicode property data to .toml files\n"
1165           "options:\n"
1166           "\t-h or -? or --help  this usage text\n"
1167           "\t-V or --version     show a version message\n"
1168           "\t-m or --mode        mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n"
1169           "\t      --trie-type   set the trie type (small or fast, default small)\n"
1170           "\t-d or --destdir     destination directory, followed by the path\n"
1171           "\t      --all         write out all properties known to icuexportdata\n"
1172           "\t      --index       write an _index.toml summarizing all data exported\n"
1173           "\t-c or --copyright   include a copyright notice\n"
1174           "\t-v or --verbose     Turn on verbose output\n"
1175           "\t-q or --quiet       do not display warnings and progress\n",
1176           program);
1177 }
1178 
exportUprops(int argc,char * argv[])1179 int exportUprops(int argc, char* argv[]) {
1180     // Load list of Unicode properties
1181     std::vector<const char*> propNames;
1182     for (int i=1; i<argc; i++) {
1183         propNames.push_back(argv[i]);
1184     }
1185     if (options[OPT_ALL].doesOccur) {
1186         int i = UCHAR_BINARY_START;
1187         while (true) {
1188             if (i == UCHAR_BINARY_LIMIT) {
1189                 i = UCHAR_INT_START;
1190             }
1191             if (i == UCHAR_INT_LIMIT) {
1192                 i = UCHAR_GENERAL_CATEGORY_MASK;
1193             }
1194             if (i == UCHAR_GENERAL_CATEGORY_MASK + 1) {
1195                 i = UCHAR_BIDI_MIRRORING_GLYPH;
1196             }
1197             if (i == UCHAR_BIDI_MIRRORING_GLYPH + 1) {
1198                 i = UCHAR_SCRIPT_EXTENSIONS;
1199             }
1200             if (i == UCHAR_SCRIPT_EXTENSIONS + 1) {
1201                 break;
1202             }
1203             UProperty uprop = static_cast<UProperty>(i);
1204             const char* propName = u_getPropertyName(uprop, U_SHORT_PROPERTY_NAME);
1205             if (propName == nullptr) {
1206                 propName = u_getPropertyName(uprop, U_LONG_PROPERTY_NAME);
1207                 if (propName != nullptr && VERBOSE) {
1208                     std::cerr << "Note: falling back to long name for: " << propName << std::endl;
1209                 }
1210             }
1211             if (propName != nullptr) {
1212                 propNames.push_back(propName);
1213             } else {
1214                 std::cerr << "Warning: Could not find name for: " << uprop << std::endl;
1215             }
1216             i++;
1217         }
1218     }
1219 
1220     if (propNames.empty()
1221             || options[OPT_HELP_H].doesOccur
1222             || options[OPT_HELP_QUESTION_MARK].doesOccur
1223             || !options[OPT_MODE].doesOccur) {
1224         FILE *stdfile=argc<0 ? stderr : stdout;
1225         fprintf(stdfile,
1226             "usage: %s -m uprops [-options] [--all | properties...]\n"
1227             "\tdump Unicode property data to .toml files\n"
1228             "options:\n"
1229             "\t-h or -? or --help  this usage text\n"
1230             "\t-V or --version     show a version message\n"
1231             "\t-m or --mode        mode: currently only 'uprops', but more may be added\n"
1232             "\t      --trie-type   set the trie type (small or fast, default small)\n"
1233             "\t-d or --destdir     destination directory, followed by the path\n"
1234             "\t      --all         write out all properties known to icuexportdata\n"
1235             "\t      --index       write an _index.toml summarizing all data exported\n"
1236             "\t-c or --copyright   include a copyright notice\n"
1237             "\t-v or --verbose     Turn on verbose output\n"
1238             "\t-q or --quiet       do not display warnings and progress\n",
1239             argv[0]);
1240         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1241     }
1242 
1243     const char* mode = options[OPT_MODE].value;
1244     if (uprv_strcmp(mode, "uprops") != 0) {
1245         fprintf(stderr, "Invalid option for --mode (must be uprops)\n");
1246         return U_ILLEGAL_ARGUMENT_ERROR;
1247     }
1248 
1249     if (options[OPT_TRIE_TYPE].doesOccur) {
1250         if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1251             trieType = UCPTRIE_TYPE_FAST;
1252         } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1253             trieType = UCPTRIE_TYPE_SMALL;
1254         } else {
1255             fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1256             return U_ILLEGAL_ARGUMENT_ERROR;
1257         }
1258     }
1259 
1260     for (const char* propName : propNames) {
1261         UProperty propEnum = u_getPropertyEnum(propName);
1262         if (propEnum == UCHAR_INVALID_CODE) {
1263             std::cerr << "Error: Invalid property alias: " << propName << std::endl;
1264             return U_ILLEGAL_ARGUMENT_ERROR;
1265         }
1266 
1267         FILE* f = prepareOutputFile(propName);
1268 
1269         UVersionInfo versionInfo;
1270         u_getUnicodeVersion(versionInfo);
1271         char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1272         u_versionToString(versionInfo, uvbuf);
1273         fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1274             U_ICU_VERSION,
1275             uvbuf);
1276 
1277         if (propEnum < UCHAR_BINARY_LIMIT) {
1278             dumpBinaryProperty(propEnum, f);
1279         } else if (UCHAR_INT_START <= propEnum && propEnum <= UCHAR_INT_LIMIT) {
1280             dumpEnumeratedProperty(propEnum, f);
1281         } else if (propEnum == UCHAR_GENERAL_CATEGORY_MASK) {
1282             dumpGeneralCategoryMask(f);
1283         } else if (propEnum == UCHAR_BIDI_MIRRORING_GLYPH) {
1284             dumpBidiMirroringGlyph(f);
1285         } else if (propEnum == UCHAR_SCRIPT_EXTENSIONS) {
1286             dumpScriptExtensions(f);
1287         } else {
1288             std::cerr << "Don't know how to write property: " << propEnum << std::endl;
1289             return U_INTERNAL_PROGRAM_ERROR;
1290         }
1291 
1292         fclose(f);
1293     }
1294 
1295     if (options[OPT_INDEX].doesOccur) {
1296         FILE* f = prepareOutputFile("_index");
1297         fprintf(f, "index = [\n");
1298         for (const char* propName : propNames) {
1299             // At this point, propName is a valid property name, so it should be alphanum ASCII
1300             fprintf(f, "  { filename=\"%s.toml\" },\n", propName);
1301         }
1302         fprintf(f, "]\n");
1303         fclose(f);
1304     }
1305 
1306     return 0;
1307 }
1308 
1309 struct AddRangeHelper {
1310     UMutableCPTrie* ucptrie;
1311 };
1312 
1313 static UBool U_CALLCONV
addRangeToUCPTrie(const void * context,UChar32 start,UChar32 end,uint32_t value)1314 addRangeToUCPTrie(const void* context, UChar32 start, UChar32 end, uint32_t value) {
1315     IcuToolErrorCode status("addRangeToUCPTrie");
1316     UMutableCPTrie* ucptrie = ((const AddRangeHelper*) context)->ucptrie;
1317     umutablecptrie_setRange(ucptrie, start, end, value, status);
1318     handleError(status, "setRange");
1319 
1320     return true;
1321 }
1322 
exportCase(int argc,char * argv[])1323 int exportCase(int argc, char* argv[]) {
1324     if (argc > 1) {
1325         fprintf(stderr, "ucase mode does not expect additional arguments\n");
1326         return U_ILLEGAL_ARGUMENT_ERROR;
1327     }
1328     (void) argv; // Suppress unused variable warning
1329 
1330     IcuToolErrorCode status("icuexportdata");
1331     LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
1332     handleError(status, "exportCase");
1333 
1334     int32_t exceptionsLength, unfoldLength;
1335     const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength);
1336     const UTrie2* caseTrie = &caseProps->trie;
1337 
1338     AddRangeHelper helper = { builder.getAlias() };
1339     utrie2_enum(caseTrie, nullptr, addRangeToUCPTrie, &helper);
1340 
1341     UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16;
1342     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1343         builder.getAlias(),
1344         trieType,
1345         width,
1346         status));
1347     handleError(status, "exportCase");
1348 
1349     FILE* f = prepareOutputFile("ucase");
1350 
1351     UVersionInfo versionInfo;
1352     u_getUnicodeVersion(versionInfo);
1353     char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1354     u_versionToString(versionInfo, uvbuf);
1355     fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1356             U_ICU_VERSION,
1357             uvbuf);
1358 
1359     fputs("[ucase.code_point_trie]\n", f);
1360     usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1361     fputs("\n", f);
1362 
1363     const char* indent = "  ";
1364     const char* suffix = "\n]\n";
1365 
1366     fputs("[ucase.exceptions]\n", f);
1367     const char* exceptionsPrefix = "exceptions = [\n  ";
1368     int32_t exceptionsWidth = 16;
1369     usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth,
1370                     exceptionsLength, indent, suffix);
1371     fputs("\n", f);
1372 
1373     fputs("[ucase.unfold]\n", f);
1374     const char* unfoldPrefix = "unfold = [\n  ";
1375     int32_t unfoldWidth = 16;
1376     usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth,
1377                     unfoldLength, indent, suffix);
1378 
1379     return 0;
1380 }
1381 
1382 #if !UCONFIG_NO_NORMALIZATION
1383 
exportNorm()1384 int exportNorm() {
1385     IcuToolErrorCode status("icuexportdata: exportNorm");
1386     USet* backwardCombiningStarters = uset_openEmpty();
1387     writeCanonicalCompositions(backwardCombiningStarters);
1388 
1389     std::vector<uint16_t> storage16;
1390     std::vector<uint32_t> storage32;
1391 
1392     // Note: the USets are not exported. They are only used to check that a new
1393     // Unicode version doesn't violate expectations that are hard-coded in ICU4X.
1394     USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
1395     USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1396     std::vector<PendingDescriptor> nfdPendingTrieInsertions;
1397     UChar32 nfdBound = 0x10FFFF;
1398     UChar32 nfcBound = 0x10FFFF;
1399     computeDecompositions("nfd",
1400                           backwardCombiningStarters,
1401                           storage16,
1402                           storage32,
1403                           nfdDecompositionStartsWithNonStarter,
1404                           nfdDecompositionStartsWithBackwardCombiningStarter,
1405                           nfdPendingTrieInsertions,
1406                           nfdBound,
1407                           nfcBound);
1408     if (!(nfdBound == 0xC0 && nfcBound == 0x300)) {
1409         // Unexpected bounds for NFD/NFC.
1410         status.set(U_INTERNAL_PROGRAM_ERROR);
1411         handleError(status, "exportNorm");
1412     }
1413 
1414     uint32_t baseSize16 = storage16.size();
1415     uint32_t baseSize32 = storage32.size();
1416 
1417     USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty();
1418     USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1419     std::vector<PendingDescriptor> nfkdPendingTrieInsertions;
1420     UChar32 nfkdBound = 0x10FFFF;
1421     UChar32 nfkcBound = 0x10FFFF;
1422     computeDecompositions("nfkd",
1423                           backwardCombiningStarters,
1424                           storage16,
1425                           storage32,
1426                           nfkdDecompositionStartsWithNonStarter,
1427                           nfkdDecompositionStartsWithBackwardCombiningStarter,
1428                           nfkdPendingTrieInsertions,
1429                           nfkdBound,
1430                           nfkcBound);
1431     if (!(nfkdBound <= 0xC0 && nfkcBound <= 0x300)) {
1432         status.set(U_INTERNAL_PROGRAM_ERROR);
1433         handleError(status, "exportNorm");
1434     }
1435     if (nfkcBound > 0xC0) {
1436         if (nfkdBound != 0xC0) {
1437             status.set(U_INTERNAL_PROGRAM_ERROR);
1438             handleError(status, "exportNorm");
1439         }
1440     } else {
1441         if (nfkdBound != nfkcBound) {
1442             status.set(U_INTERNAL_PROGRAM_ERROR);
1443             handleError(status, "exportNorm");
1444         }
1445     }
1446 
1447     USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty();
1448     USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1449     std::vector<PendingDescriptor> uts46PendingTrieInsertions;
1450     UChar32 uts46dBound = 0x10FFFF;
1451     UChar32 uts46Bound = 0x10FFFF;
1452     computeDecompositions("uts46d",
1453                           backwardCombiningStarters,
1454                           storage16,
1455                           storage32,
1456                           uts46DecompositionStartsWithNonStarter,
1457                           uts46DecompositionStartsWithBackwardCombiningStarter,
1458                           uts46PendingTrieInsertions,
1459                           uts46dBound,
1460                           uts46Bound);
1461     if (!(uts46dBound <= 0xC0 && uts46Bound <= 0x300)) {
1462         status.set(U_INTERNAL_PROGRAM_ERROR);
1463         handleError(status, "exportNorm");
1464     }
1465     if (uts46Bound > 0xC0) {
1466         if (uts46dBound != 0xC0) {
1467             status.set(U_INTERNAL_PROGRAM_ERROR);
1468             handleError(status, "exportNorm");
1469         }
1470     } else {
1471         if (uts46dBound != uts46Bound) {
1472             status.set(U_INTERNAL_PROGRAM_ERROR);
1473             handleError(status, "exportNorm");
1474         }
1475     }
1476 
1477     uint32_t supplementSize16 = storage16.size() - baseSize16;
1478     uint32_t supplementSize32 = storage32.size() - baseSize32;
1479 
1480     writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, char16_t(nfcBound));
1481     writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, char16_t(nfkcBound));
1482     writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, char16_t(uts46Bound));
1483 
1484     writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32);
1485     writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32);
1486 
1487     uset_close(nfdDecompositionStartsWithNonStarter);
1488     uset_close(nfkdDecompositionStartsWithNonStarter);
1489     uset_close(uts46DecompositionStartsWithNonStarter);
1490 
1491     uset_close(nfdDecompositionStartsWithBackwardCombiningStarter);
1492     uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter);
1493     uset_close(uts46DecompositionStartsWithBackwardCombiningStarter);
1494 
1495     uset_close(backwardCombiningStarters);
1496     handleError(status, "exportNorm");
1497     return 0;
1498 }
1499 
1500 #endif // !UCONFIG_NO_NORMALIZATION
1501 
main(int argc,char * argv[])1502 int main(int argc, char* argv[]) {
1503     U_MAIN_INIT_ARGS(argc, argv);
1504 
1505     /* preset then read command line options */
1506     options[OPT_DESTDIR].value=u_getDataDirectory();
1507     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
1508 
1509     if(options[OPT_VERSION].doesOccur) {
1510         printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n",
1511                U_ICU_DATA_VERSION);
1512         printf("%s\n", U_COPYRIGHT_STRING);
1513         exit(0);
1514     }
1515 
1516     /* error handling, printing usage message */
1517     if(argc<0) {
1518         fprintf(stderr,
1519             "error in command line argument \"%s\"\n",
1520             argv[-argc]);
1521     }
1522 
1523     if (argc < 0
1524             || options[OPT_HELP_H].doesOccur
1525             || options[OPT_HELP_QUESTION_MARK].doesOccur
1526             || !options[OPT_MODE].doesOccur) {
1527         FILE *stdfile=argc<0 ? stderr : stdout;
1528         printHelp(stdfile, argv[0]);
1529         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1530     }
1531 
1532     /* get the options values */
1533     haveCopyright = options[OPT_COPYRIGHT].doesOccur;
1534     destdir = options[OPT_DESTDIR].value;
1535     VERBOSE = options[OPT_VERBOSE].doesOccur;
1536     QUIET = options[OPT_QUIET].doesOccur;
1537 
1538     if (options[OPT_TRIE_TYPE].doesOccur) {
1539         if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1540             trieType = UCPTRIE_TYPE_FAST;
1541         } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1542             trieType = UCPTRIE_TYPE_SMALL;
1543         } else {
1544             fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1545             return U_ILLEGAL_ARGUMENT_ERROR;
1546         }
1547     }
1548 
1549     const char* mode = options[OPT_MODE].value;
1550     if (uprv_strcmp(mode, "norm") == 0) {
1551 #if !UCONFIG_NO_NORMALIZATION
1552         return exportNorm();
1553 #else
1554     fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n");
1555     return U_ILLEGAL_ARGUMENT_ERROR;
1556 #endif
1557     }
1558     if (uprv_strcmp(mode, "uprops") == 0) {
1559         return exportUprops(argc, argv);
1560     } else if (uprv_strcmp(mode, "ucase") == 0) {
1561         return exportCase(argc, argv);
1562     }
1563 
1564     fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n");
1565     return U_ILLEGAL_ARGUMENT_ERROR;
1566 }
1567