• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include <cstddef>
5 #include <cstdint>
6 #include <cstdio>
7 #include <iostream>
8 #include "unicode/localpointer.h"
9 #include "unicode/umachine.h"
10 #include "unicode/unistr.h"
11 #include "unicode/urename.h"
12 #include "unicode/uset.h"
13 #include <vector>
14 #include <algorithm>
15 #include "toolutil.h"
16 #include "uoptions.h"
17 #include "cmemory.h"
18 #include "charstr.h"
19 #include "cstring.h"
20 #include "unicode/uchar.h"
21 #include "unicode/errorcode.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uscript.h"
24 #include "unicode/putil.h"
25 #include "unicode/umutablecptrie.h"
26 #include "unicode/ucharstriebuilder.h"
27 #include "ucase.h"
28 #include "unicode/normalizer2.h"
29 #include "normalizer2impl.h"
30 #include "writesrc.h"
31 
32 U_NAMESPACE_USE
33 
34 /*
35  * Global - verbosity
36  */
37 UBool VERBOSE = false;  // verbose-output flag; no reader is visible in this part of the file
38 UBool QUIET = false;    // when true, prepareOutputFile() does not print its "Writing to:" line
39 
// Output settings consulted by the writer functions below.
40 UBool haveCopyright = true;               // prepareOutputFile() writes the copyright header only when true
41 UCPTrieType trieType = UCPTRIE_TYPE_SMALL; // trie type passed to every umutablecptrie_buildImmutable() call
42 const char* destdir = "";                  // output directory; when null or empty, prepareOutputFile() uses the bare file name
43 
44 // Mask constants for modified values in the Script CodePointTrie, values are logically 12-bits.
// dumpScriptExtensions() ORs exactly one of these into each trie value it rewrites, choosing
// the mask from the code point's Script value; the remaining low bits hold an index into the
// exported script_code_array.
45 int16_t DATAEXPORT_SCRIPT_X_WITH_COMMON    = 0x0400;  // used when Script == USCRIPT_COMMON
46 int16_t DATAEXPORT_SCRIPT_X_WITH_INHERITED = 0x0800;  // used when Script == USCRIPT_INHERITED
47 int16_t DATAEXPORT_SCRIPT_X_WITH_OTHER     = 0x0c00;  // used for every other Script value
48 
49 // TODO(ICU-21821): Replace this with a call to a library function
50 int32_t scxCodePoints[] = {
51       7415, 7377, 7380, 7387, 7390, 7391, 7394, 7395, 7396, 7397,
52       7398, 7399, 7400, 7403, 7404, 7406, 7407, 7408, 7409, 113824,
53       113825, 113826, 113827, 834, 837, 7616, 7617, 12294, 12350, 12351,
54       12688, 12689, 12690, 12691, 12692, 12693, 12694, 12695, 12696, 12697,
55       12698, 12699, 12700, 12701, 12702, 12703, 12736, 12737, 12738, 12739,
56       12740, 12741, 12742, 12743, 12744, 12745, 12746, 12747, 12748, 12749,
57       12750, 12751, 12752, 12753, 12754, 12755, 12756, 12757, 12758, 12759,
58       12760, 12761, 12762, 12763, 12764, 12765, 12766, 12767, 12768, 12769,
59       12770, 12771, 12832, 12833, 12834, 12835, 12836, 12837, 12838, 12839,
60       12840, 12841, 12842, 12843, 12844, 12845, 12846, 12847, 12848, 12849,
61       12850, 12851, 12852, 12853, 12854, 12855, 12856, 12857, 12858, 12859,
62       12860, 12861, 12862, 12863, 12864, 12865, 12866, 12867, 12868, 12869,
63       12870, 12871, 12928, 12929, 12930, 12931, 12932, 12933, 12934, 12935,
64       12936, 12937, 12938, 12939, 12940, 12941, 12942, 12943, 12944, 12945,
65       12946, 12947, 12948, 12949, 12950, 12951, 12952, 12953, 12954, 12955,
66       12956, 12957, 12958, 12959, 12960, 12961, 12962, 12963, 12964, 12965,
67       12966, 12967, 12968, 12969, 12970, 12971, 12972, 12973, 12974, 12975,
68       12976, 12992, 12993, 12994, 12995, 12996, 12997, 12998, 12999, 13000,
69       13001, 13002, 13003, 13055, 13144, 13145, 13146, 13147, 13148, 13149,
70       13150, 13151, 13152, 13153, 13154, 13155, 13156, 13157, 13158, 13159,
71       13160, 13161, 13162, 13163, 13164, 13165, 13166, 13167, 13168, 13179,
72       13180, 13181, 13182, 13183, 13280, 13281, 13282, 13283, 13284, 13285,
73       13286, 13287, 13288, 13289, 13290, 13291, 13292, 13293, 13294, 13295,
74       13296, 13297, 13298, 13299, 13300, 13301, 13302, 13303, 13304, 13305,
75       13306, 13307, 13308, 13309, 13310, 119648, 119649, 119650, 119651, 119652,
76       119653, 119654, 119655, 119656, 119657, 119658, 119659, 119660, 119661, 119662,
77       119663, 119664, 119665, 127568, 127569, 867, 868, 869, 870, 871,
78       872, 873, 874, 875, 876, 877, 878, 879, 7418, 7674,
79       66272, 66273, 66274, 66275, 66276, 66277, 66278, 66279, 66280, 66281,
80       66282, 66283, 66284, 66285, 66286, 66287, 66288, 66289, 66290, 66291,
81       66292, 66293, 66294, 66295, 66296, 66297, 66298, 66299, 1748, 64830,
82       64831, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619,
83       1620, 1621, 1648, 65010, 65021, 7381, 7382, 7384, 7393, 7402,
84       7405, 7413, 7414, 43249, 12330, 12331, 12332, 12333, 43471, 65794,
85       65847, 65848, 65849, 65850, 65851, 65852, 65853, 65854, 65855, 1156,
86       1159, 11843, 42607, 1157, 1158, 1155, 7672, 7379, 7411, 7416,
87       7417, 7401, 7383, 7385, 7388, 7389, 7392, 43251, 4347, 3046,
88       3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3056,
89       3057, 3058, 3059, 70401, 70403, 70459, 70460, 73680, 73681, 73683,
90       2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799,
91       2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671,
92       42752, 42753, 42754, 42755, 42756, 42757, 42758, 42759, 12337, 12338,
93       12339, 12340, 12341, 12441, 12442, 12443, 12444, 12448, 12540, 65392,
94       65438, 65439, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309,
95       3310, 3311, 8239, 68338, 6146, 6147, 6149, 1564, 1632, 1633,
96       1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 2534, 2535,
97       2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 4160, 4161,
98       4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 65792, 65793,
99       65799, 65800, 65801, 65802, 65803, 65804, 65805, 65806, 65807, 65808,
100       65809, 65810, 65811, 65812, 65813, 65814, 65815, 65816, 65817, 65818,
101       65819, 65820, 65821, 65822, 65823, 65824, 65825, 65826, 65827, 65828,
102       65829, 65830, 65831, 65832, 65833, 65834, 65835, 65836, 65837, 65838,
103       65839, 65840, 65841, 65842, 65843, 7412, 8432, 12348, 12349, 43310,
104       7376, 7378, 5941, 5942, 2406, 2407, 2408, 2409, 2410, 2411,
105       2412, 2413, 2414, 2415, 12291, 12307, 12316, 12317, 12318, 12319,
106       12336, 12343, 65093, 65094, 1548, 1563, 12289, 12290, 12296, 12297,
107       12298, 12299, 12300, 12301, 12302, 12303, 12304, 12305, 12308, 12309,
108       12310, 12311, 12312, 12313, 12314, 12315, 12539, 65377, 65378, 65379,
109       65380, 65381, 7386, 1567, 7410, 1600, 43062, 43063, 43064, 43065,
110       2386, 2385, 43059, 43060, 43061, 43056, 43057, 43058, 2404, 2405
111     };
112 
handleError(ErrorCode & status,const char * context)113 void handleError(ErrorCode& status, const char* context) {
114     if (status.isFailure()) {
115         std::cerr << "Error: " << context << ": " << status.errorName() << std::endl;
116         exit(status.reset());
117     }
118 }
119 
120 class PropertyValueNameGetter : public ValueNameGetter {
121 public:
PropertyValueNameGetter(UProperty prop)122     PropertyValueNameGetter(UProperty prop) : property(prop) {}
123     ~PropertyValueNameGetter() override;
getName(uint32_t value)124     const char *getName(uint32_t value) override {
125         return u_getPropertyValueName(property, value, U_SHORT_PROPERTY_NAME);
126     }
127 
128 private:
129     UProperty property;
130 };
131 
~PropertyValueNameGetter()132 PropertyValueNameGetter::~PropertyValueNameGetter() {}
133 
134 // Dump an aliases = [...] key for properties with aliases
dumpPropertyAliases(UProperty uproperty,FILE * f)135 void dumpPropertyAliases(UProperty uproperty, FILE* f) {
136     int i = U_LONG_PROPERTY_NAME + 1;
137 
138     while(true) {
139         // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially,
140         // and returning null after that
141         const char* alias = u_getPropertyName(uproperty, (UPropertyNameChoice) i);
142         if (!alias) {
143             break;
144         }
145         if (i == U_LONG_PROPERTY_NAME + 1) {
146             fprintf(f, "aliases = [\"%s\"", alias);
147         } else {
148             fprintf(f, ", \"%s\"", alias);
149         }
150         i++;
151     }
152     if (i != U_LONG_PROPERTY_NAME + 1) {
153         fprintf(f, "]\n");
154     }
155 }
156 
dumpBinaryProperty(UProperty uproperty,FILE * f)157 void dumpBinaryProperty(UProperty uproperty, FILE* f) {
158     IcuToolErrorCode status("icuexportdata: dumpBinaryProperty");
159     const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
160     const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
161     const USet* uset = u_getBinaryPropertySet(uproperty, status);
162     handleError(status, fullPropName);
163 
164     fputs("[[binary_property]]\n", f);
165     fprintf(f, "long_name = \"%s\"\n", fullPropName);
166     if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
167     fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
168     dumpPropertyAliases(uproperty, f);
169     usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML);
170 }
171 
172 // If the value exists, dump an indented entry of the format
173 // `"  {discr = <discriminant>, long = <longname>, short = <shortname>, aliases = [<aliases>]},"`
dumpValueEntry(UProperty uproperty,int v,bool is_mask,FILE * f)174 void dumpValueEntry(UProperty uproperty, int v, bool is_mask, FILE* f) {
175     const char* fullValueName = u_getPropertyValueName(uproperty, v, U_LONG_PROPERTY_NAME);
176     const char* shortValueName = u_getPropertyValueName(uproperty, v, U_SHORT_PROPERTY_NAME);
177     if (!fullValueName) {
178         return;
179     }
180     if (is_mask) {
181         fprintf(f, "  {discr = 0x%X", v);
182     } else {
183         fprintf(f, "  {discr = %i", v);
184     }
185     fprintf(f, ", long = \"%s\"", fullValueName);
186     if (shortValueName) {
187         fprintf(f, ", short = \"%s\"", shortValueName);
188     }
189     int i = U_LONG_PROPERTY_NAME + 1;
190     while(true) {
191         // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially,
192         // and returning null after that
193         const char* alias = u_getPropertyValueName(uproperty, v, (UPropertyNameChoice) i);
194         if (!alias) {
195             break;
196         }
197         if (i == U_LONG_PROPERTY_NAME + 1) {
198             fprintf(f, ", aliases = [\"%s\"", alias);
199         } else {
200             fprintf(f, ", \"%s\"", alias);
201         }
202         i++;
203     }
204     if (i != U_LONG_PROPERTY_NAME + 1) {
205         fprintf(f, "]");
206     }
207     fprintf(f, "},\n");
208 }
209 
dumpEnumeratedProperty(UProperty uproperty,FILE * f)210 void dumpEnumeratedProperty(UProperty uproperty, FILE* f) {
211     IcuToolErrorCode status("icuexportdata: dumpEnumeratedProperty");
212     const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
213     const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
214     const UCPMap* umap = u_getIntPropertyMap(uproperty, status);
215     handleError(status, fullPropName);
216 
217     fputs("[[enum_property]]\n", f);
218     fprintf(f, "long_name = \"%s\"\n", fullPropName);
219     if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
220     fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
221     dumpPropertyAliases(uproperty, f);
222 
223     int32_t minValue = u_getIntPropertyMinValue(uproperty);
224     U_ASSERT(minValue >= 0);
225     int32_t maxValue = u_getIntPropertyMaxValue(uproperty);
226     U_ASSERT(maxValue >= 0);
227 
228     fprintf(f, "values = [\n");
229     for (int v = minValue; v <= maxValue; v++) {
230         dumpValueEntry(uproperty, v, false, f);
231     }
232     fprintf(f, "]\n");
233 
234     PropertyValueNameGetter valueNameGetter(uproperty);
235     usrc_writeUCPMap(f, umap, &valueNameGetter, UPRV_TARGET_SYNTAX_TOML);
236     fputs("\n", f);
237 
238 
239     UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
240     if (maxValue <= 0xff) {
241         width = UCPTRIE_VALUE_BITS_8;
242     } else if (maxValue <= 0xffff) {
243         width = UCPTRIE_VALUE_BITS_16;
244     }
245     LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(umap, status));
246     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
247         builder.getAlias(),
248         trieType,
249         width,
250         status));
251     handleError(status, fullPropName);
252 
253     fputs("[enum_property.code_point_trie]\n", f);
254     usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
255 }
256 
257 /*
258 * Export Bidi_Mirroring_Glyph values (code points) in a similar way to how enumerated
259 * properties are dumped to file.
260 * Note: the data will store 0 for code points without a value defined for
261 * Bidi_Mirroring_Glyph.
262 */
dumpBidiMirroringGlyph(FILE * f)263 void dumpBidiMirroringGlyph(FILE* f) {
264     UProperty uproperty = UCHAR_BIDI_MIRRORING_GLYPH;
265     IcuToolErrorCode status("icuexportdata: dumpBidiMirroringGlyph");
266     const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
267     const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
268     handleError(status, fullPropName);
269 
270     // Store 21-bit code point as is
271     UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
272 
273     // note: unlike dumpEnumeratedProperty, which can get inversion map data using
274     // u_getIntPropertyMap(uproperty), the only reliable way to get Bidi_Mirroring_Glyph
275     // is to use u_charMirror(cp) over the code point space.
276     LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
277     for(UChar32 c = UCHAR_MIN_VALUE; c <= UCHAR_MAX_VALUE; c++) {
278         UChar32 mirroringGlyph = u_charMirror(c);
279         // The trie builder code throws an error when it cannot compress the data sufficiently.
280         // Therefore, when the value is undefined for a code point, keep a 0 in the trie
281         // instead of the ICU API behavior of returning the code point value. Using 0
282         // results in a relatively significant space savings by not including redundant data.
283         if (c != mirroringGlyph) {
284             umutablecptrie_set(builder.getAlias(), c, mirroringGlyph, status);
285         }
286     }
287 
288     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
289         builder.getAlias(),
290         trieType,
291         width,
292         status));
293     handleError(status, fullPropName);
294 
295     // currently a trie and inversion map are the same (as relied upon in characterproperties.cpp)
296     const UCPMap* umap = reinterpret_cast<UCPMap *>(utrie.getAlias());
297 
298     fputs("[[enum_property]]\n", f);
299     fprintf(f, "long_name = \"%s\"\n", fullPropName);
300     if (shortPropName) {
301         fprintf(f, "short_name = \"%s\"\n", shortPropName);
302     }
303     fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
304     dumpPropertyAliases(uproperty, f);
305 
306     usrc_writeUCPMap(f, umap, nullptr, UPRV_TARGET_SYNTAX_TOML);
307     fputs("\n", f);
308 
309     fputs("[enum_property.code_point_trie]\n", f);
310     usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
311 }
312 
313 // After printing property value `v`, print `mask` if and only if `mask` comes immediately
314 // after the property in the listing
maybeDumpMaskValue(UProperty uproperty,uint32_t v,uint32_t mask,FILE * f)315 void maybeDumpMaskValue(UProperty uproperty, uint32_t v, uint32_t mask, FILE* f) {
316     if (U_MASK(v) < mask && U_MASK(v + 1) > mask)
317         dumpValueEntry(uproperty, mask, true, f);
318 }
319 
dumpGeneralCategoryMask(FILE * f)320 void dumpGeneralCategoryMask(FILE* f) {
321     IcuToolErrorCode status("icuexportdata: dumpGeneralCategoryMask");
322     UProperty uproperty = UCHAR_GENERAL_CATEGORY_MASK;
323 
324     fputs("[[mask_property]]\n", f);
325     const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
326     const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
327     fprintf(f, "long_name = \"%s\"\n", fullPropName);
328     if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
329     fprintf(f, "uproperty_discr = 0x%X\n", uproperty);
330     dumpPropertyAliases(uproperty, f);
331 
332 
333     fprintf(f, "mask_for = \"General_Category\"\n");
334     uint32_t minValue = u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY);
335     U_ASSERT(minValue >= 0);
336     uint32_t maxValue = u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY);
337     U_ASSERT(maxValue >= 0);
338 
339     fprintf(f, "values = [\n");
340     for (uint32_t v = minValue; v <= maxValue; v++) {
341         dumpValueEntry(uproperty, U_MASK(v), true, f);
342 
343         // We want to dump these masks "in order", which means they
344         // should come immediately after every property they contain
345         maybeDumpMaskValue(uproperty, v, U_GC_L_MASK, f);
346         maybeDumpMaskValue(uproperty, v, U_GC_LC_MASK, f);
347         maybeDumpMaskValue(uproperty, v, U_GC_M_MASK, f);
348         maybeDumpMaskValue(uproperty, v, U_GC_N_MASK, f);
349         maybeDumpMaskValue(uproperty, v, U_GC_Z_MASK, f);
350         maybeDumpMaskValue(uproperty, v, U_GC_C_MASK, f);
351         maybeDumpMaskValue(uproperty, v, U_GC_P_MASK, f);
352         maybeDumpMaskValue(uproperty, v, U_GC_S_MASK, f);
353     }
354     fprintf(f, "]\n");
355 }
356 
dumpScriptExtensions(FILE * f)357 void dumpScriptExtensions(FILE* f) {
358     IcuToolErrorCode status("icuexportdata: dumpScriptExtensions");
359 
360     fputs("[[script_extensions]]\n", f);
361     const char* scxFullPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_LONG_PROPERTY_NAME);
362     const char* scxShortPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_SHORT_PROPERTY_NAME);
363     fprintf(f, "long_name = \"%s\"\n", scxFullPropName);
364     if (scxShortPropName) fprintf(f, "short_name = \"%s\"\n", scxShortPropName);
365     fprintf(f, "uproperty_discr = 0x%X\n", UCHAR_SCRIPT_EXTENSIONS);
366     dumpPropertyAliases(UCHAR_SCRIPT_EXTENSIONS, f);
367 
368     // We want to use 16 bits for our exported trie of sc/scx data because we
369     // need 12 bits to match the 12 bits of data stored for sc/scx in the trie
370     // in the uprops.icu data file.
371     UCPTrieValueWidth scWidth = UCPTRIE_VALUE_BITS_16;
372 
373     // Create a mutable UCPTrie builder populated with Script property values data.
374     const UCPMap* scInvMap = u_getIntPropertyMap(UCHAR_SCRIPT, status);
375     handleError(status, scxFullPropName);
376     LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(scInvMap, status));
377     handleError(status, scxFullPropName);
378 
379     // The values for the output scx companion array.
380     // Invariant is that all subvectors are distinct.
381     std::vector< std::vector<uint16_t> > outputDedupVec;
382 
383     // The sc/scx companion array is an array of arrays (of script codes)
384     fputs("script_code_array = [\n", f);
385     for(const UChar32 cp : scxCodePoints) {
386         // Get the Script value
387         uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp);
388         // Get the Script_Extensions value (array of Script codes)
389         const int32_t SCX_ARRAY_CAPACITY = 32;
390         UScriptCode scxValArray[SCX_ARRAY_CAPACITY];
391         int32_t numScripts = uscript_getScriptExtensions(cp, scxValArray, SCX_ARRAY_CAPACITY, status);
392         handleError(status, scxFullPropName);
393 
394         // Convert the scx array into a vector
395         std::vector<uint16_t> scxValVec;
396         for(int i = 0; i < numScripts; i++) {
397             scxValVec.push_back(scxValArray[i]);
398         }
399         // Ensure that it is sorted
400         std::sort(scxValVec.begin(), scxValVec.end());
401         // Copy the Script value into the first position of the scx array only
402         // if we have the "other" case (Script value is not Common nor Inherited).
403         // This offers faster access when users want only the Script value.
404         if (scVal != USCRIPT_COMMON && scVal != USCRIPT_INHERITED) {
405             scxValVec.insert(scxValVec.begin(), scVal);
406         }
407 
408         // See if there is already an scx value array matching the newly built one.
409         // If there is, then use its index.
410         // If not, then append the new value array.
411         bool isScxValUnique = true;
412         size_t outputIndex = 0;
413         for (outputIndex = 0; outputIndex < outputDedupVec.size(); outputIndex++) {
414             if (outputDedupVec[outputIndex] == scxValVec) {
415                 isScxValUnique = false;
416                 break;
417             }
418         }
419 
420         if (isScxValUnique) {
421             outputDedupVec.push_back(scxValVec);
422             usrc_writeArray(f, "  [", scxValVec.data(), 16, scxValVec.size(), "    ", "],\n");
423         }
424 
425         // We must update the value in the UCPTrie for the code point to contain:
426         // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is
427         //   the index into the companion array
428         // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether
429         //   3: other
430         //   2: Script=Inherited
431         //   1: Script=Common
432         //   0: Script=value in 9..0 (N/A because we are in this loop to create the companion array for non-0 cases)
433         uint16_t mask = 0;
434         if (scVal == USCRIPT_COMMON) {
435             mask = DATAEXPORT_SCRIPT_X_WITH_COMMON;
436         } else if (scVal == USCRIPT_INHERITED) {
437             mask = DATAEXPORT_SCRIPT_X_WITH_INHERITED;
438         } else {
439             mask = DATAEXPORT_SCRIPT_X_WITH_OTHER;
440         }
441 
442         // The new trie value is the index into the new array with the high order bits set
443         uint32_t newScVal = outputIndex | mask;
444 
445         // Update the code point in the mutable trie builder with the trie value
446         umutablecptrie_set(builder.getAlias(), cp, newScVal, status);
447         handleError(status, scxFullPropName);
448     }
449     fputs("]\n\n", f);  // Print the TOML close delimiter for the outer array.
450 
451     // Convert from mutable trie builder to immutable trie.
452     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
453         builder.getAlias(),
454         trieType,
455         scWidth,
456         status));
457     handleError(status, scxFullPropName);
458 
459     fputs("[script_extensions.code_point_trie]\n", f);
460     usrc_writeUCPTrie(f, scxShortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
461 }
462 
prepareOutputFile(const char * basename)463 FILE* prepareOutputFile(const char* basename) {
464     IcuToolErrorCode status("icuexportdata");
465     CharString outFileName;
466     if (destdir != nullptr && *destdir != 0) {
467         outFileName.append(destdir, status).ensureEndsWithFileSeparator(status);
468     }
469     outFileName.append(basename, status);
470     outFileName.append(".toml", status);
471     handleError(status, basename);
472 
473     FILE* f = fopen(outFileName.data(), "w");
474     if (f == nullptr) {
475         std::cerr << "Unable to open file: " << outFileName.data() << std::endl;
476         exit(U_FILE_ACCESS_ERROR);
477     }
478     if (!QUIET) {
479         std::cout << "Writing to: " << outFileName.data() << std::endl;
480     }
481 
482     if (haveCopyright) {
483         usrc_writeCopyrightHeader(f, "#", 2021);
484     }
485     usrc_writeFileNameGeneratedBy(f, "#", basename, "icuexportdata.cpp");
486 
487     return f;
488 }
489 
490 #if !UCONFIG_NO_NORMALIZATION
491 
// One trie entry queued for later insertion; writeDecompositionData() consumes a
// vector of these when it fills its code point trie.
492 struct PendingDescriptor {
493     UChar32 scalar;        // code point used as the trie key
494     uint32_t descriptor;   // trie value before writeDecompositionData() adjusts its offset and swaps its 16-bit halves
495     UBool supplementary;   // selects the offset adjustment: false takes the 16-bit-data branch, true the 32-bit-data branch
496 };
497 
writeCanonicalCompositions(USet * backwardCombiningStarters)498 void writeCanonicalCompositions(USet* backwardCombiningStarters) {
499     IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions");
500     const char* basename = "compositions";
501     FILE* f = prepareOutputFile(basename);
502 
503     LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status);
504 
505     const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
506     UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
507 
508     const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
509     for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
510         if (c >= 0xD800 && c < 0xE000) {
511             // Surrogate
512             continue;
513         }
514         UnicodeString decomposition;
515         if (!nfc->getRawDecomposition(c, decomposition)) {
516             continue;
517         }
518         int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
519         if (len != 2) {
520             continue;
521         }
522         UChar32 starter = utf32[0];
523         UChar32 second = utf32[1];
524         UChar32 composite = nfc->composePair(starter, second);
525         if (composite < 0) {
526             continue;
527         }
528         if (c != composite) {
529             status.set(U_INTERNAL_PROGRAM_ERROR);
530             handleError(status, basename);
531         }
532         if (!u_getCombiningClass(second)) {
533             uset_add(backwardCombiningStarters, second);
534         }
535         if (composite >= 0xAC00 && composite <= 0xD7A3) {
536             // Hangul syllable
537             continue;
538         }
539 
540         UnicodeString backward;
541         backward.append(second);
542         backward.append(starter);
543         backwardBuilder->add(backward, int32_t(composite), status);
544     }
545     UnicodeString canonicalCompositionTrie;
546     backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status);
547 
548     usrc_writeArray(f, "compositions = [\n  ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), "  ", "\n]\n");
549     fclose(f);
550     handleError(status, basename);
551 }
552 
writeDecompositionTables(const char * basename,const uint16_t * ptr16,size_t len16,const uint32_t * ptr32,size_t len32)553 void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) {
554     FILE* f = prepareOutputFile(basename);
555     usrc_writeArray(f, "scalars16 = [\n  ", ptr16, 16, len16, "  ", "\n]\n");
556     usrc_writeArray(f, "scalars32 = [\n  ", ptr32, 32, len32, "  ", "\n]\n");
557     fclose(f);
558 }
559 
writeDecompositionData(const char * basename,uint32_t baseSize16,uint32_t baseSize32,uint32_t supplementSize16,USet * uset,USet * reference,const std::vector<PendingDescriptor> & pendingTrieInsertions,char16_t passthroughCap)560 void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, char16_t passthroughCap) {
561     IcuToolErrorCode status("icuexportdata: writeDecompositionData");
562     FILE* f = prepareOutputFile(basename);
563 
564     // Zero is a magic number that means the character decomposes to itself.
565     LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
566 
567     // Iterate backwards to insert lower code points in the trie first in case it matters
568     // for trie block allocation.
569     for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
570         const PendingDescriptor& pending = pendingTrieInsertions[i];
571         uint32_t additional = 0;
572         if (!(pending.descriptor & 0xFFFE0000)) {
573             uint32_t offset = pending.descriptor & 0xFFF;
574             if (!pending.supplementary) {
575                 if (offset >= baseSize16) {
576                     // This is a offset to supplementary 16-bit data. We have
577                     // 16-bit base data and 32-bit base data before. However,
578                     // the 16-bit base data length is already part of offset.
579                     additional = baseSize32;
580                 }
581             } else {
582                 if (offset >= baseSize32) {
583                     // This is an offset to supplementary 32-bit data. We have 16-bit
584                     // base data, 32-bit base data, and 16-bit supplementary data before.
585                     // However, the 32-bit base data length is already part
586                     // of offset.
587                     additional = baseSize16 + supplementSize16;
588                 } else {
589                     // This is an offset to 32-bit base data. We have 16-bit
590                     // base data before.
591                     additional = baseSize16;
592                 }
593             }
594             if (offset + additional > 0xFFF) {
595                 status.set(U_INTERNAL_PROGRAM_ERROR);
596                 handleError(status, basename);
597             }
598         }
599         // It turns out it's better to swap the halves compared to the initial
600         // idea in order to put special marker values close to zero so that
601         // an important marker value becomes 1, so it's efficient to compare
602         // "1 or 0". Unfortunately, going through all the code to swap
603         // things is too error prone, so let's do the swapping here in one
604         // place.
605         uint32_t oldTrieValue = pending.descriptor + additional;
606         uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16);
607         umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status);
608     }
609     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
610         builder.getAlias(),
611         trieType,
612         UCPTRIE_VALUE_BITS_32,
613         status));
614     handleError(status, basename);
615 
616     if (reference) {
617         if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
618             // NFD expectations don't hold. The set must not contain the half-width
619             // kana voicing marks and must contain iota subscript.
620             status.set(U_INTERNAL_PROGRAM_ERROR);
621             handleError(status, basename);
622         }
623 
624         USet* halfWidthVoicing = uset_openEmpty();
625         uset_add(halfWidthVoicing, 0xFF9E);
626         uset_add(halfWidthVoicing, 0xFF9F);
627 
628         USet* iotaSubscript = uset_openEmpty();
629         uset_add(iotaSubscript, 0x0345);
630 
631         uint8_t flags = 0;
632 
633         USet* halfWidthCheck = uset_cloneAsThawed(uset);
634         uset_removeAll(halfWidthCheck, reference);
635         if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
636             flags |= 1;
637         } else if (!uset_isEmpty(halfWidthCheck)) {
638             // The result was neither empty nor contained exactly
639             // the two half-width voicing marks. The ICU4X
640             // normalizer doesn't know how to deal with this case.
641             status.set(U_INTERNAL_PROGRAM_ERROR);
642             handleError(status, basename);
643         }
644         uset_close(halfWidthCheck);
645 
646         USet* iotaCheck = uset_cloneAsThawed(reference);
647         uset_removeAll(iotaCheck, uset);
648         if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) {
649             // The result was neither empty nor contained exactly
650             // the iota subscript. The ICU4X normalizer doesn't
651             // know how to deal with this case.
652             status.set(U_INTERNAL_PROGRAM_ERROR);
653             handleError(status, basename);
654         }
655 
656         uset_close(iotaSubscript);
657         uset_close(halfWidthVoicing);
658 
659         fprintf(f, "flags = 0x%X\n", flags);
660         fprintf(f, "cap = 0x%X\n", passthroughCap);
661     }
662     fprintf(f, "[trie]\n");
663     usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
664     fclose(f);
665     handleError(status, basename);
666 }
667 
// Marker values; their uses lie outside the visible part of this file.
668 // Special marker for the NFKD form of U+FDFA
669 const int32_t FDFA_MARKER = 3;
670 
671 // Special marker for characters whose decomposition starts with a non-starter
672 // and the decomposition isn't the character itself.
673 const int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2;
674 
675 // Special marker for starters that decompose to themselves but that may
676 // combine backwards under canonical composition
677 const int32_t BACKWARD_COMBINING_STARTER_MARKER = 1;
678 
679 // Marker that a complex decomposition isn't round-trippable
680 // under re-composition.
// Note: same numeric value as BACKWARD_COMBINING_STARTER_MARKER but declared unsigned.
681 const uint32_t NON_ROUND_TRIP_MARKER = 1;
682 
permissibleBmpPair(UBool knownToRoundTrip,UChar32 c,UChar32 second)683 UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) {
684     if (knownToRoundTrip) {
685         return true;
686     }
687     // Nuktas, Hebrew presentation forms and polytonic Greek with oxia
688     // are special-cased in ICU4X.
689     if (c >= 0xFB1D && c <= 0xFB4E) {
690         // Hebrew presentation forms
691         return true;
692     }
693     if (c >= 0x1F71 && c <= 0x1FFB) {
694         // Polytonic Greek with oxia
695         return true;
696     }
697     if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) {
698         // Nukta
699         return true;
700     }
701     // To avoid more branchiness, 4 characters that decompose to
702     // a BMP starter followed by a BMP non-starter are excluded
703     // from being encoded directly into the trie value and are
704     // handled as complex decompositions instead. These are:
705     // U+0F76 TIBETAN VOWEL SIGN VOCALIC R
706     // U+0F78 TIBETAN VOWEL SIGN VOCALIC L
707     // U+212B ANGSTROM SIGN
708     // U+2ADC FORKING
709     return false;
710 }
711 
712 
713 // Find the slice `needle` within `storage` and return its index, failing which,
714 // append all elements of `needle` to `storage` and return the index of it at the end.
715 template<typename T>
findOrAppend(std::vector<T> & storage,const UChar32 * needle,size_t needleLen)716 size_t findOrAppend(std::vector<T>& storage, const UChar32* needle, size_t needleLen) {
717     // Last index where we might find the start of the complete needle.
718     // bounds check is `i + needleLen <= storage.size()` since the inner
719     // loop will range from `i` to `i + needleLen - 1` (the `-1` is why we use `<=`)
720     for (size_t i = 0; i + needleLen <= storage.size(); i++) {
721         for (size_t j = 0;; j++) {
722             if (j == needleLen) {
723                 return i;  // found a match
724             }
725             if (storage[i + j] != uint32_t(needle[j])) {
726                 break;
727             }
728         }
729     }
730     // We didn't find anything. Append, keeping the append index in mind.
731     size_t index = storage.size();
732     for(size_t i = 0; i < needleLen; i++) {
733         storage.push_back(T(needle[i]));
734     }
735 
736     return index;
737 }
738 
739 
// Computes data for canonical decompositions
//
// `basename` selects the normalization form: "nfkd" uses the NFKD
// normalizer, "uts46d" uses the "uts46" normalizer followed by NFD, and
// any other value is treated as NFD. Only in the NFD case an additional
// "decompositionex" output file is written; it holds the non-recursive
// (raw) decompositions as a `scalars32` array plus a trie.
//
// Every scalar value except Hangul syllables is visited from U+10FFFF
// downwards. For each character `c` that needs an entry, a
// PendingDescriptor is appended to `pendingTrieInsertions`.
// Decompositions that do not fit into the trie value are stored via
// findOrAppend() in `storage16` (no supplementary scalars) or `storage32`
// (at least one supplementary scalar).
//
// Other outputs:
// - `decompositionStartsWithNonStarter` receives each `c` whose normalized
//   form starts with a character of non-zero canonical combining class.
// - `decompositionStartsWithBackwardCombiningStarter` receives each `c`
//   whose normalized form starts with a starter that is a member of
//   `backwardCombiningStarters`.
// - `decompositionPassthroughBound` and `compositionPassthroughBound` are
//   overwritten with `c` whenever `c` is found to need handling; because
//   the iteration runs downwards, the last value written is the lowest
//   such code point.
//
// Violated expectations are reported by setting U_INTERNAL_PROGRAM_ERROR
// and calling handleError().
void computeDecompositions(const char* basename,
                           const USet* backwardCombiningStarters,
                           std::vector<uint16_t>& storage16,
                           std::vector<uint32_t>& storage32,
                           USet* decompositionStartsWithNonStarter,
                           USet* decompositionStartsWithBackwardCombiningStarter,
                           std::vector<PendingDescriptor>& pendingTrieInsertions,
                           UChar32& decompositionPassthroughBound,
                           UChar32& compositionPassthroughBound) {
    IcuToolErrorCode status("icuexportdata: computeDecompositions");
    const Normalizer2* mainNormalizer;
    const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status);
    const Normalizer2* nfcNormalizer = Normalizer2::getNFCInstance(status);
    // Output file for the non-recursive data; stays null unless building NFD.
    FILE* f = nullptr;
    // Raw two-scalar decompositions that cannot be packed into a trie value.
    std::vector<uint32_t> nonRecursive32;
    LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status));

    if (uprv_strcmp(basename, "nfkd") == 0) {
        mainNormalizer = Normalizer2::getNFKDInstance(status);
    } else if (uprv_strcmp(basename, "uts46d") == 0) {
        mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status);
    } else {
        mainNormalizer = nfdNormalizer;
        f = prepareOutputFile("decompositionex");
    }

    // Max length as of Unicode 14 is 4 for NFD. For NFKD the max
    // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB).
    const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9;
    const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8;
    const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
    UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
    const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2;
    UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE];

    // Iterate over all scalar values excluding Hangul syllables.
    //
    // We go backwards in order to better find overlapping decompositions.
    //
    // As of Unicode 14:
    // Iterate forward without overlap search:
    // nfd: 16 size: 896, 32 size: 173
    // nfkd: 16 size: 3854, 32 size: 179
    //
    // Iterate forward with overlap search:
    // nfd: 16 size: 888, 32 size: 173
    // nfkd: 16 size: 3266, 32 size: 179
    //
    // Iterate backward with overlap search:
    // nfd: 16 size: 776, 32 size: 173
    // nfkd: 16 size: 2941, 32 size: 179
    //
    // UChar32 is signed!
    for (UChar32 c = 0x10FFFF; c >= 0; --c) {
        if (c >= 0xAC00 && c <= 0xD7A3) {
            // Hangul syllable
            continue;
        }
        if (c >= 0xD800 && c < 0xE000) {
            // Surrogate
            continue;
        }
        UnicodeString src;
        UnicodeString dst;
        // True if we're building non-NFD or we're building NFD but
        // the `c` round trips to NFC.
        // False if we're building NFD and `c` does not round trip to NFC.
        UBool nonNfdOrRoundTrips = true;
        src.append(c);
        if (mainNormalizer != nfdNormalizer) {
            // Non-NFD forms: apply the main normalizer first and then
            // NFD to its result.
            UnicodeString inter;
            mainNormalizer->normalize(src, inter, status);
            nfdNormalizer->normalize(inter, dst, status);
        } else {
            nfdNormalizer->normalize(src, dst, status);
            UnicodeString nfc;
            nfcNormalizer->normalize(dst, nfc, status);
            nonNfdOrRoundTrips = (src == nfc);
        }
        int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
        if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
            // Characters that normalize to nothing or to U+FFFD (without the
            // input being U+FFFD) in ICU4C's UTS 46 normalization normalize
            // as in NFD in ICU4X's UTS 46 normalization in the interest
            // of data size and ICU4X's normalizer being unable to handle
            // normalizing to nothing.
            // When UTS 46 is implemented on top of ICU4X, a preprocessing
            // step is supposed to remove these characters before the
            // normalization step.
            if (uprv_strcmp(basename, "uts46d") != 0) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            nfdNormalizer->normalize(src, dst, status);
            len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
            if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
        }
        if (len > DECOMPOSITION_BUFFER_SIZE) {
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]);
        bool specialNonStarterDecomposition = false;
        bool startsWithBackwardCombiningStarter = false;
        if (firstCombiningClass) {
            decompositionPassthroughBound = c;
            compositionPassthroughBound = c;
            uset_add(decompositionStartsWithNonStarter, c);
            if (src != dst) {
                if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) {
                    specialNonStarterDecomposition = true;
                } else {
                    // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }
            }
        } else if (uset_contains(backwardCombiningStarters, utf32[0])) {
            compositionPassthroughBound = c;
            startsWithBackwardCombiningStarter = true;
            uset_add(decompositionStartsWithBackwardCombiningStarter, c);
        }
        // The marker values are stored where a single-scalar decomposition
        // would go, so no character other than the marker's own code point
        // may decompose to exactly a marker value.
        if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) {
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) {
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) {
            status.set(U_INTERNAL_PROGRAM_ERROR);
            handleError(status, basename);
        }
        if (mainNormalizer != nfdNormalizer) {
            // Non-NFD forms only get an entry for characters whose result
            // differs from plain NFD.
            UnicodeString nfd;
            nfdNormalizer->normalize(src, nfd, status);
            if (dst == nfd) {
                continue;
            }
            decompositionPassthroughBound = c;
            compositionPassthroughBound = c;
        } else if (firstCombiningClass) {
            // NFD, decomposition starts with a non-starter: replace the
            // decomposition by a single placeholder scalar.
            len = 1;
            if (specialNonStarterDecomposition) {
                utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value
            } else {
                // Use the surrogate range to store the canonical combining class
                utf32[0] = 0xD800 | UChar32(firstCombiningClass);
            }
        } else {
            // NFD, decomposition starts with a starter.
            if (src == dst) {
                if (startsWithBackwardCombiningStarter) {
                    pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, false});
                }
                continue;
            }
            decompositionPassthroughBound = c;
            // ICU4X hard-codes ANGSTROM SIGN
            if (c != 0x212B) {
                UnicodeString raw;
                if (!nfdNormalizer->getRawDecomposition(c, raw)) {
                    // We're always supposed to have a non-recursive decomposition
                    // if we had a recursive one.
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }
                // In addition to actual difference, put the whole range that contains characters
                // with oxia into the non-recursive trie in order to catch cases where characters
                // with oxia have singleton decompositions to corresponding characters with tonos.
                // This way, the run-time decision to fall through can be done on the range
                // without checking for individual characters inside the range.
                if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) {
                    int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status);
                    if (!rawLen) {
                        status.set(U_INTERNAL_PROGRAM_ERROR);
                        handleError(status, basename);
                    }
                    if (rawLen == 1) {
                        // Singleton raw decomposition: the scalar itself is
                        // the trie value.
                        if (c >= 0xFFFF) {
                            status.set(U_INTERNAL_PROGRAM_ERROR);
                            handleError(status, basename);
                        }
                        umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, uint32_t(rawUtf32[0]), status);
                    } else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) {
                        if (!rawUtf32[0] || !rawUtf32[1]) {
                            status.set(U_INTERNAL_PROGRAM_ERROR);
                            handleError(status, basename);
                        }
                        // Swapped for consistency with the primary trie
                        uint32_t bmpPair = uint32_t(rawUtf32[1]) << 16 | uint32_t(rawUtf32[0]);
                        umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status);
                    } else {
                        // Let's add 1 to index to make it always non-zero to distinguish
                        // it from the default zero.
                        uint32_t index = nonRecursive32.size() + 1;
                        nonRecursive32.push_back(uint32_t(rawUtf32[0]));
                        nonRecursive32.push_back(uint32_t(rawUtf32[1]));
                        if (index > 0xFFFF) {
                            status.set(U_INTERNAL_PROGRAM_ERROR);
                            handleError(status, basename);
                        }
                        umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status);
                    }
                }
            }
        }
        if (!nonNfdOrRoundTrips) {
            compositionPassthroughBound = c;
        }
        if (len == 1 && utf32[0] <= 0xFFFF) {
            // Single BMP scalar (or placeholder): stored in the upper
            // 16 bits of the trie value.
            if (startsWithBackwardCombiningStarter) {
                if (mainNormalizer == nfdNormalizer) {
                    // Not supposed to happen in NFD
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                } else if (!((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) {
                    // Other than conjoining jamo vowels and trails
                    // unsupported for non-NFD.
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }
            }
            pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, false});
        } else if (len == 2 &&
                   utf32[0] <= 0xFFFF &&
                   utf32[1] <= 0xFFFF &&
                   !u_getCombiningClass(utf32[0]) &&
                   u_getCombiningClass(utf32[1]) &&
                   permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) {
            // BMP starter followed by BMP non-starter: both scalars are
            // packed into the trie value (starter in the upper half).
            for (int32_t i = 0; i < len; ++i) {
                if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
                    // Assert that iota subscript and half-width voicing marks never occur in these
                    // expansions in the normalization forms where they are special.
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }
            }
            if (startsWithBackwardCombiningStarter) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), false});
        } else {
            if (startsWithBackwardCombiningStarter) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }

            // Scan the decomposition to decide which storage it goes to
            // and whether any scalar after the first is a starter.
            UBool supplementary = false;
            UBool nonInitialStarter = false;
            for (int32_t i = 0; i < len; ++i) {
                if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
                    // Assert that iota subscript and half-width voicing marks never occur in these
                    // expansions in the normalization forms where they are special.
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }

                if (utf32[i] > 0xFFFF) {
                    supplementary = true;
                }
                if (utf32[i] == 0) {
                    status.set(U_INTERNAL_PROGRAM_ERROR);
                    handleError(status, basename);
                }
                if (i != 0 && !u_getCombiningClass(utf32[i])) {
                    nonInitialStarter = true;
                }
            }
            if (!supplementary) {
                if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
                    if (len == 18 && c == 0xFDFA) {
                        // Special marker for the one character whose decomposition
                        // is too long.
                        pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary});
                        continue;
                    } else {
                        status.set(U_INTERNAL_PROGRAM_ERROR);
                        handleError(status, basename);
                    }
                }
            } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            // Complex decomposition
            // Format for 16-bit value:
            // 15..13: length minus two for 16-bit case and length minus one for
            //         the 32-bit case. Length 8 needs to fit in three bits in
            //         the 16-bit case, and this way the value is future-proofed
            //         up to 9 in the 16-bit case. Zero is unused and length one
            //         in the 16-bit case goes directly into the trie.
            //     12: 1 if all trailing characters are guaranteed non-starters,
            //         0 if no guarantees about non-starterness.
            //         Note: The bit choice is this way around to allow for
            //         dynamically falling back to not having this but instead
            //         having one more bit for length by merely choosing
            //         different masks.
            //  11..0: Start offset in storage. The offset is to the logical
            //         sequence of scalars16, scalars32, supplementary_scalars16,
            //         supplementary_scalars32.
            uint32_t descriptor = uint32_t(!nonInitialStarter) << 12;
            if (!supplementary) {
                descriptor |= (uint32_t(len) - 2) << 13;
            } else {
                descriptor |= (uint32_t(len) - 1) << 13;
            }
            // The offset bits must still be clear before the index is merged in.
            if (descriptor & 0xFFF) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            size_t index = 0;
            if (!supplementary) {
                index = findOrAppend(storage16, utf32, len);
            } else {
                index = findOrAppend(storage32, utf32, len);
            }
            // The index has to fit into the 12 offset bits.
            if (index > 0xFFF) {
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            descriptor |= uint32_t(index);
            if (!descriptor || descriptor > 0xFFFF) {
                // > 0xFFFF should never happen if the code above is correct.
                // == 0 should not happen due to the nature of the data.
                status.set(U_INTERNAL_PROGRAM_ERROR);
                handleError(status, basename);
            }
            // The non-round-trip flag lives above the 16-bit descriptor.
            uint32_t nonRoundTripMarker = 0;
            if (!nonNfdOrRoundTrips) {
                nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16);
            }
            pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker, supplementary});
        }
    }
    // The combined storage must remain addressable with a 12-bit offset.
    // The error set here is reported by the handleError() calls below.
    if (storage16.size() + storage32.size() > 0xFFF) {
        status.set(U_INTERNAL_PROGRAM_ERROR);
    }
    if (f) {
        // NFD only: write the non-recursive decomposition data.
        usrc_writeArray(f, "scalars32 = [\n  ", nonRecursive32.data(), 32, nonRecursive32.size(), "  ", "\n]\n");

        LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
            nonRecursiveBuilder.getAlias(),
            trieType,
            UCPTRIE_VALUE_BITS_32,
            status));
        handleError(status, basename);

        fprintf(f, "[trie]\n");
        usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);

        fclose(f);
    }
    handleError(status, basename);
}
1100 
1101 #endif // !UCONFIG_NO_NORMALIZATION
1102 
1103 enum {
1104     OPT_HELP_H,
1105     OPT_HELP_QUESTION_MARK,
1106     OPT_MODE,
1107     OPT_TRIE_TYPE,
1108     OPT_VERSION,
1109     OPT_DESTDIR,
1110     OPT_ALL,
1111     OPT_INDEX,
1112     OPT_COPYRIGHT,
1113     OPT_VERBOSE,
1114     OPT_QUIET,
1115 
1116     OPT_COUNT
1117 };
1118 
1119 #define UOPTION_MODE UOPTION_DEF("mode", 'm', UOPT_REQUIRES_ARG)
1120 #define UOPTION_TRIE_TYPE UOPTION_DEF("trie-type", '\1', UOPT_REQUIRES_ARG)
1121 #define UOPTION_ALL UOPTION_DEF("all", '\1', UOPT_NO_ARG)
1122 #define UOPTION_INDEX UOPTION_DEF("index", '\1', UOPT_NO_ARG)
1123 
1124 static UOption options[]={
1125     UOPTION_HELP_H,
1126     UOPTION_HELP_QUESTION_MARK,
1127     UOPTION_MODE,
1128     UOPTION_TRIE_TYPE,
1129     UOPTION_VERSION,
1130     UOPTION_DESTDIR,
1131     UOPTION_ALL,
1132     UOPTION_INDEX,
1133     UOPTION_COPYRIGHT,
1134     UOPTION_VERBOSE,
1135     UOPTION_QUIET,
1136 };
1137 
// Writes the general command-line usage summary to `stdfile`.
// `program` is the program name shown on the "usage:" line.
void printHelp(FILE* stdfile, const char* program) {
  // Only the first line depends on the program name.
  fprintf(stdfile, "usage: %s -m mode [-options] [--all | properties...]\n", program);

  // The description and option table are fixed text.
  fputs("\tdump Unicode property data to .toml files\n"
        "options:\n"
        "\t-h or -? or --help  this usage text\n"
        "\t-V or --version     show a version message\n"
        "\t-m or --mode        mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n"
        "\t      --trie-type   set the trie type (small or fast, default small)\n"
        "\t-d or --destdir     destination directory, followed by the path\n"
        "\t      --all         write out all properties known to icuexportdata\n"
        "\t      --index       write an _index.toml summarizing all data exported\n"
        "\t-c or --copyright   include a copyright notice\n"
        "\t-v or --verbose     Turn on verbose output\n"
        "\t-q or --quiet       do not display warnings and progress\n",
        stdfile);
}
1155 
exportUprops(int argc,char * argv[])1156 int exportUprops(int argc, char* argv[]) {
1157     // Load list of Unicode properties
1158     std::vector<const char*> propNames;
1159     for (int i=1; i<argc; i++) {
1160         propNames.push_back(argv[i]);
1161     }
1162     if (options[OPT_ALL].doesOccur) {
1163         int i = UCHAR_BINARY_START;
1164         while (true) {
1165             if (i == UCHAR_BINARY_LIMIT) {
1166                 i = UCHAR_INT_START;
1167             }
1168             if (i == UCHAR_INT_LIMIT) {
1169                 i = UCHAR_GENERAL_CATEGORY_MASK;
1170             }
1171             if (i == UCHAR_GENERAL_CATEGORY_MASK + 1) {
1172                 i = UCHAR_BIDI_MIRRORING_GLYPH;
1173             }
1174             if (i == UCHAR_BIDI_MIRRORING_GLYPH + 1) {
1175                 i = UCHAR_SCRIPT_EXTENSIONS;
1176             }
1177             if (i == UCHAR_SCRIPT_EXTENSIONS + 1) {
1178                 break;
1179             }
1180             UProperty uprop = static_cast<UProperty>(i);
1181             const char* propName = u_getPropertyName(uprop, U_SHORT_PROPERTY_NAME);
1182             if (propName == nullptr) {
1183                 propName = u_getPropertyName(uprop, U_LONG_PROPERTY_NAME);
1184                 if (propName != nullptr && VERBOSE) {
1185                     std::cerr << "Note: falling back to long name for: " << propName << std::endl;
1186                 }
1187             }
1188             if (propName != nullptr) {
1189                 propNames.push_back(propName);
1190             } else {
1191                 std::cerr << "Warning: Could not find name for: " << uprop << std::endl;
1192             }
1193             i++;
1194         }
1195     }
1196 
1197     if (propNames.empty()
1198             || options[OPT_HELP_H].doesOccur
1199             || options[OPT_HELP_QUESTION_MARK].doesOccur
1200             || !options[OPT_MODE].doesOccur) {
1201         FILE *stdfile=argc<0 ? stderr : stdout;
1202         fprintf(stdfile,
1203             "usage: %s -m uprops [-options] [--all | properties...]\n"
1204             "\tdump Unicode property data to .toml files\n"
1205             "options:\n"
1206             "\t-h or -? or --help  this usage text\n"
1207             "\t-V or --version     show a version message\n"
1208             "\t-m or --mode        mode: currently only 'uprops', but more may be added\n"
1209             "\t      --trie-type   set the trie type (small or fast, default small)\n"
1210             "\t-d or --destdir     destination directory, followed by the path\n"
1211             "\t      --all         write out all properties known to icuexportdata\n"
1212             "\t      --index       write an _index.toml summarizing all data exported\n"
1213             "\t-c or --copyright   include a copyright notice\n"
1214             "\t-v or --verbose     Turn on verbose output\n"
1215             "\t-q or --quiet       do not display warnings and progress\n",
1216             argv[0]);
1217         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1218     }
1219 
1220     const char* mode = options[OPT_MODE].value;
1221     if (uprv_strcmp(mode, "uprops") != 0) {
1222         fprintf(stderr, "Invalid option for --mode (must be uprops)\n");
1223         return U_ILLEGAL_ARGUMENT_ERROR;
1224     }
1225 
1226     if (options[OPT_TRIE_TYPE].doesOccur) {
1227         if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1228             trieType = UCPTRIE_TYPE_FAST;
1229         } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1230             trieType = UCPTRIE_TYPE_SMALL;
1231         } else {
1232             fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1233             return U_ILLEGAL_ARGUMENT_ERROR;
1234         }
1235     }
1236 
1237     for (const char* propName : propNames) {
1238         UProperty propEnum = u_getPropertyEnum(propName);
1239         if (propEnum == UCHAR_INVALID_CODE) {
1240             std::cerr << "Error: Invalid property alias: " << propName << std::endl;
1241             return U_ILLEGAL_ARGUMENT_ERROR;
1242         }
1243 
1244         FILE* f = prepareOutputFile(propName);
1245 
1246         UVersionInfo versionInfo;
1247         u_getUnicodeVersion(versionInfo);
1248         char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1249         u_versionToString(versionInfo, uvbuf);
1250         fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1251             U_ICU_VERSION,
1252             uvbuf);
1253 
1254         if (propEnum < UCHAR_BINARY_LIMIT) {
1255             dumpBinaryProperty(propEnum, f);
1256         } else if (UCHAR_INT_START <= propEnum && propEnum <= UCHAR_INT_LIMIT) {
1257             dumpEnumeratedProperty(propEnum, f);
1258         } else if (propEnum == UCHAR_GENERAL_CATEGORY_MASK) {
1259             dumpGeneralCategoryMask(f);
1260         } else if (propEnum == UCHAR_BIDI_MIRRORING_GLYPH) {
1261             dumpBidiMirroringGlyph(f);
1262         } else if (propEnum == UCHAR_SCRIPT_EXTENSIONS) {
1263             dumpScriptExtensions(f);
1264         } else {
1265             std::cerr << "Don't know how to write property: " << propEnum << std::endl;
1266             return U_INTERNAL_PROGRAM_ERROR;
1267         }
1268 
1269         fclose(f);
1270     }
1271 
1272     if (options[OPT_INDEX].doesOccur) {
1273         FILE* f = prepareOutputFile("_index");
1274         fprintf(f, "index = [\n");
1275         for (const char* propName : propNames) {
1276             // At this point, propName is a valid property name, so it should be alphanum ASCII
1277             fprintf(f, "  { filename=\"%s.toml\" },\n", propName);
1278         }
1279         fprintf(f, "]\n");
1280         fclose(f);
1281     }
1282 
1283     return 0;
1284 }
1285 
// Context object passed to addRangeToUCPTrie() through the callback's
// `context` pointer.
struct AddRangeHelper {
    // Mutable trie that receives the enumerated ranges. exportCase() fills
    // this with an alias (builder.getAlias()), not an owning pointer.
    UMutableCPTrie* ucptrie;
};
1289 
1290 static UBool U_CALLCONV
addRangeToUCPTrie(const void * context,UChar32 start,UChar32 end,uint32_t value)1291 addRangeToUCPTrie(const void* context, UChar32 start, UChar32 end, uint32_t value) {
1292     IcuToolErrorCode status("addRangeToUCPTrie");
1293     UMutableCPTrie* ucptrie = ((const AddRangeHelper*) context)->ucptrie;
1294     umutablecptrie_setRange(ucptrie, start, end, value, status);
1295     handleError(status, "setRange");
1296 
1297     return true;
1298 }
1299 
exportCase(int argc,char * argv[])1300 int exportCase(int argc, char* argv[]) {
1301     if (argc > 1) {
1302         fprintf(stderr, "ucase mode does not expect additional arguments\n");
1303         return U_ILLEGAL_ARGUMENT_ERROR;
1304     }
1305     (void) argv; // Suppress unused variable warning
1306 
1307     IcuToolErrorCode status("icuexportdata");
1308     LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
1309     handleError(status, "exportCase");
1310 
1311     int32_t exceptionsLength, unfoldLength;
1312     const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength);
1313     const UTrie2* caseTrie = &caseProps->trie;
1314 
1315     AddRangeHelper helper = { builder.getAlias() };
1316     utrie2_enum(caseTrie, nullptr, addRangeToUCPTrie, &helper);
1317 
1318     UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16;
1319     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1320         builder.getAlias(),
1321         trieType,
1322         width,
1323         status));
1324     handleError(status, "exportCase");
1325 
1326     FILE* f = prepareOutputFile("ucase");
1327 
1328     UVersionInfo versionInfo;
1329     u_getUnicodeVersion(versionInfo);
1330     char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1331     u_versionToString(versionInfo, uvbuf);
1332     fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1333             U_ICU_VERSION,
1334             uvbuf);
1335 
1336     fputs("[ucase.code_point_trie]\n", f);
1337     usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1338     fputs("\n", f);
1339 
1340     const char* indent = "  ";
1341     const char* suffix = "\n]\n";
1342 
1343     fputs("[ucase.exceptions]\n", f);
1344     const char* exceptionsPrefix = "exceptions = [\n  ";
1345     int32_t exceptionsWidth = 16;
1346     usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth,
1347                     exceptionsLength, indent, suffix);
1348     fputs("\n", f);
1349 
1350     fputs("[ucase.unfold]\n", f);
1351     const char* unfoldPrefix = "unfold = [\n  ";
1352     int32_t unfoldWidth = 16;
1353     usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth,
1354                     unfoldLength, indent, suffix);
1355 
1356     return 0;
1357 }
1358 
1359 #if !UCONFIG_NO_NORMALIZATION
1360 
exportNorm()1361 int exportNorm() {
1362     IcuToolErrorCode status("icuexportdata: exportNorm");
1363     USet* backwardCombiningStarters = uset_openEmpty();
1364     writeCanonicalCompositions(backwardCombiningStarters);
1365 
1366     std::vector<uint16_t> storage16;
1367     std::vector<uint32_t> storage32;
1368 
1369     // Note: the USets are not exported. They are only used to check that a new
1370     // Unicode version doesn't violate expectations that are hard-coded in ICU4X.
1371     USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
1372     USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1373     std::vector<PendingDescriptor> nfdPendingTrieInsertions;
1374     UChar32 nfdBound = 0x10FFFF;
1375     UChar32 nfcBound = 0x10FFFF;
1376     computeDecompositions("nfd",
1377                           backwardCombiningStarters,
1378                           storage16,
1379                           storage32,
1380                           nfdDecompositionStartsWithNonStarter,
1381                           nfdDecompositionStartsWithBackwardCombiningStarter,
1382                           nfdPendingTrieInsertions,
1383                           nfdBound,
1384                           nfcBound);
1385     if (!(nfdBound == 0xC0 && nfcBound == 0x300)) {
1386         // Unexpected bounds for NFD/NFC.
1387         status.set(U_INTERNAL_PROGRAM_ERROR);
1388         handleError(status, "exportNorm");
1389     }
1390 
1391     uint32_t baseSize16 = storage16.size();
1392     uint32_t baseSize32 = storage32.size();
1393 
1394     USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty();
1395     USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1396     std::vector<PendingDescriptor> nfkdPendingTrieInsertions;
1397     UChar32 nfkdBound = 0x10FFFF;
1398     UChar32 nfkcBound = 0x10FFFF;
1399     computeDecompositions("nfkd",
1400                           backwardCombiningStarters,
1401                           storage16,
1402                           storage32,
1403                           nfkdDecompositionStartsWithNonStarter,
1404                           nfkdDecompositionStartsWithBackwardCombiningStarter,
1405                           nfkdPendingTrieInsertions,
1406                           nfkdBound,
1407                           nfkcBound);
1408     if (!(nfkdBound <= 0xC0 && nfkcBound <= 0x300)) {
1409         status.set(U_INTERNAL_PROGRAM_ERROR);
1410         handleError(status, "exportNorm");
1411     }
1412     if (nfkcBound > 0xC0) {
1413         if (nfkdBound != 0xC0) {
1414             status.set(U_INTERNAL_PROGRAM_ERROR);
1415             handleError(status, "exportNorm");
1416         }
1417     } else {
1418         if (nfkdBound != nfkcBound) {
1419             status.set(U_INTERNAL_PROGRAM_ERROR);
1420             handleError(status, "exportNorm");
1421         }
1422     }
1423 
1424     USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty();
1425     USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1426     std::vector<PendingDescriptor> uts46PendingTrieInsertions;
1427     UChar32 uts46dBound = 0x10FFFF;
1428     UChar32 uts46Bound = 0x10FFFF;
1429     computeDecompositions("uts46d",
1430                           backwardCombiningStarters,
1431                           storage16,
1432                           storage32,
1433                           uts46DecompositionStartsWithNonStarter,
1434                           uts46DecompositionStartsWithBackwardCombiningStarter,
1435                           uts46PendingTrieInsertions,
1436                           uts46dBound,
1437                           uts46Bound);
1438     if (!(uts46dBound <= 0xC0 && uts46Bound <= 0x300)) {
1439         status.set(U_INTERNAL_PROGRAM_ERROR);
1440         handleError(status, "exportNorm");
1441     }
1442     if (uts46Bound > 0xC0) {
1443         if (uts46dBound != 0xC0) {
1444             status.set(U_INTERNAL_PROGRAM_ERROR);
1445             handleError(status, "exportNorm");
1446         }
1447     } else {
1448         if (uts46dBound != uts46Bound) {
1449             status.set(U_INTERNAL_PROGRAM_ERROR);
1450             handleError(status, "exportNorm");
1451         }
1452     }
1453 
1454     uint32_t supplementSize16 = storage16.size() - baseSize16;
1455     uint32_t supplementSize32 = storage32.size() - baseSize32;
1456 
1457     writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, char16_t(nfcBound));
1458     writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, char16_t(nfkcBound));
1459     writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, char16_t(uts46Bound));
1460 
1461     writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32);
1462     writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32);
1463 
1464     uset_close(nfdDecompositionStartsWithNonStarter);
1465     uset_close(nfkdDecompositionStartsWithNonStarter);
1466     uset_close(uts46DecompositionStartsWithNonStarter);
1467 
1468     uset_close(nfdDecompositionStartsWithBackwardCombiningStarter);
1469     uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter);
1470     uset_close(uts46DecompositionStartsWithBackwardCombiningStarter);
1471 
1472     uset_close(backwardCombiningStarters);
1473     handleError(status, "exportNorm");
1474     return 0;
1475 }
1476 
1477 #endif // !UCONFIG_NO_NORMALIZATION
1478 
main(int argc,char * argv[])1479 int main(int argc, char* argv[]) {
1480     U_MAIN_INIT_ARGS(argc, argv);
1481 
1482     /* preset then read command line options */
1483     options[OPT_DESTDIR].value=u_getDataDirectory();
1484     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
1485 
1486     if(options[OPT_VERSION].doesOccur) {
1487         printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n",
1488                U_ICU_DATA_VERSION);
1489         printf("%s\n", U_COPYRIGHT_STRING);
1490         exit(0);
1491     }
1492 
1493     /* error handling, printing usage message */
1494     if(argc<0) {
1495         fprintf(stderr,
1496             "error in command line argument \"%s\"\n",
1497             argv[-argc]);
1498     }
1499 
1500     if (argc < 0
1501             || options[OPT_HELP_H].doesOccur
1502             || options[OPT_HELP_QUESTION_MARK].doesOccur
1503             || !options[OPT_MODE].doesOccur) {
1504         FILE *stdfile=argc<0 ? stderr : stdout;
1505         printHelp(stdfile, argv[0]);
1506         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1507     }
1508 
1509     /* get the options values */
1510     haveCopyright = options[OPT_COPYRIGHT].doesOccur;
1511     destdir = options[OPT_DESTDIR].value;
1512     VERBOSE = options[OPT_VERBOSE].doesOccur;
1513     QUIET = options[OPT_QUIET].doesOccur;
1514 
1515     if (options[OPT_TRIE_TYPE].doesOccur) {
1516         if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1517             trieType = UCPTRIE_TYPE_FAST;
1518         } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1519             trieType = UCPTRIE_TYPE_SMALL;
1520         } else {
1521             fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1522             return U_ILLEGAL_ARGUMENT_ERROR;
1523         }
1524     }
1525 
1526     const char* mode = options[OPT_MODE].value;
1527     if (uprv_strcmp(mode, "norm") == 0) {
1528 #if !UCONFIG_NO_NORMALIZATION
1529         return exportNorm();
1530 #else
1531     fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n");
1532     return U_ILLEGAL_ARGUMENT_ERROR;
1533 #endif
1534     }
1535     if (uprv_strcmp(mode, "uprops") == 0) {
1536         return exportUprops(argc, argv);
1537     } else if (uprv_strcmp(mode, "ucase") == 0) {
1538         return exportCase(argc, argv);
1539     }
1540 
1541     fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n");
1542     return U_ILLEGAL_ARGUMENT_ERROR;
1543 }
1544