• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include <cstddef>
5 #include <cstdint>
6 #include <cstdio>
7 #include <iostream>
8 #include <unicode/localpointer.h>
9 #include <unicode/umachine.h>
10 #include <unicode/unistr.h>
11 #include <unicode/urename.h>
12 #include <unicode/uset.h>
13 #include <vector>
14 #include <algorithm>
15 #include "toolutil.h"
16 #include "uoptions.h"
17 #include "cmemory.h"
18 #include "charstr.h"
19 #include "cstring.h"
20 #include "unicode/uchar.h"
21 #include "unicode/errorcode.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uscript.h"
24 #include "unicode/putil.h"
25 #include "unicode/umutablecptrie.h"
26 #include "unicode/ucharstriebuilder.h"
27 #include "ucase.h"
28 #include "unicode/normalizer2.h"
29 #include "normalizer2impl.h"
30 #include "writesrc.h"
31 
32 U_NAMESPACE_USE
33 
/*
 * Global - verbosity
 */
// Verbose-output flag. Not read anywhere in the visible part of this file;
// presumably set from the command line -- TODO confirm against main().
UBool VERBOSE = false;
// When true, prepareOutputFile() does not print the "Writing to: ..." line.
UBool QUIET = false;

// When true, prepareOutputFile() starts each generated file with a copyright header.
UBool haveCopyright = true;
// Trie type handed to umutablecptrie_buildImmutable() for every exported code point trie.
UCPTrieType trieType = UCPTRIE_TYPE_SMALL;
// Output directory. When non-empty, prepareOutputFile() prepends it (plus a file
// separator if needed) to the generated file name.
const char* destdir = "";

// Mask constants for modified values in the Script CodePointTrie, values are logically 12-bits.
// Each mask occupies bits 11..10 of the trie value and is OR-ed with an index
// into the script_code_array written by dumpScriptExtensions():
//   0x0400 (1): the code point has Script=Common
//   0x0800 (2): the code point has Script=Inherited
//   0x0c00 (3): any other Script value
int16_t DATAEXPORT_SCRIPT_X_WITH_COMMON    = 0x0400;
int16_t DATAEXPORT_SCRIPT_X_WITH_INHERITED = 0x0800;
int16_t DATAEXPORT_SCRIPT_X_WITH_OTHER     = 0x0c00;
48 
// Hard-coded list of code points (written in decimal) that dumpScriptExtensions()
// iterates over: for each entry it looks up the Script_Extensions value and
// rewrites the corresponding value in the exported trie.
// NOTE(review): presumably this is the full set of code points that carry
// Script_Extensions data in the targeted Unicode version -- confirm before
// relying on it for a newer Unicode version.
// TODO(ICU-21821): Replace this with a call to a library function
int32_t scxCodePoints[] = {
      7415, 7377, 7380, 7387, 7390, 7391, 7394, 7395, 7396, 7397,
      7398, 7399, 7400, 7403, 7404, 7406, 7407, 7408, 7409, 113824,
      113825, 113826, 113827, 834, 837, 7616, 7617, 12294, 12350, 12351,
      12688, 12689, 12690, 12691, 12692, 12693, 12694, 12695, 12696, 12697,
      12698, 12699, 12700, 12701, 12702, 12703, 12736, 12737, 12738, 12739,
      12740, 12741, 12742, 12743, 12744, 12745, 12746, 12747, 12748, 12749,
      12750, 12751, 12752, 12753, 12754, 12755, 12756, 12757, 12758, 12759,
      12760, 12761, 12762, 12763, 12764, 12765, 12766, 12767, 12768, 12769,
      12770, 12771, 12832, 12833, 12834, 12835, 12836, 12837, 12838, 12839,
      12840, 12841, 12842, 12843, 12844, 12845, 12846, 12847, 12848, 12849,
      12850, 12851, 12852, 12853, 12854, 12855, 12856, 12857, 12858, 12859,
      12860, 12861, 12862, 12863, 12864, 12865, 12866, 12867, 12868, 12869,
      12870, 12871, 12928, 12929, 12930, 12931, 12932, 12933, 12934, 12935,
      12936, 12937, 12938, 12939, 12940, 12941, 12942, 12943, 12944, 12945,
      12946, 12947, 12948, 12949, 12950, 12951, 12952, 12953, 12954, 12955,
      12956, 12957, 12958, 12959, 12960, 12961, 12962, 12963, 12964, 12965,
      12966, 12967, 12968, 12969, 12970, 12971, 12972, 12973, 12974, 12975,
      12976, 12992, 12993, 12994, 12995, 12996, 12997, 12998, 12999, 13000,
      13001, 13002, 13003, 13055, 13144, 13145, 13146, 13147, 13148, 13149,
      13150, 13151, 13152, 13153, 13154, 13155, 13156, 13157, 13158, 13159,
      13160, 13161, 13162, 13163, 13164, 13165, 13166, 13167, 13168, 13179,
      13180, 13181, 13182, 13183, 13280, 13281, 13282, 13283, 13284, 13285,
      13286, 13287, 13288, 13289, 13290, 13291, 13292, 13293, 13294, 13295,
      13296, 13297, 13298, 13299, 13300, 13301, 13302, 13303, 13304, 13305,
      13306, 13307, 13308, 13309, 13310, 119648, 119649, 119650, 119651, 119652,
      119653, 119654, 119655, 119656, 119657, 119658, 119659, 119660, 119661, 119662,
      119663, 119664, 119665, 127568, 127569, 867, 868, 869, 870, 871,
      872, 873, 874, 875, 876, 877, 878, 879, 7418, 7674,
      66272, 66273, 66274, 66275, 66276, 66277, 66278, 66279, 66280, 66281,
      66282, 66283, 66284, 66285, 66286, 66287, 66288, 66289, 66290, 66291,
      66292, 66293, 66294, 66295, 66296, 66297, 66298, 66299, 1748, 64830,
      64831, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619,
      1620, 1621, 1648, 65010, 65021, 7381, 7382, 7384, 7393, 7402,
      7405, 7413, 7414, 43249, 12330, 12331, 12332, 12333, 43471, 65794,
      65847, 65848, 65849, 65850, 65851, 65852, 65853, 65854, 65855, 1156,
      1159, 11843, 42607, 1157, 1158, 1155, 7672, 7379, 7411, 7416,
      7417, 7401, 7383, 7385, 7388, 7389, 7392, 43251, 4347, 3046,
      3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3056,
      3057, 3058, 3059, 70401, 70403, 70459, 70460, 73680, 73681, 73683,
      2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799,
      2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671,
      42752, 42753, 42754, 42755, 42756, 42757, 42758, 42759, 12337, 12338,
      12339, 12340, 12341, 12441, 12442, 12443, 12444, 12448, 12540, 65392,
      65438, 65439, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309,
      3310, 3311, 8239, 68338, 6146, 6147, 6149, 1564, 1632, 1633,
      1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 2534, 2535,
      2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 4160, 4161,
      4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 65792, 65793,
      65799, 65800, 65801, 65802, 65803, 65804, 65805, 65806, 65807, 65808,
      65809, 65810, 65811, 65812, 65813, 65814, 65815, 65816, 65817, 65818,
      65819, 65820, 65821, 65822, 65823, 65824, 65825, 65826, 65827, 65828,
      65829, 65830, 65831, 65832, 65833, 65834, 65835, 65836, 65837, 65838,
      65839, 65840, 65841, 65842, 65843, 7412, 8432, 12348, 12349, 43310,
      7376, 7378, 5941, 5942, 2406, 2407, 2408, 2409, 2410, 2411,
      2412, 2413, 2414, 2415, 12291, 12307, 12316, 12317, 12318, 12319,
      12336, 12343, 65093, 65094, 1548, 1563, 12289, 12290, 12296, 12297,
      12298, 12299, 12300, 12301, 12302, 12303, 12304, 12305, 12308, 12309,
      12310, 12311, 12312, 12313, 12314, 12315, 12539, 65377, 65378, 65379,
      65380, 65381, 7386, 1567, 7410, 1600, 43062, 43063, 43064, 43065,
      2386, 2385, 43059, 43060, 43061, 43056, 43057, 43058, 2404, 2405
    };
112 
handleError(ErrorCode & status,const char * context)113 void handleError(ErrorCode& status, const char* context) {
114     if (status.isFailure()) {
115         std::cerr << "Error: " << context << ": " << status.errorName() << std::endl;
116         exit(status.reset());
117     }
118 }
119 
120 class PropertyValueNameGetter : public ValueNameGetter {
121 public:
PropertyValueNameGetter(UProperty prop)122     PropertyValueNameGetter(UProperty prop) : property(prop) {}
123     ~PropertyValueNameGetter() override;
getName(uint32_t value)124     const char *getName(uint32_t value) override {
125         return u_getPropertyValueName(property, value, U_SHORT_PROPERTY_NAME);
126     }
127 
128 private:
129     UProperty property;
130 };
131 
~PropertyValueNameGetter()132 PropertyValueNameGetter::~PropertyValueNameGetter() {}
133 
dumpBinaryProperty(UProperty uproperty,FILE * f)134 void dumpBinaryProperty(UProperty uproperty, FILE* f) {
135     IcuToolErrorCode status("icuexportdata: dumpBinaryProperty");
136     const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
137     const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
138     const USet* uset = u_getBinaryPropertySet(uproperty, status);
139     handleError(status, fullPropName);
140 
141     fputs("[[binary_property]]\n", f);
142     fprintf(f, "long_name = \"%s\"\n", fullPropName);
143     if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
144     usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML);
145 }
146 
dumpEnumeratedProperty(UProperty uproperty,FILE * f)147 void dumpEnumeratedProperty(UProperty uproperty, FILE* f) {
148     IcuToolErrorCode status("icuexportdata: dumpEnumeratedProperty");
149     const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
150     const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
151     const UCPMap* umap = u_getIntPropertyMap(uproperty, status);
152     handleError(status, fullPropName);
153 
154     fputs("[[enum_property]]\n", f);
155     fprintf(f, "long_name = \"%s\"\n", fullPropName);
156     if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
157     PropertyValueNameGetter valueNameGetter(uproperty);
158     usrc_writeUCPMap(f, umap, &valueNameGetter, UPRV_TARGET_SYNTAX_TOML);
159     fputs("\n", f);
160 
161     U_ASSERT(u_getIntPropertyMinValue(uproperty) >= 0);
162     int32_t maxValue = u_getIntPropertyMaxValue(uproperty);
163     U_ASSERT(maxValue >= 0);
164     UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
165     if (maxValue <= 0xff) {
166         width = UCPTRIE_VALUE_BITS_8;
167     } else if (maxValue <= 0xffff) {
168         width = UCPTRIE_VALUE_BITS_16;
169     }
170     LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(umap, status));
171     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
172         builder.getAlias(),
173         trieType,
174         width,
175         status));
176     handleError(status, fullPropName);
177 
178     fputs("[enum_property.code_point_trie]\n", f);
179     usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
180 }
181 
dumpScriptExtensions(FILE * f)182 void dumpScriptExtensions(FILE* f) {
183     IcuToolErrorCode status("icuexportdata: dumpScriptExtensions");
184 
185     fputs("[[script_extensions]]\n", f);
186     const char* scxFullPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_LONG_PROPERTY_NAME);
187     const char* scxShortPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_SHORT_PROPERTY_NAME);
188     fprintf(f, "long_name = \"%s\"\n", scxFullPropName);
189     if (scxShortPropName) fprintf(f, "short_name = \"%s\"\n", scxShortPropName);
190 
191     // We want to use 16 bits for our exported trie of sc/scx data because we
192     // need 12 bits to match the 12 bits of data stored for sc/scx in the trie
193     // in the uprops.icu data file.
194     UCPTrieValueWidth scWidth = UCPTRIE_VALUE_BITS_16;
195 
196     // Create a mutable UCPTrie builder populated with Script property values data.
197     const UCPMap* scInvMap = u_getIntPropertyMap(UCHAR_SCRIPT, status);
198     handleError(status, scxFullPropName);
199     LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(scInvMap, status));
200     handleError(status, scxFullPropName);
201 
202     // The values for the output scx companion array.
203     // Invariant is that all subvectors are distinct.
204     std::vector< std::vector<uint16_t> > outputDedupVec;
205 
206     // The sc/scx companion array is an array of arrays (of script codes)
207     fputs("script_code_array = [\n", f);
208     for(const UChar32 cp : scxCodePoints) {
209         // Get the Script value
210         uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp);
211         // Get the Script_Extensions value (array of Script codes)
212         const int32_t SCX_ARRAY_CAPACITY = 32;
213         UScriptCode scxValArray[SCX_ARRAY_CAPACITY];
214         int32_t numScripts = uscript_getScriptExtensions(cp, scxValArray, SCX_ARRAY_CAPACITY, status);
215         handleError(status, scxFullPropName);
216 
217         // Convert the scx array into a vector
218         std::vector<uint16_t> scxValVec;
219         for(int i = 0; i < numScripts; i++) {
220             scxValVec.push_back(scxValArray[i]);
221         }
222         // Ensure that it is sorted
223         std::sort(scxValVec.begin(), scxValVec.end());
224         // Copy the Script value into the first position of the scx array only
225         // if we have the "other" case (Script value is not Common nor Inherited).
226         // This offers faster access when users want only the Script value.
227         if (scVal != USCRIPT_COMMON && scVal != USCRIPT_INHERITED) {
228             scxValVec.insert(scxValVec.begin(), scVal);
229         }
230 
231         // See if there is already an scx value array matching the newly built one.
232         // If there is, then use its index.
233         // If not, then append the new value array.
234         bool isScxValUnique = true;
235         size_t outputIndex = 0;
236         for (outputIndex = 0; outputIndex < outputDedupVec.size(); outputIndex++) {
237             if (outputDedupVec[outputIndex] == scxValVec) {
238                 isScxValUnique = false;
239                 break;
240             }
241         }
242 
243         if (isScxValUnique) {
244             outputDedupVec.push_back(scxValVec);
245             usrc_writeArray(f, "  [", scxValVec.data(), 16, scxValVec.size(), "    ", "],\n");
246         }
247 
248         // We must update the value in the UCPTrie for the code point to contain:
249         // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is
250         //   the index into the companion array
251         // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether
252         //   3: other
253         //   2: Script=Inherited
254         //   1: Script=Common
255         //   0: Script=value in 9..0 (N/A because we are in this loop to create the companion array for non-0 cases)
256         uint16_t mask = 0;
257         if (scVal == USCRIPT_COMMON) {
258             mask = DATAEXPORT_SCRIPT_X_WITH_COMMON;
259         } else if (scVal == USCRIPT_INHERITED) {
260             mask = DATAEXPORT_SCRIPT_X_WITH_INHERITED;
261         } else {
262             mask = DATAEXPORT_SCRIPT_X_WITH_OTHER;
263         }
264 
265         // The new trie value is the index into the new array with the high order bits set
266         uint32_t newScVal = outputIndex | mask;
267 
268         // Update the code point in the mutable trie builder with the trie value
269         umutablecptrie_set(builder.getAlias(), cp, newScVal, status);
270         handleError(status, scxFullPropName);
271     }
272     fputs("]\n\n", f);  // Print the TOML close delimiter for the outer array.
273 
274     // Convert from mutable trie builder to immutable trie.
275     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
276         builder.getAlias(),
277         trieType,
278         scWidth,
279         status));
280     handleError(status, scxFullPropName);
281 
282     fputs("[script_extensions.code_point_trie]\n", f);
283     usrc_writeUCPTrie(f, scxShortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
284 }
285 
prepareOutputFile(const char * basename)286 FILE* prepareOutputFile(const char* basename) {
287     IcuToolErrorCode status("icuexportdata");
288     CharString outFileName;
289     if (destdir != nullptr && *destdir != 0) {
290         outFileName.append(destdir, status).ensureEndsWithFileSeparator(status);
291     }
292     outFileName.append(basename, status);
293     outFileName.append(".toml", status);
294     handleError(status, basename);
295 
296     FILE* f = fopen(outFileName.data(), "w");
297     if (f == nullptr) {
298         std::cerr << "Unable to open file: " << outFileName.data() << std::endl;
299         exit(U_FILE_ACCESS_ERROR);
300     }
301     if (!QUIET) {
302         std::cout << "Writing to: " << outFileName.data() << std::endl;
303     }
304 
305     if (haveCopyright) {
306         usrc_writeCopyrightHeader(f, "#", 2021);
307     }
308     usrc_writeFileNameGeneratedBy(f, "#", basename, "icuexportdata.cpp");
309 
310     return f;
311 }
312 
313 #if !UCONFIG_NO_NORMALIZATION
314 
// One trie entry that still has to be inserted by writeDecompositionData().
struct PendingDescriptor {
    // Code point whose trie value is being set.
    UChar32 scalar;
    // Trie value before the final adjustments made in writeDecompositionData().
    // There, when none of the bits in 0xFFFE0000 are set, the low 12 bits are
    // treated as a storage offset and rebased; the two 16-bit halves are then
    // swapped before the value is stored.
    uint32_t descriptor;
    // Selects which offset rebasing applies in writeDecompositionData():
    // true for the 32-bit storage, false for the 16-bit storage.
    UBool supplementary;
};
320 
writeCanonicalCompositions(USet * backwardCombiningStarters)321 void writeCanonicalCompositions(USet* backwardCombiningStarters) {
322     IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions");
323     const char* basename = "compositions";
324     FILE* f = prepareOutputFile(basename);
325 
326     LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status);
327 
328     const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
329     UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
330 
331     const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
332     for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
333         if (c >= 0xD800 && c < 0xE000) {
334             // Surrogate
335             continue;
336         }
337         UnicodeString decomposition;
338         if (!nfc->getRawDecomposition(c, decomposition)) {
339             continue;
340         }
341         int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
342         if (len != 2) {
343             continue;
344         }
345         UChar32 starter = utf32[0];
346         UChar32 second = utf32[1];
347         UChar32 composite = nfc->composePair(starter, second);
348         if (composite < 0) {
349             continue;
350         }
351         if (c != composite) {
352             status.set(U_INTERNAL_PROGRAM_ERROR);
353             handleError(status, basename);
354         }
355         if (!u_getCombiningClass(second)) {
356             uset_add(backwardCombiningStarters, second);
357         }
358         if (composite >= 0xAC00 && composite <= 0xD7A3) {
359             // Hangul syllable
360             continue;
361         }
362 
363         UnicodeString backward;
364         backward.append(second);
365         backward.append(starter);
366         backwardBuilder->add(backward, int32_t(composite), status);
367     }
368     UnicodeString canonicalCompositionTrie;
369     backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status);
370 
371     usrc_writeArray(f, "compositions = [\n  ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), "  ", "\n]\n");
372     fclose(f);
373     handleError(status, basename);
374 }
375 
writeDecompositionTables(const char * basename,const uint16_t * ptr16,size_t len16,const uint32_t * ptr32,size_t len32)376 void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) {
377     FILE* f = prepareOutputFile(basename);
378     usrc_writeArray(f, "scalars16 = [\n  ", ptr16, 16, len16, "  ", "\n]\n");
379     usrc_writeArray(f, "scalars32 = [\n  ", ptr32, 32, len32, "  ", "\n]\n");
380     fclose(f);
381 }
382 
writeDecompositionData(const char * basename,uint32_t baseSize16,uint32_t baseSize32,uint32_t supplementSize16,USet * uset,USet * reference,const std::vector<PendingDescriptor> & pendingTrieInsertions,char16_t passthroughCap)383 void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, char16_t passthroughCap) {
384     IcuToolErrorCode status("icuexportdata: writeDecompositionData");
385     FILE* f = prepareOutputFile(basename);
386 
387     // Zero is a magic number that means the character decomposes to itself.
388     LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
389 
390     // Iterate backwards to insert lower code points in the trie first in case it matters
391     // for trie block allocation.
392     for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
393         const PendingDescriptor& pending = pendingTrieInsertions[i];
394         uint32_t additional = 0;
395         if (!(pending.descriptor & 0xFFFE0000)) {
396             uint32_t offset = pending.descriptor & 0xFFF;
397             if (!pending.supplementary) {
398                 if (offset >= baseSize16) {
399                     // This is a offset to supplementary 16-bit data. We have
400                     // 16-bit base data and 32-bit base data before. However,
401                     // the 16-bit base data length is already part of offset.
402                     additional = baseSize32;
403                 }
404             } else {
405                 if (offset >= baseSize32) {
406                     // This is an offset to supplementary 32-bit data. We have 16-bit
407                     // base data, 32-bit base data, and 16-bit supplementary data before.
408                     // However, the 32-bit base data length is already part
409                     // of offset.
410                     additional = baseSize16 + supplementSize16;
411                 } else {
412                     // This is an offset to 32-bit base data. We have 16-bit
413                     // base data before.
414                     additional = baseSize16;
415                 }
416             }
417             if (offset + additional > 0xFFF) {
418                 status.set(U_INTERNAL_PROGRAM_ERROR);
419                 handleError(status, basename);
420             }
421         }
422         // It turns out it's better to swap the halves compared to the initial
423         // idea in order to put special marker values close to zero so that
424         // an important marker value becomes 1, so it's efficient to compare
425         // "1 or 0". Unfortunately, going through all the code to swap
426         // things is too error prone, so let's do the swapping here in one
427         // place.
428         uint32_t oldTrieValue = pending.descriptor + additional;
429         uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16);
430         umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status);
431     }
432     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
433         builder.getAlias(),
434         trieType,
435         UCPTRIE_VALUE_BITS_32,
436         status));
437     handleError(status, basename);
438 
439     if (reference) {
440         if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
441             // NFD expectations don't hold. The set must not contain the half-width
442             // kana voicing marks and must contain iota subscript.
443             status.set(U_INTERNAL_PROGRAM_ERROR);
444             handleError(status, basename);
445         }
446 
447         USet* halfWidthVoicing = uset_openEmpty();
448         uset_add(halfWidthVoicing, 0xFF9E);
449         uset_add(halfWidthVoicing, 0xFF9F);
450 
451         USet* iotaSubscript = uset_openEmpty();
452         uset_add(iotaSubscript, 0x0345);
453 
454         uint8_t flags = 0;
455 
456         USet* halfWidthCheck = uset_cloneAsThawed(uset);
457         uset_removeAll(halfWidthCheck, reference);
458         if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
459             flags |= 1;
460         } else if (!uset_isEmpty(halfWidthCheck)) {
461             // The result was neither empty nor contained exactly
462             // the two half-width voicing marks. The ICU4X
463             // normalizer doesn't know how to deal with this case.
464             status.set(U_INTERNAL_PROGRAM_ERROR);
465             handleError(status, basename);
466         }
467         uset_close(halfWidthCheck);
468 
469         USet* iotaCheck = uset_cloneAsThawed(reference);
470         uset_removeAll(iotaCheck, uset);
471         if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) {
472             // The result was neither empty nor contained exactly
473             // the iota subscript. The ICU4X normalizer doesn't
474             // know how to deal with this case.
475             status.set(U_INTERNAL_PROGRAM_ERROR);
476             handleError(status, basename);
477         }
478         uset_close(halfWidthCheck);
479 
480         uset_close(iotaSubscript);
481         uset_close(halfWidthVoicing);
482 
483         fprintf(f, "flags = 0x%X\n", flags);
484         fprintf(f, "cap = 0x%X\n", passthroughCap);
485     }
486     fprintf(f, "[trie]\n");
487     usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
488     fclose(f);
489     handleError(status, basename);
490 }
491 
// The three markers below are small values that computeDecompositions()
// checks against the first code point of a single-code-point decomposition;
// it raises an internal error if a different character decomposes to exactly
// one of these values.

// Special marker for the NFKD form of U+FDFA
const int32_t FDFA_MARKER = 3;

// Special marker for characters whose decomposition starts with a non-starter
// and the decomposition isn't the character itself.
const int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2;

// Special marker for starters that decompose to themselves but that may
// combine backwards under canonical composition
const int32_t BACKWARD_COMBINING_STARTER_MARKER = 1;

/// Marker that a complex decomposition isn't round-trippable
/// under re-composition.
/// NOTE(review): its use is outside the visible part of this file; it is
/// unsigned, unlike the int32_t markers above -- confirm how it is applied.
const uint32_t NON_ROUND_TRIP_MARKER = 1;
506 
permissibleBmpPair(UBool knownToRoundTrip,UChar32 c,UChar32 second)507 UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) {
508     if (knownToRoundTrip) {
509         return true;
510     }
511     // Nuktas, Hebrew presentation forms and polytonic Greek with oxia
512     // are special-cased in ICU4X.
513     if (c >= 0xFB1D && c <= 0xFB4E) {
514         // Hebrew presentation forms
515         return true;
516     }
517     if (c >= 0x1F71 && c <= 0x1FFB) {
518         // Polytonic Greek with oxia
519         return true;
520     }
521     if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) {
522         // Nukta
523         return true;
524     }
525     // To avoid more branchiness, 4 characters that decompose to
526     // a BMP starter followed by a BMP non-starter are excluded
527     // from being encoded directly into the trie value and are
528     // handled as complex decompositions instead. These are:
529     // U+0F76 TIBETAN VOWEL SIGN VOCALIC R
530     // U+0F78 TIBETAN VOWEL SIGN VOCALIC L
531     // U+212B ANGSTROM SIGN
532     // U+2ADC FORKING
533     return false;
534 }
535 
536 // Computes data for canonical decompositions
computeDecompositions(const char * basename,const USet * backwardCombiningStarters,std::vector<uint16_t> & storage16,std::vector<uint32_t> & storage32,USet * decompositionStartsWithNonStarter,USet * decompositionStartsWithBackwardCombiningStarter,std::vector<PendingDescriptor> & pendingTrieInsertions,UChar32 & decompositionPassthroughBound,UChar32 & compositionPassthroughBound)537 void computeDecompositions(const char* basename,
538                            const USet* backwardCombiningStarters,
539                            std::vector<uint16_t>& storage16,
540                            std::vector<uint32_t>& storage32,
541                            USet* decompositionStartsWithNonStarter,
542                            USet* decompositionStartsWithBackwardCombiningStarter,
543                            std::vector<PendingDescriptor>& pendingTrieInsertions,
544                            UChar32& decompositionPassthroughBound,
545                            UChar32& compositionPassthroughBound) {
546     IcuToolErrorCode status("icuexportdata: computeDecompositions");
547     const Normalizer2* mainNormalizer;
548     const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status);
549     const Normalizer2* nfcNormalizer = Normalizer2::getNFCInstance(status);
550     FILE* f = NULL;
551     std::vector<uint32_t> nonRecursive32;
552     LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status));
553 
554     if (uprv_strcmp(basename, "nfkd") == 0) {
555         mainNormalizer = Normalizer2::getNFKDInstance(status);
556     } else if (uprv_strcmp(basename, "uts46d") == 0) {
557         mainNormalizer = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status);
558     } else {
559         mainNormalizer = nfdNormalizer;
560         f = prepareOutputFile("decompositionex");
561     }
562 
563     // Max length as of Unicode 14 is 4 for NFD. For NFKD the max
564     // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB).
565     const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9;
566     const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8;
567     const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
568     UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
569     const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2;
570     UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE];
571 
572     // Iterate over all scalar values excluding Hangul syllables.
573     //
574     // We go backwards in order to better find overlapping decompositions.
575     //
576     // As of Unicode 14:
577     // Iterate forward without overlap search:
578     // nfd: 16 size: 896, 32 size: 173
579     // nfkd: 16 size: 3854, 32 size: 179
580     //
581     // Iterate forward with overlap search:
582     // nfd: 16 size: 888, 32 size: 173
583     // nfkd: 16 size: 3266, 32 size: 179
584     //
585     // Iterate backward with overlap search:
586     // nfd: 16 size: 776, 32 size: 173
587     // nfkd: 16 size: 2941, 32 size: 179
588     //
589     // UChar32 is signed!
590     for (UChar32 c = 0x10FFFF; c >= 0; --c) {
591         if (c >= 0xAC00 && c <= 0xD7A3) {
592             // Hangul syllable
593             continue;
594         }
595         if (c >= 0xD800 && c < 0xE000) {
596             // Surrogate
597             continue;
598         }
599         UnicodeString src;
600         UnicodeString dst;
601         // True if we're building non-NFD or we're building NFD but
602         // the `c` round trips to NFC.
603         // False if we're building NFD and `c` does not round trip to NFC.
604         UBool nonNfdOrRoundTrips = true;
605         src.append(c);
606         if (mainNormalizer != nfdNormalizer) {
607             UnicodeString inter;
608             mainNormalizer->normalize(src, inter, status);
609             nfdNormalizer->normalize(inter, dst, status);
610         } else {
611             nfdNormalizer->normalize(src, dst, status);
612             UnicodeString nfc;
613             nfcNormalizer->normalize(dst, nfc, status);
614             nonNfdOrRoundTrips = (src == nfc);
615         }
616         int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
617         if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
618             // Characters that normalize to nothing or to U+FFFD (without the
619             // input being U+FFFD) in ICU4C's UTS 46 normalization normalize
620             // as in NFD in ICU4X's UTF 46 normalization in the interest
621             // of data size and ICU4X's normalizer being unable to handle
622             // normalizing to nothing.
623             // When UTS 46 is implemented on top of ICU4X, a preprocessing
624             // step is supposed to remove these characters before the
625             // normalization step.
626             if (uprv_strcmp(basename, "uts46d") != 0) {
627                 status.set(U_INTERNAL_PROGRAM_ERROR);
628                 handleError(status, basename);
629             }
630             nfdNormalizer->normalize(src, dst, status);
631             len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
632             if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
633                 status.set(U_INTERNAL_PROGRAM_ERROR);
634                 handleError(status, basename);
635             }
636         }
637         if (len > DECOMPOSITION_BUFFER_SIZE) {
638             status.set(U_INTERNAL_PROGRAM_ERROR);
639             handleError(status, basename);
640         }
641         uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]);
642         bool specialNonStarterDecomposition = false;
643         bool startsWithBackwardCombiningStarter = false;
644         if (firstCombiningClass) {
645             decompositionPassthroughBound = c;
646             compositionPassthroughBound = c;
647             uset_add(decompositionStartsWithNonStarter, c);
648             if (src != dst) {
649                 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) {
650                     specialNonStarterDecomposition = true;
651                 } else {
652                     // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
653                     status.set(U_INTERNAL_PROGRAM_ERROR);
654                     handleError(status, basename);
655                 }
656             }
657         } else if (uset_contains(backwardCombiningStarters, utf32[0])) {
658             compositionPassthroughBound = c;
659             startsWithBackwardCombiningStarter = true;
660             uset_add(decompositionStartsWithBackwardCombiningStarter, c);
661         }
662         if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) {
663             status.set(U_INTERNAL_PROGRAM_ERROR);
664             handleError(status, basename);
665         }
666         if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) {
667             status.set(U_INTERNAL_PROGRAM_ERROR);
668             handleError(status, basename);
669         }
670         if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) {
671             status.set(U_INTERNAL_PROGRAM_ERROR);
672             handleError(status, basename);
673         }
674         if (mainNormalizer != nfdNormalizer) {
675             UnicodeString nfd;
676             nfdNormalizer->normalize(src, nfd, status);
677             if (dst == nfd) {
678                 continue;
679             }
680             decompositionPassthroughBound = c;
681             compositionPassthroughBound = c;
682         } else if (firstCombiningClass) {
683             len = 1;
684             if (specialNonStarterDecomposition) {
685                 utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value
686             } else {
687                 // Use the surrogate range to store the canonical combining class
688                 utf32[0] = 0xD800 | UChar32(firstCombiningClass);
689             }
690         } else {
691             if (src == dst) {
692                 if (startsWithBackwardCombiningStarter) {
693                     pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, false});
694                 }
695                 continue;
696             }
697             decompositionPassthroughBound = c;
698             // ICU4X hard-codes ANGSTROM SIGN
699             if (c != 0x212B) {
700                 UnicodeString raw;
701                 if (!nfdNormalizer->getRawDecomposition(c, raw)) {
702                     // We're always supposed to have a non-recursive decomposition
703                     // if we had a recursive one.
704                     status.set(U_INTERNAL_PROGRAM_ERROR);
705                     handleError(status, basename);
706                 }
707                 // In addition to actual difference, put the whole range that contains characters
708                 // with oxia into the non-recursive trie in order to catch cases where characters
709                 // with oxia have singleton decompositions to corresponding characters with tonos.
710                 // This way, the run-time decision to fall through can be done on the range
711                 // without checking for individual characters inside the range.
712                 if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) {
713                     int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status);
714                     if (!rawLen) {
715                         status.set(U_INTERNAL_PROGRAM_ERROR);
716                         handleError(status, basename);
717                     }
718                     if (rawLen == 1) {
719                         if (c >= 0xFFFF) {
720                             status.set(U_INTERNAL_PROGRAM_ERROR);
721                             handleError(status, basename);
722                         }
723                         umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, uint32_t(rawUtf32[0]), status);
724                     } else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) {
725                         if (!rawUtf32[0] || !rawUtf32[1]) {
726                             status.set(U_INTERNAL_PROGRAM_ERROR);
727                             handleError(status, basename);
728                         }
729                         // Swapped for consistency with the primary trie
730                         uint32_t bmpPair = uint32_t(rawUtf32[1]) << 16 | uint32_t(rawUtf32[0]);
731                         umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status);
732                     } else {
733                         // Let's add 1 to index to make it always non-zero to distinguish
734                         // it from the default zero.
735                         uint32_t index = nonRecursive32.size() + 1;
736                         nonRecursive32.push_back(uint32_t(rawUtf32[0]));
737                         nonRecursive32.push_back(uint32_t(rawUtf32[1]));
738                         if (index > 0xFFFF) {
739                             status.set(U_INTERNAL_PROGRAM_ERROR);
740                             handleError(status, basename);
741                         }
742                         umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status);
743                     }
744                 }
745             }
746         }
747         if (!nonNfdOrRoundTrips) {
748             compositionPassthroughBound = c;
749         }
750         if (len == 1 && utf32[0] <= 0xFFFF) {
751             if (startsWithBackwardCombiningStarter) {
752                 if (mainNormalizer == nfdNormalizer) {
753                     // Not supposed to happen in NFD
754                     status.set(U_INTERNAL_PROGRAM_ERROR);
755                     handleError(status, basename);
756                 } else if (!((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) {
757                     // Other than conjoining jamo vowels and trails
758                     // unsupported for non-NFD.
759                     status.set(U_INTERNAL_PROGRAM_ERROR);
760                     handleError(status, basename);
761                 }
762             }
763             pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, false});
764         } else if (len == 2 &&
765                    utf32[0] <= 0xFFFF &&
766                    utf32[1] <= 0xFFFF &&
767                    !u_getCombiningClass(utf32[0]) &&
768                    u_getCombiningClass(utf32[1]) &&
769                    permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) {
770             for (int32_t i = 0; i < len; ++i) {
771                 if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
772                     // Assert that iota subscript and half-width voicing marks never occur in these
773                     // expansions in the normalization forms where they are special.
774                     status.set(U_INTERNAL_PROGRAM_ERROR);
775                     handleError(status, basename);
776                 }
777             }
778             if (startsWithBackwardCombiningStarter) {
779                 status.set(U_INTERNAL_PROGRAM_ERROR);
780                 handleError(status, basename);
781             }
782             pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), false});
783         } else {
784             if (startsWithBackwardCombiningStarter) {
785                 status.set(U_INTERNAL_PROGRAM_ERROR);
786                 handleError(status, basename);
787             }
788 
789             UBool supplementary = false;
790             UBool nonInitialStarter = false;
791             for (int32_t i = 0; i < len; ++i) {
792                 if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
793                     // Assert that iota subscript and half-width voicing marks never occur in these
794                     // expansions in the normalization forms where they are special.
795                     status.set(U_INTERNAL_PROGRAM_ERROR);
796                     handleError(status, basename);
797                 }
798 
799                 if (utf32[i] > 0xFFFF) {
800                     supplementary = true;
801                 }
802                 if (utf32[i] == 0) {
803                     status.set(U_INTERNAL_PROGRAM_ERROR);
804                     handleError(status, basename);
805                 }
806                 if (i != 0 && !u_getCombiningClass(utf32[i])) {
807                     nonInitialStarter = true;
808                 }
809             }
810             if (!supplementary) {
811                 if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
812                     if (len == 18 && c == 0xFDFA) {
813                         // Special marker for the one character whose decomposition
814                         // is too long.
815                         pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary});
816                         continue;
817                     } else {
818                         status.set(U_INTERNAL_PROGRAM_ERROR);
819                         handleError(status, basename);
820                     }
821                 }
822             } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
823                 status.set(U_INTERNAL_PROGRAM_ERROR);
824                 handleError(status, basename);
825             }
826             // Complex decomposition
827             // Format for 16-bit value:
828             // 15..13: length minus two for 16-bit case and length minus one for
829             //         the 32-bit case. Length 8 needs to fit in three bits in
830             //         the 16-bit case, and this way the value is future-proofed
831             //         up to 9 in the 16-bit case. Zero is unused and length one
832             //         in the 16-bit case goes directly into the trie.
833             //     12: 1 if all trailing characters are guaranteed non-starters,
834             //         0 if no guarantees about non-starterness.
835             //         Note: The bit choice is this way around to allow for
836             //         dynamically falling back to not having this but instead
837             //         having one more bit for length by merely choosing
838             //         different masks.
839             //  11..0: Start offset in storage. The offset is to the logical
840             //         sequence of scalars16, scalars32, supplementary_scalars16,
841             //         supplementary_scalars32.
842             uint32_t descriptor = uint32_t(!nonInitialStarter) << 12;
843             if (!supplementary) {
844                 descriptor |= (uint32_t(len) - 2) << 13;
845             } else {
846                 descriptor |= (uint32_t(len) - 1) << 13;
847             }
848             if (descriptor & 0xFFF) {
849                 status.set(U_INTERNAL_PROGRAM_ERROR);
850                 handleError(status, basename);
851             }
852             size_t index = 0;
853             bool writeToStorage = false;
854             // Sadly, C++ lacks break and continue by label, so using goto in the
855             // inner loops to break or continue the outer loop.
856             if (!supplementary) {
857                 outer16: for (;;) {
858                     if (index == storage16.size()) {
859                         writeToStorage = true;
860                         break;
861                     }
862                     if (storage16[index] == utf32[0]) {
863                         for (int32_t i = 1; i < len; ++i) {
864                             if (storage16[index + i] != uint32_t(utf32[i])) {
865                                 ++index;
866                                 // continue outer
867                                 goto outer16;
868                             }
869                         }
870                         // break outer
871                         goto after;
872                     }
873                     ++index;
874                 }
875             } else {
876                 outer32: for (;;) {
877                     if (index == storage32.size()) {
878                         writeToStorage = true;
879                         break;
880                     }
881                     if (storage32[index] == uint32_t(utf32[0])) {
882                         for (int32_t i = 1; i < len; ++i) {
883                             if (storage32[index + i] != uint32_t(utf32[i])) {
884                                 ++index;
885                                 // continue outer
886                                 goto outer32;
887                             }
888                         }
889                         // break outer
890                         goto after;
891                     }
892                     ++index;
893                 }
894             }
895             after:
896             if (index > 0xFFF) {
897                 status.set(U_INTERNAL_PROGRAM_ERROR);
898                 handleError(status, basename);
899             }
900             descriptor |= uint32_t(index);
901             if (!descriptor || descriptor > 0xFFFF) {
902                 // > 0xFFFF should never happen if the code above is correct.
903                 // == 0 should not happen due to the nature of the data.
904                 status.set(U_INTERNAL_PROGRAM_ERROR);
905                 handleError(status, basename);
906             }
907             if (writeToStorage) {
908                 if (!supplementary) {
909                     for (int32_t i = 0; i < len; ++i) {
910                         storage16.push_back(uint16_t(utf32[i]));
911                     }
912                 } else {
913                     for (int32_t i = 0; i < len; ++i) {
914                         storage32.push_back(uint32_t(utf32[i]));
915                     }
916                 }
917             }
918 
919             uint32_t nonRoundTripMarker = 0;
920             if (!nonNfdOrRoundTrips) {
921                 nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16);
922             }
923             pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker, supplementary});
924         }
925     }
926     if (storage16.size() + storage32.size() > 0xFFF) {
927         status.set(U_INTERNAL_PROGRAM_ERROR);
928     }
929     if (f) {
930         usrc_writeArray(f, "scalars32 = [\n  ", nonRecursive32.data(), 32, nonRecursive32.size(), "  ", "\n]\n");
931 
932         LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
933             nonRecursiveBuilder.getAlias(),
934             trieType,
935             UCPTRIE_VALUE_BITS_32,
936             status));
937         handleError(status, basename);
938 
939         fprintf(f, "[trie]\n");
940         usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
941 
942         fclose(f);
943     }
944     handleError(status, basename);
945 }
946 
947 #endif // !UCONFIG_NO_NORMALIZATION
948 
// Positions of the command-line options inside the options[] table.
// Each enumerator is used as an index into that table (options[OPT_MODE],
// options[OPT_ALL], ...), so the order here has to mirror the order of the
// table's initializers. OPT_COUNT is the total number of options.
enum {
    OPT_HELP_H = 0,          // -h / --help
    OPT_HELP_QUESTION_MARK,  // -?
    OPT_MODE,                // -m / --mode
    OPT_TRIE_TYPE,           //      --trie-type
    OPT_VERSION,             // -V / --version
    OPT_DESTDIR,             // -d / --destdir
    OPT_ALL,                 //      --all
    OPT_INDEX,               //      --index
    OPT_COPYRIGHT,           // -c / --copyright
    OPT_VERBOSE,             // -v / --verbose
    OPT_QUIET,               // -q / --quiet

    OPT_COUNT
};
964 
965 #define UOPTION_MODE UOPTION_DEF("mode", 'm', UOPT_REQUIRES_ARG)
966 #define UOPTION_TRIE_TYPE UOPTION_DEF("trie-type", '\1', UOPT_REQUIRES_ARG)
967 #define UOPTION_ALL UOPTION_DEF("all", '\1', UOPT_NO_ARG)
968 #define UOPTION_INDEX UOPTION_DEF("index", '\1', UOPT_NO_ARG)
969 
970 static UOption options[]={
971     UOPTION_HELP_H,
972     UOPTION_HELP_QUESTION_MARK,
973     UOPTION_MODE,
974     UOPTION_TRIE_TYPE,
975     UOPTION_VERSION,
976     UOPTION_DESTDIR,
977     UOPTION_ALL,
978     UOPTION_INDEX,
979     UOPTION_COPYRIGHT,
980     UOPTION_VERBOSE,
981     UOPTION_QUIET,
982 };
983 
/**
 * Prints the command-line usage summary of this tool.
 *
 * @param stdfile stream that receives the text
 * @param program program name substituted into the "usage:" line
 */
void printHelp(FILE* stdfile, const char* program) {
  // Only the first line depends on the program name; everything after it is
  // constant text and can be written verbatim.
  fprintf(stdfile, "usage: %s -m mode [-options] [--all | properties...]\n", program);
  fputs("\tdump Unicode property data to .toml files\n"
        "options:\n"
        "\t-h or -? or --help  this usage text\n"
        "\t-V or --version     show a version message\n"
        "\t-m or --mode        mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n"
        "\t      --trie-type   set the trie type (small or fast, default small)\n"
        "\t-d or --destdir     destination directory, followed by the path\n"
        "\t      --all         write out all properties known to icuexportdata\n"
        "\t      --index       write an _index.toml summarizing all data exported\n"
        "\t-c or --copyright   include a copyright notice\n"
        "\t-v or --verbose     Turn on verbose output\n"
        "\t-q or --quiet       do not display warnings and progress\n",
        stdfile);
}
1001 
exportUprops(int argc,char * argv[])1002 int exportUprops(int argc, char* argv[]) {
1003     // Load list of Unicode properties
1004     std::vector<const char*> propNames;
1005     for (int i=1; i<argc; i++) {
1006         propNames.push_back(argv[i]);
1007     }
1008     if (options[OPT_ALL].doesOccur) {
1009         int i = UCHAR_BINARY_START;
1010         while (true) {
1011             if (i == UCHAR_BINARY_LIMIT) {
1012                 i = UCHAR_INT_START;
1013             }
1014             if (i == UCHAR_INT_LIMIT) {
1015                 i = UCHAR_SCRIPT_EXTENSIONS;
1016             }
1017             if (i == UCHAR_SCRIPT_EXTENSIONS + 1) {
1018                 break;
1019             }
1020             UProperty uprop = static_cast<UProperty>(i);
1021             const char* propName = u_getPropertyName(uprop, U_SHORT_PROPERTY_NAME);
1022             if (propName == NULL) {
1023                 propName = u_getPropertyName(uprop, U_LONG_PROPERTY_NAME);
1024                 if (propName != NULL && VERBOSE) {
1025                     std::cerr << "Note: falling back to long name for: " << propName << std::endl;
1026                 }
1027             }
1028             if (propName != NULL) {
1029                 propNames.push_back(propName);
1030             } else {
1031                 std::cerr << "Warning: Could not find name for: " << uprop << std::endl;
1032             }
1033             i++;
1034         }
1035     }
1036 
1037     if (propNames.empty()
1038             || options[OPT_HELP_H].doesOccur
1039             || options[OPT_HELP_QUESTION_MARK].doesOccur
1040             || !options[OPT_MODE].doesOccur) {
1041         FILE *stdfile=argc<0 ? stderr : stdout;
1042         fprintf(stdfile,
1043             "usage: %s -m uprops [-options] [--all | properties...]\n"
1044             "\tdump Unicode property data to .toml files\n"
1045             "options:\n"
1046             "\t-h or -? or --help  this usage text\n"
1047             "\t-V or --version     show a version message\n"
1048             "\t-m or --mode        mode: currently only 'uprops', but more may be added\n"
1049             "\t      --trie-type   set the trie type (small or fast, default small)\n"
1050             "\t-d or --destdir     destination directory, followed by the path\n"
1051             "\t      --all         write out all properties known to icuexportdata\n"
1052             "\t      --index       write an _index.toml summarizing all data exported\n"
1053             "\t-c or --copyright   include a copyright notice\n"
1054             "\t-v or --verbose     Turn on verbose output\n"
1055             "\t-q or --quiet       do not display warnings and progress\n",
1056             argv[0]);
1057         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1058     }
1059 
1060     const char* mode = options[OPT_MODE].value;
1061     if (uprv_strcmp(mode, "uprops") != 0) {
1062         fprintf(stderr, "Invalid option for --mode (must be uprops)\n");
1063         return U_ILLEGAL_ARGUMENT_ERROR;
1064     }
1065 
1066     if (options[OPT_TRIE_TYPE].doesOccur) {
1067         if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1068             trieType = UCPTRIE_TYPE_FAST;
1069         } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1070             trieType = UCPTRIE_TYPE_SMALL;
1071         } else {
1072             fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1073             return U_ILLEGAL_ARGUMENT_ERROR;
1074         }
1075     }
1076 
1077     for (const char* propName : propNames) {
1078         UProperty propEnum = u_getPropertyEnum(propName);
1079         if (propEnum == UCHAR_INVALID_CODE) {
1080             std::cerr << "Error: Invalid property alias: " << propName << std::endl;
1081             return U_ILLEGAL_ARGUMENT_ERROR;
1082         }
1083 
1084         FILE* f = prepareOutputFile(propName);
1085 
1086         UVersionInfo versionInfo;
1087         u_getUnicodeVersion(versionInfo);
1088         char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1089         u_versionToString(versionInfo, uvbuf);
1090         fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1091             U_ICU_VERSION,
1092             uvbuf);
1093 
1094         if (propEnum < UCHAR_BINARY_LIMIT) {
1095             dumpBinaryProperty(propEnum, f);
1096         } else if (UCHAR_INT_START <= propEnum && propEnum <= UCHAR_INT_LIMIT) {
1097             dumpEnumeratedProperty(propEnum, f);
1098         } else if (propEnum == UCHAR_SCRIPT_EXTENSIONS) {
1099             dumpScriptExtensions(f);
1100         } else {
1101             std::cerr << "Don't know how to write property: " << propEnum << std::endl;
1102             return U_INTERNAL_PROGRAM_ERROR;
1103         }
1104 
1105         fclose(f);
1106     }
1107 
1108     if (options[OPT_INDEX].doesOccur) {
1109         FILE* f = prepareOutputFile("_index");
1110         fprintf(f, "index = [\n");
1111         for (const char* propName : propNames) {
1112             // At this point, propName is a valid property name, so it should be alphanum ASCII
1113             fprintf(f, "  { filename=\"%s.toml\" },\n", propName);
1114         }
1115         fprintf(f, "]\n");
1116         fclose(f);
1117     }
1118 
1119     return 0;
1120 }
1121 
1122 struct AddRangeHelper {
1123     UMutableCPTrie* ucptrie;
1124 };
1125 
1126 static UBool U_CALLCONV
addRangeToUCPTrie(const void * context,UChar32 start,UChar32 end,uint32_t value)1127 addRangeToUCPTrie(const void* context, UChar32 start, UChar32 end, uint32_t value) {
1128     IcuToolErrorCode status("addRangeToUCPTrie");
1129     UMutableCPTrie* ucptrie = ((const AddRangeHelper*) context)->ucptrie;
1130     umutablecptrie_setRange(ucptrie, start, end, value, status);
1131     handleError(status, "setRange");
1132 
1133     return true;
1134 }
1135 
exportCase(int argc,char * argv[])1136 int exportCase(int argc, char* argv[]) {
1137     if (argc > 1) {
1138         fprintf(stderr, "ucase mode does not expect additional arguments\n");
1139         return U_ILLEGAL_ARGUMENT_ERROR;
1140     }
1141     (void) argv; // Suppress unused variable warning
1142 
1143     IcuToolErrorCode status("icuexportdata");
1144     LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
1145     handleError(status, "exportCase");
1146 
1147     int32_t exceptionsLength, unfoldLength;
1148     const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength);
1149     const UTrie2* caseTrie = &caseProps->trie;
1150 
1151     AddRangeHelper helper = { builder.getAlias() };
1152     utrie2_enum(caseTrie, NULL, addRangeToUCPTrie, &helper);
1153 
1154     UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16;
1155     LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1156         builder.getAlias(),
1157         trieType,
1158         width,
1159         status));
1160     handleError(status, "exportCase");
1161 
1162     FILE* f = prepareOutputFile("ucase");
1163 
1164     UVersionInfo versionInfo;
1165     u_getUnicodeVersion(versionInfo);
1166     char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1167     u_versionToString(versionInfo, uvbuf);
1168     fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1169             U_ICU_VERSION,
1170             uvbuf);
1171 
1172     fputs("[ucase.code_point_trie]\n", f);
1173     usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1174     fputs("\n", f);
1175 
1176     const char* indent = "  ";
1177     const char* suffix = "\n]\n";
1178 
1179     fputs("[ucase.exceptions]\n", f);
1180     const char* exceptionsPrefix = "exceptions = [\n  ";
1181     int32_t exceptionsWidth = 16;
1182     usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth,
1183                     exceptionsLength, indent, suffix);
1184     fputs("\n", f);
1185 
1186     fputs("[ucase.unfold]\n", f);
1187     const char* unfoldPrefix = "unfold = [\n  ";
1188     int32_t unfoldWidth = 16;
1189     usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth,
1190                     unfoldLength, indent, suffix);
1191 
1192     return 0;
1193 }
1194 
1195 #if !UCONFIG_NO_NORMALIZATION
1196 
exportNorm()1197 int exportNorm() {
1198     IcuToolErrorCode status("icuexportdata: exportNorm");
1199     USet* backwardCombiningStarters = uset_openEmpty();
1200     writeCanonicalCompositions(backwardCombiningStarters);
1201 
1202     std::vector<uint16_t> storage16;
1203     std::vector<uint32_t> storage32;
1204 
1205     // Note: the USets are not exported. They are only used to check that a new
1206     // Unicode version doesn't violate expectations that are hard-coded in ICU4X.
1207     USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
1208     USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1209     std::vector<PendingDescriptor> nfdPendingTrieInsertions;
1210     UChar32 nfdBound = 0x10FFFF;
1211     UChar32 nfcBound = 0x10FFFF;
1212     computeDecompositions("nfd",
1213                           backwardCombiningStarters,
1214                           storage16,
1215                           storage32,
1216                           nfdDecompositionStartsWithNonStarter,
1217                           nfdDecompositionStartsWithBackwardCombiningStarter,
1218                           nfdPendingTrieInsertions,
1219                           nfdBound,
1220                           nfcBound);
1221     if (!(nfdBound == 0xC0 && nfcBound == 0x300)) {
1222         // Unexpected bounds for NFD/NFC.
1223         status.set(U_INTERNAL_PROGRAM_ERROR);
1224         handleError(status, "exportNorm");
1225     }
1226 
1227     uint32_t baseSize16 = storage16.size();
1228     uint32_t baseSize32 = storage32.size();
1229 
1230     USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty();
1231     USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1232     std::vector<PendingDescriptor> nfkdPendingTrieInsertions;
1233     UChar32 nfkdBound = 0x10FFFF;
1234     UChar32 nfkcBound = 0x10FFFF;
1235     computeDecompositions("nfkd",
1236                           backwardCombiningStarters,
1237                           storage16,
1238                           storage32,
1239                           nfkdDecompositionStartsWithNonStarter,
1240                           nfkdDecompositionStartsWithBackwardCombiningStarter,
1241                           nfkdPendingTrieInsertions,
1242                           nfkdBound,
1243                           nfkcBound);
1244     if (!(nfkdBound <= 0xC0 && nfkcBound <= 0x300)) {
1245         status.set(U_INTERNAL_PROGRAM_ERROR);
1246         handleError(status, "exportNorm");
1247     }
1248     if (nfkcBound > 0xC0) {
1249         if (nfkdBound != 0xC0) {
1250             status.set(U_INTERNAL_PROGRAM_ERROR);
1251             handleError(status, "exportNorm");
1252         }
1253     } else {
1254         if (nfkdBound != nfkcBound) {
1255             status.set(U_INTERNAL_PROGRAM_ERROR);
1256             handleError(status, "exportNorm");
1257         }
1258     }
1259 
1260     USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty();
1261     USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1262     std::vector<PendingDescriptor> uts46PendingTrieInsertions;
1263     UChar32 uts46dBound = 0x10FFFF;
1264     UChar32 uts46Bound = 0x10FFFF;
1265     computeDecompositions("uts46d",
1266                           backwardCombiningStarters,
1267                           storage16,
1268                           storage32,
1269                           uts46DecompositionStartsWithNonStarter,
1270                           uts46DecompositionStartsWithBackwardCombiningStarter,
1271                           uts46PendingTrieInsertions,
1272                           uts46dBound,
1273                           uts46Bound);
1274     if (!(uts46dBound <= 0xC0 && uts46Bound <= 0x300)) {
1275         status.set(U_INTERNAL_PROGRAM_ERROR);
1276         handleError(status, "exportNorm");
1277     }
1278     if (uts46Bound > 0xC0) {
1279         if (uts46dBound != 0xC0) {
1280             status.set(U_INTERNAL_PROGRAM_ERROR);
1281             handleError(status, "exportNorm");
1282         }
1283     } else {
1284         if (uts46dBound != uts46Bound) {
1285             status.set(U_INTERNAL_PROGRAM_ERROR);
1286             handleError(status, "exportNorm");
1287         }
1288     }
1289 
1290     uint32_t supplementSize16 = storage16.size() - baseSize16;
1291     uint32_t supplementSize32 = storage32.size() - baseSize32;
1292 
1293     writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, char16_t(nfcBound));
1294     writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, char16_t(nfkcBound));
1295     writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, char16_t(uts46Bound));
1296 
1297     writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32);
1298     writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32);
1299 
1300     uset_close(nfdDecompositionStartsWithNonStarter);
1301     uset_close(nfkdDecompositionStartsWithNonStarter);
1302     uset_close(uts46DecompositionStartsWithNonStarter);
1303 
1304     uset_close(nfdDecompositionStartsWithBackwardCombiningStarter);
1305     uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter);
1306     uset_close(uts46DecompositionStartsWithBackwardCombiningStarter);
1307 
1308     uset_close(backwardCombiningStarters);
1309     handleError(status, "exportNorm");
1310     return 0;
1311 }
1312 
1313 #endif // !UCONFIG_NO_NORMALIZATION
1314 
main(int argc,char * argv[])1315 int main(int argc, char* argv[]) {
1316     U_MAIN_INIT_ARGS(argc, argv);
1317 
1318     /* preset then read command line options */
1319     options[OPT_DESTDIR].value=u_getDataDirectory();
1320     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
1321 
1322     if(options[OPT_VERSION].doesOccur) {
1323         printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n",
1324                U_ICU_DATA_VERSION);
1325         printf("%s\n", U_COPYRIGHT_STRING);
1326         exit(0);
1327     }
1328 
1329     /* error handling, printing usage message */
1330     if(argc<0) {
1331         fprintf(stderr,
1332             "error in command line argument \"%s\"\n",
1333             argv[-argc]);
1334     }
1335 
1336     if (argc < 0
1337             || options[OPT_HELP_H].doesOccur
1338             || options[OPT_HELP_QUESTION_MARK].doesOccur
1339             || !options[OPT_MODE].doesOccur) {
1340         FILE *stdfile=argc<0 ? stderr : stdout;
1341         printHelp(stdfile, argv[0]);
1342         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1343     }
1344 
1345     /* get the options values */
1346     haveCopyright = options[OPT_COPYRIGHT].doesOccur;
1347     destdir = options[OPT_DESTDIR].value;
1348     VERBOSE = options[OPT_VERBOSE].doesOccur;
1349     QUIET = options[OPT_QUIET].doesOccur;
1350 
1351     if (options[OPT_TRIE_TYPE].doesOccur) {
1352         if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1353             trieType = UCPTRIE_TYPE_FAST;
1354         } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1355             trieType = UCPTRIE_TYPE_SMALL;
1356         } else {
1357             fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1358             return U_ILLEGAL_ARGUMENT_ERROR;
1359         }
1360     }
1361 
1362     const char* mode = options[OPT_MODE].value;
1363     if (uprv_strcmp(mode, "norm") == 0) {
1364 #if !UCONFIG_NO_NORMALIZATION
1365         return exportNorm();
1366 #else
1367     fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n");
1368     return U_ILLEGAL_ARGUMENT_ERROR;
1369 #endif
1370     }
1371     if (uprv_strcmp(mode, "uprops") == 0) {
1372         return exportUprops(argc, argv);
1373     } else if (uprv_strcmp(mode, "ucase") == 0) {
1374         return exportCase(argc, argv);
1375     }
1376 
1377     fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n");
1378     return U_ILLEGAL_ARGUMENT_ERROR;
1379 }
1380