1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include <cstddef>
5 #include <cstdint>
6 #include <cstdio>
7 #include <iostream>
8 #include <unicode/localpointer.h>
9 #include <unicode/umachine.h>
10 #include <unicode/unistr.h>
11 #include <unicode/urename.h>
12 #include <unicode/uset.h>
13 #include <vector>
14 #include <algorithm>
15 #include "toolutil.h"
16 #include "uoptions.h"
17 #include "cmemory.h"
18 #include "charstr.h"
19 #include "cstring.h"
20 #include "unicode/uchar.h"
21 #include "unicode/errorcode.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uscript.h"
24 #include "unicode/putil.h"
25 #include "unicode/umutablecptrie.h"
26 #include "unicode/ucharstriebuilder.h"
27 #include "ucase.h"
28 #include "unicode/normalizer2.h"
29 #include "normalizer2impl.h"
30 #include "writesrc.h"
31
32 U_NAMESPACE_USE
33
34 /*
35 * Global - verbosity
36 */
37 UBool VERBOSE = false;
38 UBool QUIET = false;
39
40 UBool haveCopyright = true;
41 UCPTrieType trieType = UCPTRIE_TYPE_SMALL;
42 const char* destdir = "";
43
// Mask constants for modified values in the Script CodePointTrie; values are
// logically 12 bits wide. The masks occupy bits 11..10 and are OR-ed with an
// index into the Script_Extensions companion array (see dumpScriptExtensions()).
// They are fixed bit patterns, so they are declared const to prevent
// accidental modification.
const int16_t DATAEXPORT_SCRIPT_X_WITH_COMMON = 0x0400;    // bits 11..10 == 1: Script=Common
const int16_t DATAEXPORT_SCRIPT_X_WITH_INHERITED = 0x0800; // bits 11..10 == 2: Script=Inherited
const int16_t DATAEXPORT_SCRIPT_X_WITH_OTHER = 0x0c00;     // bits 11..10 == 3: any other Script
48
// TODO(ICU-21821): Replace this with a call to a library function
//
// Code points that dumpScriptExtensions() visits in order to build the
// Script_Extensions companion array and to rewrite their Script trie values.
// The table is read-only lookup data, so it is declared const.
const int32_t scxCodePoints[] = {
    7415, 7377, 7380, 7387, 7390, 7391, 7394, 7395, 7396, 7397,
    7398, 7399, 7400, 7403, 7404, 7406, 7407, 7408, 7409, 113824,
    113825, 113826, 113827, 834, 837, 7616, 7617, 12294, 12350, 12351,
    12688, 12689, 12690, 12691, 12692, 12693, 12694, 12695, 12696, 12697,
    12698, 12699, 12700, 12701, 12702, 12703, 12736, 12737, 12738, 12739,
    12740, 12741, 12742, 12743, 12744, 12745, 12746, 12747, 12748, 12749,
    12750, 12751, 12752, 12753, 12754, 12755, 12756, 12757, 12758, 12759,
    12760, 12761, 12762, 12763, 12764, 12765, 12766, 12767, 12768, 12769,
    12770, 12771, 12832, 12833, 12834, 12835, 12836, 12837, 12838, 12839,
    12840, 12841, 12842, 12843, 12844, 12845, 12846, 12847, 12848, 12849,
    12850, 12851, 12852, 12853, 12854, 12855, 12856, 12857, 12858, 12859,
    12860, 12861, 12862, 12863, 12864, 12865, 12866, 12867, 12868, 12869,
    12870, 12871, 12928, 12929, 12930, 12931, 12932, 12933, 12934, 12935,
    12936, 12937, 12938, 12939, 12940, 12941, 12942, 12943, 12944, 12945,
    12946, 12947, 12948, 12949, 12950, 12951, 12952, 12953, 12954, 12955,
    12956, 12957, 12958, 12959, 12960, 12961, 12962, 12963, 12964, 12965,
    12966, 12967, 12968, 12969, 12970, 12971, 12972, 12973, 12974, 12975,
    12976, 12992, 12993, 12994, 12995, 12996, 12997, 12998, 12999, 13000,
    13001, 13002, 13003, 13055, 13144, 13145, 13146, 13147, 13148, 13149,
    13150, 13151, 13152, 13153, 13154, 13155, 13156, 13157, 13158, 13159,
    13160, 13161, 13162, 13163, 13164, 13165, 13166, 13167, 13168, 13179,
    13180, 13181, 13182, 13183, 13280, 13281, 13282, 13283, 13284, 13285,
    13286, 13287, 13288, 13289, 13290, 13291, 13292, 13293, 13294, 13295,
    13296, 13297, 13298, 13299, 13300, 13301, 13302, 13303, 13304, 13305,
    13306, 13307, 13308, 13309, 13310, 119648, 119649, 119650, 119651, 119652,
    119653, 119654, 119655, 119656, 119657, 119658, 119659, 119660, 119661, 119662,
    119663, 119664, 119665, 127568, 127569, 867, 868, 869, 870, 871,
    872, 873, 874, 875, 876, 877, 878, 879, 7418, 7674,
    66272, 66273, 66274, 66275, 66276, 66277, 66278, 66279, 66280, 66281,
    66282, 66283, 66284, 66285, 66286, 66287, 66288, 66289, 66290, 66291,
    66292, 66293, 66294, 66295, 66296, 66297, 66298, 66299, 1748, 64830,
    64831, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619,
    1620, 1621, 1648, 65010, 65021, 7381, 7382, 7384, 7393, 7402,
    7405, 7413, 7414, 43249, 12330, 12331, 12332, 12333, 43471, 65794,
    65847, 65848, 65849, 65850, 65851, 65852, 65853, 65854, 65855, 1156,
    1159, 11843, 42607, 1157, 1158, 1155, 7672, 7379, 7411, 7416,
    7417, 7401, 7383, 7385, 7388, 7389, 7392, 43251, 4347, 3046,
    3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3056,
    3057, 3058, 3059, 70401, 70403, 70459, 70460, 73680, 73681, 73683,
    2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799,
    2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671,
    42752, 42753, 42754, 42755, 42756, 42757, 42758, 42759, 12337, 12338,
    12339, 12340, 12341, 12441, 12442, 12443, 12444, 12448, 12540, 65392,
    65438, 65439, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309,
    3310, 3311, 8239, 68338, 6146, 6147, 6149, 1564, 1632, 1633,
    1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 2534, 2535,
    2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 4160, 4161,
    4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 65792, 65793,
    65799, 65800, 65801, 65802, 65803, 65804, 65805, 65806, 65807, 65808,
    65809, 65810, 65811, 65812, 65813, 65814, 65815, 65816, 65817, 65818,
    65819, 65820, 65821, 65822, 65823, 65824, 65825, 65826, 65827, 65828,
    65829, 65830, 65831, 65832, 65833, 65834, 65835, 65836, 65837, 65838,
    65839, 65840, 65841, 65842, 65843, 7412, 8432, 12348, 12349, 43310,
    7376, 7378, 5941, 5942, 2406, 2407, 2408, 2409, 2410, 2411,
    2412, 2413, 2414, 2415, 12291, 12307, 12316, 12317, 12318, 12319,
    12336, 12343, 65093, 65094, 1548, 1563, 12289, 12290, 12296, 12297,
    12298, 12299, 12300, 12301, 12302, 12303, 12304, 12305, 12308, 12309,
    12310, 12311, 12312, 12313, 12314, 12315, 12539, 65377, 65378, 65379,
    65380, 65381, 7386, 1567, 7410, 1600, 43062, 43063, 43064, 43065,
    2386, 2385, 43059, 43060, 43061, 43056, 43057, 43058, 2404, 2405
};
112
handleError(ErrorCode & status,const char * context)113 void handleError(ErrorCode& status, const char* context) {
114 if (status.isFailure()) {
115 std::cerr << "Error: " << context << ": " << status.errorName() << std::endl;
116 exit(status.reset());
117 }
118 }
119
120 class PropertyValueNameGetter : public ValueNameGetter {
121 public:
PropertyValueNameGetter(UProperty prop)122 PropertyValueNameGetter(UProperty prop) : property(prop) {}
123 ~PropertyValueNameGetter() override;
getName(uint32_t value)124 const char *getName(uint32_t value) override {
125 return u_getPropertyValueName(property, value, U_SHORT_PROPERTY_NAME);
126 }
127
128 private:
129 UProperty property;
130 };
131
~PropertyValueNameGetter()132 PropertyValueNameGetter::~PropertyValueNameGetter() {}
133
dumpBinaryProperty(UProperty uproperty,FILE * f)134 void dumpBinaryProperty(UProperty uproperty, FILE* f) {
135 IcuToolErrorCode status("icuexportdata: dumpBinaryProperty");
136 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
137 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
138 const USet* uset = u_getBinaryPropertySet(uproperty, status);
139 handleError(status, fullPropName);
140
141 fputs("[[binary_property]]\n", f);
142 fprintf(f, "long_name = \"%s\"\n", fullPropName);
143 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
144 usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML);
145 }
146
dumpEnumeratedProperty(UProperty uproperty,FILE * f)147 void dumpEnumeratedProperty(UProperty uproperty, FILE* f) {
148 IcuToolErrorCode status("icuexportdata: dumpEnumeratedProperty");
149 const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME);
150 const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME);
151 const UCPMap* umap = u_getIntPropertyMap(uproperty, status);
152 handleError(status, fullPropName);
153
154 fputs("[[enum_property]]\n", f);
155 fprintf(f, "long_name = \"%s\"\n", fullPropName);
156 if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName);
157 PropertyValueNameGetter valueNameGetter(uproperty);
158 usrc_writeUCPMap(f, umap, &valueNameGetter, UPRV_TARGET_SYNTAX_TOML);
159 fputs("\n", f);
160
161 U_ASSERT(u_getIntPropertyMinValue(uproperty) >= 0);
162 int32_t maxValue = u_getIntPropertyMaxValue(uproperty);
163 U_ASSERT(maxValue >= 0);
164 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32;
165 if (maxValue <= 0xff) {
166 width = UCPTRIE_VALUE_BITS_8;
167 } else if (maxValue <= 0xffff) {
168 width = UCPTRIE_VALUE_BITS_16;
169 }
170 LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(umap, status));
171 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
172 builder.getAlias(),
173 trieType,
174 width,
175 status));
176 handleError(status, fullPropName);
177
178 fputs("[enum_property.code_point_trie]\n", f);
179 usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
180 }
181
dumpScriptExtensions(FILE * f)182 void dumpScriptExtensions(FILE* f) {
183 IcuToolErrorCode status("icuexportdata: dumpScriptExtensions");
184
185 fputs("[[script_extensions]]\n", f);
186 const char* scxFullPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_LONG_PROPERTY_NAME);
187 const char* scxShortPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_SHORT_PROPERTY_NAME);
188 fprintf(f, "long_name = \"%s\"\n", scxFullPropName);
189 if (scxShortPropName) fprintf(f, "short_name = \"%s\"\n", scxShortPropName);
190
191 // We want to use 16 bits for our exported trie of sc/scx data because we
192 // need 12 bits to match the 12 bits of data stored for sc/scx in the trie
193 // in the uprops.icu data file.
194 UCPTrieValueWidth scWidth = UCPTRIE_VALUE_BITS_16;
195
196 // Create a mutable UCPTrie builder populated with Script property values data.
197 const UCPMap* scInvMap = u_getIntPropertyMap(UCHAR_SCRIPT, status);
198 handleError(status, scxFullPropName);
199 LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(scInvMap, status));
200 handleError(status, scxFullPropName);
201
202 // The values for the output scx companion array.
203 // Invariant is that all subvectors are distinct.
204 std::vector< std::vector<uint16_t> > outputDedupVec;
205
206 // The sc/scx companion array is an array of arrays (of script codes)
207 fputs("script_code_array = [\n", f);
208 for(const UChar32 cp : scxCodePoints) {
209 // Get the Script value
210 uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp);
211 // Get the Script_Extensions value (array of Script codes)
212 const int32_t SCX_ARRAY_CAPACITY = 32;
213 UScriptCode scxValArray[SCX_ARRAY_CAPACITY];
214 int32_t numScripts = uscript_getScriptExtensions(cp, scxValArray, SCX_ARRAY_CAPACITY, status);
215 handleError(status, scxFullPropName);
216
217 // Convert the scx array into a vector
218 std::vector<uint16_t> scxValVec;
219 for(int i = 0; i < numScripts; i++) {
220 scxValVec.push_back(scxValArray[i]);
221 }
222 // Ensure that it is sorted
223 std::sort(scxValVec.begin(), scxValVec.end());
224 // Copy the Script value into the first position of the scx array only
225 // if we have the "other" case (Script value is not Common nor Inherited).
226 // This offers faster access when users want only the Script value.
227 if (scVal != USCRIPT_COMMON && scVal != USCRIPT_INHERITED) {
228 scxValVec.insert(scxValVec.begin(), scVal);
229 }
230
231 // See if there is already an scx value array matching the newly built one.
232 // If there is, then use its index.
233 // If not, then append the new value array.
234 bool isScxValUnique = true;
235 size_t outputIndex = 0;
236 for (outputIndex = 0; outputIndex < outputDedupVec.size(); outputIndex++) {
237 if (outputDedupVec[outputIndex] == scxValVec) {
238 isScxValUnique = false;
239 break;
240 }
241 }
242
243 if (isScxValUnique) {
244 outputDedupVec.push_back(scxValVec);
245 usrc_writeArray(f, " [", scxValVec.data(), 16, scxValVec.size(), " ", "],\n");
246 }
247
248 // We must update the value in the UCPTrie for the code point to contain:
249 // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is
250 // the index into the companion array
251 // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether
252 // 3: other
253 // 2: Script=Inherited
254 // 1: Script=Common
255 // 0: Script=value in 9..0 (N/A because we are in this loop to create the companion array for non-0 cases)
256 uint16_t mask = 0;
257 if (scVal == USCRIPT_COMMON) {
258 mask = DATAEXPORT_SCRIPT_X_WITH_COMMON;
259 } else if (scVal == USCRIPT_INHERITED) {
260 mask = DATAEXPORT_SCRIPT_X_WITH_INHERITED;
261 } else {
262 mask = DATAEXPORT_SCRIPT_X_WITH_OTHER;
263 }
264
265 // The new trie value is the index into the new array with the high order bits set
266 uint32_t newScVal = outputIndex | mask;
267
268 // Update the code point in the mutable trie builder with the trie value
269 umutablecptrie_set(builder.getAlias(), cp, newScVal, status);
270 handleError(status, scxFullPropName);
271 }
272 fputs("]\n\n", f); // Print the TOML close delimiter for the outer array.
273
274 // Convert from mutable trie builder to immutable trie.
275 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
276 builder.getAlias(),
277 trieType,
278 scWidth,
279 status));
280 handleError(status, scxFullPropName);
281
282 fputs("[script_extensions.code_point_trie]\n", f);
283 usrc_writeUCPTrie(f, scxShortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
284 }
285
prepareOutputFile(const char * basename)286 FILE* prepareOutputFile(const char* basename) {
287 IcuToolErrorCode status("icuexportdata");
288 CharString outFileName;
289 if (destdir != nullptr && *destdir != 0) {
290 outFileName.append(destdir, status).ensureEndsWithFileSeparator(status);
291 }
292 outFileName.append(basename, status);
293 outFileName.append(".toml", status);
294 handleError(status, basename);
295
296 FILE* f = fopen(outFileName.data(), "w");
297 if (f == nullptr) {
298 std::cerr << "Unable to open file: " << outFileName.data() << std::endl;
299 exit(U_FILE_ACCESS_ERROR);
300 }
301 if (!QUIET) {
302 std::cout << "Writing to: " << outFileName.data() << std::endl;
303 }
304
305 if (haveCopyright) {
306 usrc_writeCopyrightHeader(f, "#", 2021);
307 }
308 usrc_writeFileNameGeneratedBy(f, "#", basename, "icuexportdata.cpp");
309
310 return f;
311 }
312
313 #if !UCONFIG_NO_NORMALIZATION
314
315 struct PendingDescriptor {
316 UChar32 scalar;
317 uint32_t descriptor;
318 UBool supplementary;
319 };
320
writeCanonicalCompositions(USet * backwardCombiningStarters)321 void writeCanonicalCompositions(USet* backwardCombiningStarters) {
322 IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions");
323 const char* basename = "compositions";
324 FILE* f = prepareOutputFile(basename);
325
326 LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status);
327
328 const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
329 UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
330
331 const Normalizer2* nfc = Normalizer2::getNFCInstance(status);
332 for (UChar32 c = 0; c <= 0x10FFFF; ++c) {
333 if (c >= 0xD800 && c < 0xE000) {
334 // Surrogate
335 continue;
336 }
337 UnicodeString decomposition;
338 if (!nfc->getRawDecomposition(c, decomposition)) {
339 continue;
340 }
341 int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
342 if (len != 2) {
343 continue;
344 }
345 UChar32 starter = utf32[0];
346 UChar32 second = utf32[1];
347 UChar32 composite = nfc->composePair(starter, second);
348 if (composite < 0) {
349 continue;
350 }
351 if (c != composite) {
352 status.set(U_INTERNAL_PROGRAM_ERROR);
353 handleError(status, basename);
354 }
355 if (!u_getCombiningClass(second)) {
356 uset_add(backwardCombiningStarters, second);
357 }
358 if (composite >= 0xAC00 && composite <= 0xD7A3) {
359 // Hangul syllable
360 continue;
361 }
362
363 UnicodeString backward;
364 backward.append(second);
365 backward.append(starter);
366 backwardBuilder->add(backward, int32_t(composite), status);
367 }
368 UnicodeString canonicalCompositionTrie;
369 backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status);
370
371 usrc_writeArray(f, "compositions = [\n ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), " ", "\n]\n");
372 fclose(f);
373 handleError(status, basename);
374 }
375
writeDecompositionTables(const char * basename,const uint16_t * ptr16,size_t len16,const uint32_t * ptr32,size_t len32)376 void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) {
377 FILE* f = prepareOutputFile(basename);
378 usrc_writeArray(f, "scalars16 = [\n ", ptr16, 16, len16, " ", "\n]\n");
379 usrc_writeArray(f, "scalars32 = [\n ", ptr32, 32, len32, " ", "\n]\n");
380 fclose(f);
381 }
382
writeDecompositionData(const char * basename,uint32_t baseSize16,uint32_t baseSize32,uint32_t supplementSize16,USet * uset,USet * reference,const std::vector<PendingDescriptor> & pendingTrieInsertions,char16_t passthroughCap)383 void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, char16_t passthroughCap) {
384 IcuToolErrorCode status("icuexportdata: writeDecompositionData");
385 FILE* f = prepareOutputFile(basename);
386
387 // Zero is a magic number that means the character decomposes to itself.
388 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
389
390 // Iterate backwards to insert lower code points in the trie first in case it matters
391 // for trie block allocation.
392 for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
393 const PendingDescriptor& pending = pendingTrieInsertions[i];
394 uint32_t additional = 0;
395 if (!(pending.descriptor & 0xFFFE0000)) {
396 uint32_t offset = pending.descriptor & 0xFFF;
397 if (!pending.supplementary) {
398 if (offset >= baseSize16) {
399 // This is a offset to supplementary 16-bit data. We have
400 // 16-bit base data and 32-bit base data before. However,
401 // the 16-bit base data length is already part of offset.
402 additional = baseSize32;
403 }
404 } else {
405 if (offset >= baseSize32) {
406 // This is an offset to supplementary 32-bit data. We have 16-bit
407 // base data, 32-bit base data, and 16-bit supplementary data before.
408 // However, the 32-bit base data length is already part
409 // of offset.
410 additional = baseSize16 + supplementSize16;
411 } else {
412 // This is an offset to 32-bit base data. We have 16-bit
413 // base data before.
414 additional = baseSize16;
415 }
416 }
417 if (offset + additional > 0xFFF) {
418 status.set(U_INTERNAL_PROGRAM_ERROR);
419 handleError(status, basename);
420 }
421 }
422 // It turns out it's better to swap the halves compared to the initial
423 // idea in order to put special marker values close to zero so that
424 // an important marker value becomes 1, so it's efficient to compare
425 // "1 or 0". Unfortunately, going through all the code to swap
426 // things is too error prone, so let's do the swapping here in one
427 // place.
428 uint32_t oldTrieValue = pending.descriptor + additional;
429 uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16);
430 umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status);
431 }
432 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
433 builder.getAlias(),
434 trieType,
435 UCPTRIE_VALUE_BITS_32,
436 status));
437 handleError(status, basename);
438
439 if (reference) {
440 if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) {
441 // NFD expectations don't hold. The set must not contain the half-width
442 // kana voicing marks and must contain iota subscript.
443 status.set(U_INTERNAL_PROGRAM_ERROR);
444 handleError(status, basename);
445 }
446
447 USet* halfWidthVoicing = uset_openEmpty();
448 uset_add(halfWidthVoicing, 0xFF9E);
449 uset_add(halfWidthVoicing, 0xFF9F);
450
451 USet* iotaSubscript = uset_openEmpty();
452 uset_add(iotaSubscript, 0x0345);
453
454 uint8_t flags = 0;
455
456 USet* halfWidthCheck = uset_cloneAsThawed(uset);
457 uset_removeAll(halfWidthCheck, reference);
458 if (uset_equals(halfWidthCheck, halfWidthVoicing)) {
459 flags |= 1;
460 } else if (!uset_isEmpty(halfWidthCheck)) {
461 // The result was neither empty nor contained exactly
462 // the two half-width voicing marks. The ICU4X
463 // normalizer doesn't know how to deal with this case.
464 status.set(U_INTERNAL_PROGRAM_ERROR);
465 handleError(status, basename);
466 }
467 uset_close(halfWidthCheck);
468
469 USet* iotaCheck = uset_cloneAsThawed(reference);
470 uset_removeAll(iotaCheck, uset);
471 if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) {
472 // The result was neither empty nor contained exactly
473 // the iota subscript. The ICU4X normalizer doesn't
474 // know how to deal with this case.
475 status.set(U_INTERNAL_PROGRAM_ERROR);
476 handleError(status, basename);
477 }
478 uset_close(halfWidthCheck);
479
480 uset_close(iotaSubscript);
481 uset_close(halfWidthVoicing);
482
483 fprintf(f, "flags = 0x%X\n", flags);
484 fprintf(f, "cap = 0x%X\n", passthroughCap);
485 }
486 fprintf(f, "[trie]\n");
487 usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
488 fclose(f);
489 handleError(status, basename);
490 }
491
// Trie marker: the NFKD form of U+FDFA.
constexpr int32_t FDFA_MARKER = 3;

// Trie marker: a character whose decomposition begins with a non-starter
// and differs from the character itself.
constexpr int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2;

// Trie marker: a starter that decomposes to itself but may combine
// backwards under canonical composition.
constexpr int32_t BACKWARD_COMBINING_STARTER_MARKER = 1;

/// Flags a complex decomposition that does not round-trip
/// under re-composition.
constexpr uint32_t NON_ROUND_TRIP_MARKER = 1;
506
permissibleBmpPair(UBool knownToRoundTrip,UChar32 c,UChar32 second)507 UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) {
508 if (knownToRoundTrip) {
509 return true;
510 }
511 // Nuktas, Hebrew presentation forms and polytonic Greek with oxia
512 // are special-cased in ICU4X.
513 if (c >= 0xFB1D && c <= 0xFB4E) {
514 // Hebrew presentation forms
515 return true;
516 }
517 if (c >= 0x1F71 && c <= 0x1FFB) {
518 // Polytonic Greek with oxia
519 return true;
520 }
521 if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) {
522 // Nukta
523 return true;
524 }
525 // To avoid more branchiness, 4 characters that decompose to
526 // a BMP starter followed by a BMP non-starter are excluded
527 // from being encoded directly into the trie value and are
528 // handled as complex decompositions instead. These are:
529 // U+0F76 TIBETAN VOWEL SIGN VOCALIC R
530 // U+0F78 TIBETAN VOWEL SIGN VOCALIC L
531 // U+212B ANGSTROM SIGN
532 // U+2ADC FORKING
533 return false;
534 }
535
536 // Computes data for canonical decompositions
computeDecompositions(const char * basename,const USet * backwardCombiningStarters,std::vector<uint16_t> & storage16,std::vector<uint32_t> & storage32,USet * decompositionStartsWithNonStarter,USet * decompositionStartsWithBackwardCombiningStarter,std::vector<PendingDescriptor> & pendingTrieInsertions,UChar32 & decompositionPassthroughBound,UChar32 & compositionPassthroughBound)537 void computeDecompositions(const char* basename,
538 const USet* backwardCombiningStarters,
539 std::vector<uint16_t>& storage16,
540 std::vector<uint32_t>& storage32,
541 USet* decompositionStartsWithNonStarter,
542 USet* decompositionStartsWithBackwardCombiningStarter,
543 std::vector<PendingDescriptor>& pendingTrieInsertions,
544 UChar32& decompositionPassthroughBound,
545 UChar32& compositionPassthroughBound) {
546 IcuToolErrorCode status("icuexportdata: computeDecompositions");
547 const Normalizer2* mainNormalizer;
548 const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status);
549 const Normalizer2* nfcNormalizer = Normalizer2::getNFCInstance(status);
550 FILE* f = NULL;
551 std::vector<uint32_t> nonRecursive32;
552 LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status));
553
554 if (uprv_strcmp(basename, "nfkd") == 0) {
555 mainNormalizer = Normalizer2::getNFKDInstance(status);
556 } else if (uprv_strcmp(basename, "uts46d") == 0) {
557 mainNormalizer = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status);
558 } else {
559 mainNormalizer = nfdNormalizer;
560 f = prepareOutputFile("decompositionex");
561 }
562
563 // Max length as of Unicode 14 is 4 for NFD. For NFKD the max
564 // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB).
565 const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9;
566 const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8;
567 const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
568 UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
569 const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2;
570 UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE];
571
572 // Iterate over all scalar values excluding Hangul syllables.
573 //
574 // We go backwards in order to better find overlapping decompositions.
575 //
576 // As of Unicode 14:
577 // Iterate forward without overlap search:
578 // nfd: 16 size: 896, 32 size: 173
579 // nfkd: 16 size: 3854, 32 size: 179
580 //
581 // Iterate forward with overlap search:
582 // nfd: 16 size: 888, 32 size: 173
583 // nfkd: 16 size: 3266, 32 size: 179
584 //
585 // Iterate backward with overlap search:
586 // nfd: 16 size: 776, 32 size: 173
587 // nfkd: 16 size: 2941, 32 size: 179
588 //
589 // UChar32 is signed!
590 for (UChar32 c = 0x10FFFF; c >= 0; --c) {
591 if (c >= 0xAC00 && c <= 0xD7A3) {
592 // Hangul syllable
593 continue;
594 }
595 if (c >= 0xD800 && c < 0xE000) {
596 // Surrogate
597 continue;
598 }
599 UnicodeString src;
600 UnicodeString dst;
601 // True if we're building non-NFD or we're building NFD but
602 // the `c` round trips to NFC.
603 // False if we're building NFD and `c` does not round trip to NFC.
604 UBool nonNfdOrRoundTrips = true;
605 src.append(c);
606 if (mainNormalizer != nfdNormalizer) {
607 UnicodeString inter;
608 mainNormalizer->normalize(src, inter, status);
609 nfdNormalizer->normalize(inter, dst, status);
610 } else {
611 nfdNormalizer->normalize(src, dst, status);
612 UnicodeString nfc;
613 nfcNormalizer->normalize(dst, nfc, status);
614 nonNfdOrRoundTrips = (src == nfc);
615 }
616 int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
617 if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
618 // Characters that normalize to nothing or to U+FFFD (without the
619 // input being U+FFFD) in ICU4C's UTS 46 normalization normalize
            // as in NFD in ICU4X's UTS 46 normalization in the interest
621 // of data size and ICU4X's normalizer being unable to handle
622 // normalizing to nothing.
623 // When UTS 46 is implemented on top of ICU4X, a preprocessing
624 // step is supposed to remove these characters before the
625 // normalization step.
626 if (uprv_strcmp(basename, "uts46d") != 0) {
627 status.set(U_INTERNAL_PROGRAM_ERROR);
628 handleError(status, basename);
629 }
630 nfdNormalizer->normalize(src, dst, status);
631 len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status);
632 if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) {
633 status.set(U_INTERNAL_PROGRAM_ERROR);
634 handleError(status, basename);
635 }
636 }
637 if (len > DECOMPOSITION_BUFFER_SIZE) {
638 status.set(U_INTERNAL_PROGRAM_ERROR);
639 handleError(status, basename);
640 }
641 uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]);
642 bool specialNonStarterDecomposition = false;
643 bool startsWithBackwardCombiningStarter = false;
644 if (firstCombiningClass) {
645 decompositionPassthroughBound = c;
646 compositionPassthroughBound = c;
647 uset_add(decompositionStartsWithNonStarter, c);
648 if (src != dst) {
649 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) {
650 specialNonStarterDecomposition = true;
651 } else {
652 // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
653 status.set(U_INTERNAL_PROGRAM_ERROR);
654 handleError(status, basename);
655 }
656 }
657 } else if (uset_contains(backwardCombiningStarters, utf32[0])) {
658 compositionPassthroughBound = c;
659 startsWithBackwardCombiningStarter = true;
660 uset_add(decompositionStartsWithBackwardCombiningStarter, c);
661 }
662 if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) {
663 status.set(U_INTERNAL_PROGRAM_ERROR);
664 handleError(status, basename);
665 }
666 if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) {
667 status.set(U_INTERNAL_PROGRAM_ERROR);
668 handleError(status, basename);
669 }
670 if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) {
671 status.set(U_INTERNAL_PROGRAM_ERROR);
672 handleError(status, basename);
673 }
674 if (mainNormalizer != nfdNormalizer) {
675 UnicodeString nfd;
676 nfdNormalizer->normalize(src, nfd, status);
677 if (dst == nfd) {
678 continue;
679 }
680 decompositionPassthroughBound = c;
681 compositionPassthroughBound = c;
682 } else if (firstCombiningClass) {
683 len = 1;
684 if (specialNonStarterDecomposition) {
685 utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value
686 } else {
687 // Use the surrogate range to store the canonical combining class
688 utf32[0] = 0xD800 | UChar32(firstCombiningClass);
689 }
690 } else {
691 if (src == dst) {
692 if (startsWithBackwardCombiningStarter) {
693 pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, false});
694 }
695 continue;
696 }
697 decompositionPassthroughBound = c;
698 // ICU4X hard-codes ANGSTROM SIGN
699 if (c != 0x212B) {
700 UnicodeString raw;
701 if (!nfdNormalizer->getRawDecomposition(c, raw)) {
702 // We're always supposed to have a non-recursive decomposition
703 // if we had a recursive one.
704 status.set(U_INTERNAL_PROGRAM_ERROR);
705 handleError(status, basename);
706 }
707 // In addition to actual difference, put the whole range that contains characters
708 // with oxia into the non-recursive trie in order to catch cases where characters
709 // with oxia have singleton decompositions to corresponding characters with tonos.
710 // This way, the run-time decision to fall through can be done on the range
711 // without checking for individual characters inside the range.
712 if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) {
713 int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status);
714 if (!rawLen) {
715 status.set(U_INTERNAL_PROGRAM_ERROR);
716 handleError(status, basename);
717 }
718 if (rawLen == 1) {
719 if (c >= 0xFFFF) {
720 status.set(U_INTERNAL_PROGRAM_ERROR);
721 handleError(status, basename);
722 }
723 umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, uint32_t(rawUtf32[0]), status);
724 } else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) {
725 if (!rawUtf32[0] || !rawUtf32[1]) {
726 status.set(U_INTERNAL_PROGRAM_ERROR);
727 handleError(status, basename);
728 }
729 // Swapped for consistency with the primary trie
730 uint32_t bmpPair = uint32_t(rawUtf32[1]) << 16 | uint32_t(rawUtf32[0]);
731 umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status);
732 } else {
733 // Let's add 1 to index to make it always non-zero to distinguish
734 // it from the default zero.
735 uint32_t index = nonRecursive32.size() + 1;
736 nonRecursive32.push_back(uint32_t(rawUtf32[0]));
737 nonRecursive32.push_back(uint32_t(rawUtf32[1]));
738 if (index > 0xFFFF) {
739 status.set(U_INTERNAL_PROGRAM_ERROR);
740 handleError(status, basename);
741 }
742 umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status);
743 }
744 }
745 }
746 }
747 if (!nonNfdOrRoundTrips) {
748 compositionPassthroughBound = c;
749 }
750 if (len == 1 && utf32[0] <= 0xFFFF) {
751 if (startsWithBackwardCombiningStarter) {
752 if (mainNormalizer == nfdNormalizer) {
753 // Not supposed to happen in NFD
754 status.set(U_INTERNAL_PROGRAM_ERROR);
755 handleError(status, basename);
756 } else if (!((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) {
757 // Other than conjoining jamo vowels and trails
758 // unsupported for non-NFD.
759 status.set(U_INTERNAL_PROGRAM_ERROR);
760 handleError(status, basename);
761 }
762 }
763 pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, false});
764 } else if (len == 2 &&
765 utf32[0] <= 0xFFFF &&
766 utf32[1] <= 0xFFFF &&
767 !u_getCombiningClass(utf32[0]) &&
768 u_getCombiningClass(utf32[1]) &&
769 permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) {
770 for (int32_t i = 0; i < len; ++i) {
771 if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
772 // Assert that iota subscript and half-width voicing marks never occur in these
773 // expansions in the normalization forms where they are special.
774 status.set(U_INTERNAL_PROGRAM_ERROR);
775 handleError(status, basename);
776 }
777 }
778 if (startsWithBackwardCombiningStarter) {
779 status.set(U_INTERNAL_PROGRAM_ERROR);
780 handleError(status, basename);
781 }
782 pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), false});
783 } else {
784 if (startsWithBackwardCombiningStarter) {
785 status.set(U_INTERNAL_PROGRAM_ERROR);
786 handleError(status, basename);
787 }
788
789 UBool supplementary = false;
790 UBool nonInitialStarter = false;
791 for (int32_t i = 0; i < len; ++i) {
792 if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) {
793 // Assert that iota subscript and half-width voicing marks never occur in these
794 // expansions in the normalization forms where they are special.
795 status.set(U_INTERNAL_PROGRAM_ERROR);
796 handleError(status, basename);
797 }
798
799 if (utf32[i] > 0xFFFF) {
800 supplementary = true;
801 }
802 if (utf32[i] == 0) {
803 status.set(U_INTERNAL_PROGRAM_ERROR);
804 handleError(status, basename);
805 }
806 if (i != 0 && !u_getCombiningClass(utf32[i])) {
807 nonInitialStarter = true;
808 }
809 }
810 if (!supplementary) {
811 if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
812 if (len == 18 && c == 0xFDFA) {
813 // Special marker for the one character whose decomposition
814 // is too long.
815 pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary});
816 continue;
817 } else {
818 status.set(U_INTERNAL_PROGRAM_ERROR);
819 handleError(status, basename);
820 }
821 }
822 } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) {
823 status.set(U_INTERNAL_PROGRAM_ERROR);
824 handleError(status, basename);
825 }
826 // Complex decomposition
827 // Format for 16-bit value:
828 // 15..13: length minus two for 16-bit case and length minus one for
829 // the 32-bit case. Length 8 needs to fit in three bits in
830 // the 16-bit case, and this way the value is future-proofed
831 // up to 9 in the 16-bit case. Zero is unused and length one
832 // in the 16-bit case goes directly into the trie.
833 // 12: 1 if all trailing characters are guaranteed non-starters,
834 // 0 if no guarantees about non-starterness.
835 // Note: The bit choice is this way around to allow for
836 // dynamically falling back to not having this but instead
837 // having one more bit for length by merely choosing
838 // different masks.
839 // 11..0: Start offset in storage. The offset is to the logical
840 // sequence of scalars16, scalars32, supplementary_scalars16,
841 // supplementary_scalars32.
842 uint32_t descriptor = uint32_t(!nonInitialStarter) << 12;
843 if (!supplementary) {
844 descriptor |= (uint32_t(len) - 2) << 13;
845 } else {
846 descriptor |= (uint32_t(len) - 1) << 13;
847 }
848 if (descriptor & 0xFFF) {
849 status.set(U_INTERNAL_PROGRAM_ERROR);
850 handleError(status, basename);
851 }
852 size_t index = 0;
853 bool writeToStorage = false;
854 // Sadly, C++ lacks break and continue by label, so using goto in the
855 // inner loops to break or continue the outer loop.
856 if (!supplementary) {
857 outer16: for (;;) {
858 if (index == storage16.size()) {
859 writeToStorage = true;
860 break;
861 }
862 if (storage16[index] == utf32[0]) {
863 for (int32_t i = 1; i < len; ++i) {
864 if (storage16[index + i] != uint32_t(utf32[i])) {
865 ++index;
866 // continue outer
867 goto outer16;
868 }
869 }
870 // break outer
871 goto after;
872 }
873 ++index;
874 }
875 } else {
876 outer32: for (;;) {
877 if (index == storage32.size()) {
878 writeToStorage = true;
879 break;
880 }
881 if (storage32[index] == uint32_t(utf32[0])) {
882 for (int32_t i = 1; i < len; ++i) {
883 if (storage32[index + i] != uint32_t(utf32[i])) {
884 ++index;
885 // continue outer
886 goto outer32;
887 }
888 }
889 // break outer
890 goto after;
891 }
892 ++index;
893 }
894 }
895 after:
896 if (index > 0xFFF) {
897 status.set(U_INTERNAL_PROGRAM_ERROR);
898 handleError(status, basename);
899 }
900 descriptor |= uint32_t(index);
901 if (!descriptor || descriptor > 0xFFFF) {
902 // > 0xFFFF should never happen if the code above is correct.
903 // == 0 should not happen due to the nature of the data.
904 status.set(U_INTERNAL_PROGRAM_ERROR);
905 handleError(status, basename);
906 }
907 if (writeToStorage) {
908 if (!supplementary) {
909 for (int32_t i = 0; i < len; ++i) {
910 storage16.push_back(uint16_t(utf32[i]));
911 }
912 } else {
913 for (int32_t i = 0; i < len; ++i) {
914 storage32.push_back(uint32_t(utf32[i]));
915 }
916 }
917 }
918
919 uint32_t nonRoundTripMarker = 0;
920 if (!nonNfdOrRoundTrips) {
921 nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16);
922 }
923 pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker, supplementary});
924 }
925 }
926 if (storage16.size() + storage32.size() > 0xFFF) {
927 status.set(U_INTERNAL_PROGRAM_ERROR);
928 }
929 if (f) {
930 usrc_writeArray(f, "scalars32 = [\n ", nonRecursive32.data(), 32, nonRecursive32.size(), " ", "\n]\n");
931
932 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
933 nonRecursiveBuilder.getAlias(),
934 trieType,
935 UCPTRIE_VALUE_BITS_32,
936 status));
937 handleError(status, basename);
938
939 fprintf(f, "[trie]\n");
940 usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
941
942 fclose(f);
943 }
944 handleError(status, basename);
945 }
946
947 #endif // !UCONFIG_NO_NORMALIZATION
948
// Indices into the options[] table defined right after this enum.
// The enumerators are listed in the same order as the table entries, so
// options[OPT_X] is the parsed state of option X.
enum {
    OPT_HELP_H = 0,          // -h / --help
    OPT_HELP_QUESTION_MARK,  // -?
    OPT_MODE,                // -m / --mode
    OPT_TRIE_TYPE,           // --trie-type
    OPT_VERSION,             // -V / --version
    OPT_DESTDIR,             // -d / --destdir
    OPT_ALL,                 // --all
    OPT_INDEX,               // --index
    OPT_COPYRIGHT,           // -c / --copyright
    OPT_VERBOSE,             // -v / --verbose
    OPT_QUIET,               // -q / --quiet

    // Number of options above; not an option itself.
    OPT_COUNT
};
964
965 #define UOPTION_MODE UOPTION_DEF("mode", 'm', UOPT_REQUIRES_ARG)
966 #define UOPTION_TRIE_TYPE UOPTION_DEF("trie-type", '\1', UOPT_REQUIRES_ARG)
967 #define UOPTION_ALL UOPTION_DEF("all", '\1', UOPT_NO_ARG)
968 #define UOPTION_INDEX UOPTION_DEF("index", '\1', UOPT_NO_ARG)
969
970 static UOption options[]={
971 UOPTION_HELP_H,
972 UOPTION_HELP_QUESTION_MARK,
973 UOPTION_MODE,
974 UOPTION_TRIE_TYPE,
975 UOPTION_VERSION,
976 UOPTION_DESTDIR,
977 UOPTION_ALL,
978 UOPTION_INDEX,
979 UOPTION_COPYRIGHT,
980 UOPTION_VERBOSE,
981 UOPTION_QUIET,
982 };
983
/**
 * Writes the general usage text of the tool to a stream.
 *
 * @param stdfile  Stream that receives the usage text.
 * @param program  Program name substituted into the "usage:" line.
 */
void printHelp(FILE* stdfile, const char* program) {
    // Only the first line depends on the program name; the remainder is fixed
    // text without any format specifiers.
    fprintf(stdfile, "usage: %s -m mode [-options] [--all | properties...]\n", program);
    fputs(
        "\tdump Unicode property data to .toml files\n"
        "options:\n"
        "\t-h or -? or --help this usage text\n"
        "\t-V or --version show a version message\n"
        "\t-m or --mode mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n"
        "\t --trie-type set the trie type (small or fast, default small)\n"
        "\t-d or --destdir destination directory, followed by the path\n"
        "\t --all write out all properties known to icuexportdata\n"
        "\t --index write an _index.toml summarizing all data exported\n"
        "\t-c or --copyright include a copyright notice\n"
        "\t-v or --verbose Turn on verbose output\n"
        "\t-q or --quiet do not display warnings and progress\n",
        stdfile);
}
1001
exportUprops(int argc,char * argv[])1002 int exportUprops(int argc, char* argv[]) {
1003 // Load list of Unicode properties
1004 std::vector<const char*> propNames;
1005 for (int i=1; i<argc; i++) {
1006 propNames.push_back(argv[i]);
1007 }
1008 if (options[OPT_ALL].doesOccur) {
1009 int i = UCHAR_BINARY_START;
1010 while (true) {
1011 if (i == UCHAR_BINARY_LIMIT) {
1012 i = UCHAR_INT_START;
1013 }
1014 if (i == UCHAR_INT_LIMIT) {
1015 i = UCHAR_SCRIPT_EXTENSIONS;
1016 }
1017 if (i == UCHAR_SCRIPT_EXTENSIONS + 1) {
1018 break;
1019 }
1020 UProperty uprop = static_cast<UProperty>(i);
1021 const char* propName = u_getPropertyName(uprop, U_SHORT_PROPERTY_NAME);
1022 if (propName == NULL) {
1023 propName = u_getPropertyName(uprop, U_LONG_PROPERTY_NAME);
1024 if (propName != NULL && VERBOSE) {
1025 std::cerr << "Note: falling back to long name for: " << propName << std::endl;
1026 }
1027 }
1028 if (propName != NULL) {
1029 propNames.push_back(propName);
1030 } else {
1031 std::cerr << "Warning: Could not find name for: " << uprop << std::endl;
1032 }
1033 i++;
1034 }
1035 }
1036
1037 if (propNames.empty()
1038 || options[OPT_HELP_H].doesOccur
1039 || options[OPT_HELP_QUESTION_MARK].doesOccur
1040 || !options[OPT_MODE].doesOccur) {
1041 FILE *stdfile=argc<0 ? stderr : stdout;
1042 fprintf(stdfile,
1043 "usage: %s -m uprops [-options] [--all | properties...]\n"
1044 "\tdump Unicode property data to .toml files\n"
1045 "options:\n"
1046 "\t-h or -? or --help this usage text\n"
1047 "\t-V or --version show a version message\n"
1048 "\t-m or --mode mode: currently only 'uprops', but more may be added\n"
1049 "\t --trie-type set the trie type (small or fast, default small)\n"
1050 "\t-d or --destdir destination directory, followed by the path\n"
1051 "\t --all write out all properties known to icuexportdata\n"
1052 "\t --index write an _index.toml summarizing all data exported\n"
1053 "\t-c or --copyright include a copyright notice\n"
1054 "\t-v or --verbose Turn on verbose output\n"
1055 "\t-q or --quiet do not display warnings and progress\n",
1056 argv[0]);
1057 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1058 }
1059
1060 const char* mode = options[OPT_MODE].value;
1061 if (uprv_strcmp(mode, "uprops") != 0) {
1062 fprintf(stderr, "Invalid option for --mode (must be uprops)\n");
1063 return U_ILLEGAL_ARGUMENT_ERROR;
1064 }
1065
1066 if (options[OPT_TRIE_TYPE].doesOccur) {
1067 if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1068 trieType = UCPTRIE_TYPE_FAST;
1069 } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1070 trieType = UCPTRIE_TYPE_SMALL;
1071 } else {
1072 fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1073 return U_ILLEGAL_ARGUMENT_ERROR;
1074 }
1075 }
1076
1077 for (const char* propName : propNames) {
1078 UProperty propEnum = u_getPropertyEnum(propName);
1079 if (propEnum == UCHAR_INVALID_CODE) {
1080 std::cerr << "Error: Invalid property alias: " << propName << std::endl;
1081 return U_ILLEGAL_ARGUMENT_ERROR;
1082 }
1083
1084 FILE* f = prepareOutputFile(propName);
1085
1086 UVersionInfo versionInfo;
1087 u_getUnicodeVersion(versionInfo);
1088 char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1089 u_versionToString(versionInfo, uvbuf);
1090 fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1091 U_ICU_VERSION,
1092 uvbuf);
1093
1094 if (propEnum < UCHAR_BINARY_LIMIT) {
1095 dumpBinaryProperty(propEnum, f);
1096 } else if (UCHAR_INT_START <= propEnum && propEnum <= UCHAR_INT_LIMIT) {
1097 dumpEnumeratedProperty(propEnum, f);
1098 } else if (propEnum == UCHAR_SCRIPT_EXTENSIONS) {
1099 dumpScriptExtensions(f);
1100 } else {
1101 std::cerr << "Don't know how to write property: " << propEnum << std::endl;
1102 return U_INTERNAL_PROGRAM_ERROR;
1103 }
1104
1105 fclose(f);
1106 }
1107
1108 if (options[OPT_INDEX].doesOccur) {
1109 FILE* f = prepareOutputFile("_index");
1110 fprintf(f, "index = [\n");
1111 for (const char* propName : propNames) {
1112 // At this point, propName is a valid property name, so it should be alphanum ASCII
1113 fprintf(f, " { filename=\"%s.toml\" },\n", propName);
1114 }
1115 fprintf(f, "]\n");
1116 fclose(f);
1117 }
1118
1119 return 0;
1120 }
1121
1122 struct AddRangeHelper {
1123 UMutableCPTrie* ucptrie;
1124 };
1125
1126 static UBool U_CALLCONV
addRangeToUCPTrie(const void * context,UChar32 start,UChar32 end,uint32_t value)1127 addRangeToUCPTrie(const void* context, UChar32 start, UChar32 end, uint32_t value) {
1128 IcuToolErrorCode status("addRangeToUCPTrie");
1129 UMutableCPTrie* ucptrie = ((const AddRangeHelper*) context)->ucptrie;
1130 umutablecptrie_setRange(ucptrie, start, end, value, status);
1131 handleError(status, "setRange");
1132
1133 return true;
1134 }
1135
exportCase(int argc,char * argv[])1136 int exportCase(int argc, char* argv[]) {
1137 if (argc > 1) {
1138 fprintf(stderr, "ucase mode does not expect additional arguments\n");
1139 return U_ILLEGAL_ARGUMENT_ERROR;
1140 }
1141 (void) argv; // Suppress unused variable warning
1142
1143 IcuToolErrorCode status("icuexportdata");
1144 LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status));
1145 handleError(status, "exportCase");
1146
1147 int32_t exceptionsLength, unfoldLength;
1148 const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength);
1149 const UTrie2* caseTrie = &caseProps->trie;
1150
1151 AddRangeHelper helper = { builder.getAlias() };
1152 utrie2_enum(caseTrie, NULL, addRangeToUCPTrie, &helper);
1153
1154 UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16;
1155 LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1156 builder.getAlias(),
1157 trieType,
1158 width,
1159 status));
1160 handleError(status, "exportCase");
1161
1162 FILE* f = prepareOutputFile("ucase");
1163
1164 UVersionInfo versionInfo;
1165 u_getUnicodeVersion(versionInfo);
1166 char uvbuf[U_MAX_VERSION_STRING_LENGTH];
1167 u_versionToString(versionInfo, uvbuf);
1168 fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n",
1169 U_ICU_VERSION,
1170 uvbuf);
1171
1172 fputs("[ucase.code_point_trie]\n", f);
1173 usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1174 fputs("\n", f);
1175
1176 const char* indent = " ";
1177 const char* suffix = "\n]\n";
1178
1179 fputs("[ucase.exceptions]\n", f);
1180 const char* exceptionsPrefix = "exceptions = [\n ";
1181 int32_t exceptionsWidth = 16;
1182 usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth,
1183 exceptionsLength, indent, suffix);
1184 fputs("\n", f);
1185
1186 fputs("[ucase.unfold]\n", f);
1187 const char* unfoldPrefix = "unfold = [\n ";
1188 int32_t unfoldWidth = 16;
1189 usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth,
1190 unfoldLength, indent, suffix);
1191
1192 return 0;
1193 }
1194
1195 #if !UCONFIG_NO_NORMALIZATION
1196
exportNorm()1197 int exportNorm() {
1198 IcuToolErrorCode status("icuexportdata: exportNorm");
1199 USet* backwardCombiningStarters = uset_openEmpty();
1200 writeCanonicalCompositions(backwardCombiningStarters);
1201
1202 std::vector<uint16_t> storage16;
1203 std::vector<uint32_t> storage32;
1204
1205 // Note: the USets are not exported. They are only used to check that a new
1206 // Unicode version doesn't violate expectations that are hard-coded in ICU4X.
1207 USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty();
1208 USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1209 std::vector<PendingDescriptor> nfdPendingTrieInsertions;
1210 UChar32 nfdBound = 0x10FFFF;
1211 UChar32 nfcBound = 0x10FFFF;
1212 computeDecompositions("nfd",
1213 backwardCombiningStarters,
1214 storage16,
1215 storage32,
1216 nfdDecompositionStartsWithNonStarter,
1217 nfdDecompositionStartsWithBackwardCombiningStarter,
1218 nfdPendingTrieInsertions,
1219 nfdBound,
1220 nfcBound);
1221 if (!(nfdBound == 0xC0 && nfcBound == 0x300)) {
1222 // Unexpected bounds for NFD/NFC.
1223 status.set(U_INTERNAL_PROGRAM_ERROR);
1224 handleError(status, "exportNorm");
1225 }
1226
1227 uint32_t baseSize16 = storage16.size();
1228 uint32_t baseSize32 = storage32.size();
1229
1230 USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty();
1231 USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1232 std::vector<PendingDescriptor> nfkdPendingTrieInsertions;
1233 UChar32 nfkdBound = 0x10FFFF;
1234 UChar32 nfkcBound = 0x10FFFF;
1235 computeDecompositions("nfkd",
1236 backwardCombiningStarters,
1237 storage16,
1238 storage32,
1239 nfkdDecompositionStartsWithNonStarter,
1240 nfkdDecompositionStartsWithBackwardCombiningStarter,
1241 nfkdPendingTrieInsertions,
1242 nfkdBound,
1243 nfkcBound);
1244 if (!(nfkdBound <= 0xC0 && nfkcBound <= 0x300)) {
1245 status.set(U_INTERNAL_PROGRAM_ERROR);
1246 handleError(status, "exportNorm");
1247 }
1248 if (nfkcBound > 0xC0) {
1249 if (nfkdBound != 0xC0) {
1250 status.set(U_INTERNAL_PROGRAM_ERROR);
1251 handleError(status, "exportNorm");
1252 }
1253 } else {
1254 if (nfkdBound != nfkcBound) {
1255 status.set(U_INTERNAL_PROGRAM_ERROR);
1256 handleError(status, "exportNorm");
1257 }
1258 }
1259
1260 USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty();
1261 USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty();
1262 std::vector<PendingDescriptor> uts46PendingTrieInsertions;
1263 UChar32 uts46dBound = 0x10FFFF;
1264 UChar32 uts46Bound = 0x10FFFF;
1265 computeDecompositions("uts46d",
1266 backwardCombiningStarters,
1267 storage16,
1268 storage32,
1269 uts46DecompositionStartsWithNonStarter,
1270 uts46DecompositionStartsWithBackwardCombiningStarter,
1271 uts46PendingTrieInsertions,
1272 uts46dBound,
1273 uts46Bound);
1274 if (!(uts46dBound <= 0xC0 && uts46Bound <= 0x300)) {
1275 status.set(U_INTERNAL_PROGRAM_ERROR);
1276 handleError(status, "exportNorm");
1277 }
1278 if (uts46Bound > 0xC0) {
1279 if (uts46dBound != 0xC0) {
1280 status.set(U_INTERNAL_PROGRAM_ERROR);
1281 handleError(status, "exportNorm");
1282 }
1283 } else {
1284 if (uts46dBound != uts46Bound) {
1285 status.set(U_INTERNAL_PROGRAM_ERROR);
1286 handleError(status, "exportNorm");
1287 }
1288 }
1289
1290 uint32_t supplementSize16 = storage16.size() - baseSize16;
1291 uint32_t supplementSize32 = storage32.size() - baseSize32;
1292
1293 writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, char16_t(nfcBound));
1294 writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, char16_t(nfkcBound));
1295 writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, char16_t(uts46Bound));
1296
1297 writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32);
1298 writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32);
1299
1300 uset_close(nfdDecompositionStartsWithNonStarter);
1301 uset_close(nfkdDecompositionStartsWithNonStarter);
1302 uset_close(uts46DecompositionStartsWithNonStarter);
1303
1304 uset_close(nfdDecompositionStartsWithBackwardCombiningStarter);
1305 uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter);
1306 uset_close(uts46DecompositionStartsWithBackwardCombiningStarter);
1307
1308 uset_close(backwardCombiningStarters);
1309 handleError(status, "exportNorm");
1310 return 0;
1311 }
1312
1313 #endif // !UCONFIG_NO_NORMALIZATION
1314
main(int argc,char * argv[])1315 int main(int argc, char* argv[]) {
1316 U_MAIN_INIT_ARGS(argc, argv);
1317
1318 /* preset then read command line options */
1319 options[OPT_DESTDIR].value=u_getDataDirectory();
1320 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
1321
1322 if(options[OPT_VERSION].doesOccur) {
1323 printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n",
1324 U_ICU_DATA_VERSION);
1325 printf("%s\n", U_COPYRIGHT_STRING);
1326 exit(0);
1327 }
1328
1329 /* error handling, printing usage message */
1330 if(argc<0) {
1331 fprintf(stderr,
1332 "error in command line argument \"%s\"\n",
1333 argv[-argc]);
1334 }
1335
1336 if (argc < 0
1337 || options[OPT_HELP_H].doesOccur
1338 || options[OPT_HELP_QUESTION_MARK].doesOccur
1339 || !options[OPT_MODE].doesOccur) {
1340 FILE *stdfile=argc<0 ? stderr : stdout;
1341 printHelp(stdfile, argv[0]);
1342 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1343 }
1344
1345 /* get the options values */
1346 haveCopyright = options[OPT_COPYRIGHT].doesOccur;
1347 destdir = options[OPT_DESTDIR].value;
1348 VERBOSE = options[OPT_VERBOSE].doesOccur;
1349 QUIET = options[OPT_QUIET].doesOccur;
1350
1351 if (options[OPT_TRIE_TYPE].doesOccur) {
1352 if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) {
1353 trieType = UCPTRIE_TYPE_FAST;
1354 } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) {
1355 trieType = UCPTRIE_TYPE_SMALL;
1356 } else {
1357 fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n");
1358 return U_ILLEGAL_ARGUMENT_ERROR;
1359 }
1360 }
1361
1362 const char* mode = options[OPT_MODE].value;
1363 if (uprv_strcmp(mode, "norm") == 0) {
1364 #if !UCONFIG_NO_NORMALIZATION
1365 return exportNorm();
1366 #else
1367 fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n");
1368 return U_ILLEGAL_ARGUMENT_ERROR;
1369 #endif
1370 }
1371 if (uprv_strcmp(mode, "uprops") == 0) {
1372 return exportUprops(argc, argv);
1373 } else if (uprv_strcmp(mode, "ucase") == 0) {
1374 return exportCase(argc, argv);
1375 }
1376
1377 fprintf(stderr, "Invalid option for --mode (must be uprops, ucase, or norm)\n");
1378 return U_ILLEGAL_ARGUMENT_ERROR;
1379 }
1380