1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: genuts46.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010mar02
14 * created by: Markus W. Scherer
15 *
16 * quick & dirty tool to recreate the UTS #46 data table according to the spec
17 */
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string>
22 #include <string.h>
23 #include "unicode/utypes.h"
24 #include "unicode/errorcode.h"
25 #include "unicode/normalizer2.h"
26 #include "unicode/uniset.h"
27 #include "unicode/unistr.h"
28 #include "unicode/usetiter.h"
29 #include "unicode/usprep.h"
30 #include "sprpimpl.h" // HACK
31
32 /**
33 * icu::ErrorCode subclass for easy UErrorCode handling.
34 * The destructor calls handleFailure() which calls exit(errorCode) when isFailure().
35 */
36 class ExitingErrorCode : public icu::ErrorCode {
37 public:
38 /**
39 * @param loc A short string describing where the ExitingErrorCode is used.
40 */
ExitingErrorCode(const char * loc)41 ExitingErrorCode(const char *loc) : location(loc) {}
42 virtual ~ExitingErrorCode();
43 protected:
44 virtual void handleFailure() const;
45 private:
46 const char *location;
47 };
48
~ExitingErrorCode()49 ExitingErrorCode::~ExitingErrorCode() {
50 // Safe because our handleFailure() does not throw exceptions.
51 if(isFailure()) { handleFailure(); }
52 }
53
handleFailure() const54 void ExitingErrorCode::handleFailure() const {
55 fprintf(stderr, "error at %s: %s\n", location, errorName());
56 exit(errorCode);
57 }
58
59 static int
toIDNA2003(const UStringPrepProfile * prep,UChar32 c,icu::UnicodeString & destString)60 toIDNA2003(const UStringPrepProfile *prep, UChar32 c, icu::UnicodeString &destString) {
61 UChar src[2];
62 int32_t srcLength=0;
63 U16_APPEND_UNSAFE(src, srcLength, c);
64 UChar *dest;
65 int32_t destLength;
66 dest=destString.getBuffer(32);
67 if(dest==NULL) {
68 return FALSE;
69 }
70 UErrorCode errorCode=U_ZERO_ERROR;
71 destLength=usprep_prepare(prep, src, srcLength,
72 dest, destString.getCapacity(),
73 USPREP_DEFAULT, NULL, &errorCode);
74 destString.releaseBuffer(destLength);
75 if(errorCode==U_STRINGPREP_PROHIBITED_ERROR) {
76 return -1;
77 } else {
78 // Returns FALSE=0 for U_STRINGPREP_UNASSIGNED_ERROR and processing errors,
79 // TRUE=1 if c is valid or mapped.
80 return U_SUCCESS(errorCode);
81 }
82 }
83
/**
 * Status values assigned to code points in the generated table.
 * The enumerator values are used as indexes into statusNames[],
 * so the two lists must stay in the same order.
 */
enum Status {
    DISALLOWED,
    IGNORED,
    MAPPED,
    DEVIATION,
    VALID,
    DISALLOWED_STD3_VALID,
    DISALLOWED_STD3_MAPPED
};

/** Printable name for each Status value, indexed by the enumerator. */
static const char *const statusNames[]={
    "disallowed",
    "ignored",
    "mapped",
    "deviation",
    "valid",
    "disallowed_STD3_valid",
    "disallowed_STD3_mapped"
};
92
93 static void
printLine(UChar32 start,UChar32 end,Status status,const icu::UnicodeString & mapping)94 printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &mapping) {
95 if(start==end) {
96 printf("%04lX ", (long)start);
97 } else {
98 printf("%04lX..%04lX ", (long)start, (long)end);
99 }
100 printf("; %s", statusNames[status]);
101 if(status==MAPPED || status==DEVIATION || !mapping.isEmpty()) {
102 printf(" ;");
103 const UChar *buffer=mapping.getBuffer();
104 int32_t length=mapping.length();
105 int32_t i=0;
106 UChar32 c;
107 while(i<length) {
108 U16_NEXT(buffer, i, length, c);
109 printf(" %04lX", (long)c);
110 }
111 }
112 puts("");
113 }
114
115 static void
getAgeIfAssigned(UChar32 c,UVersionInfo age)116 getAgeIfAssigned(UChar32 c, UVersionInfo age) {
117 if(u_isdefined(c)) {
118 u_charAge(c, age);
119 } else if(U_IS_UNICODE_NONCHAR(c)) {
120 age[0]=0;
121 age[1]=0;
122 age[2]=0;
123 age[3]=1;
124 } else {
125 memset(age, 0, 4);
126 }
127 }
128
129 extern int
main(int argc,const char * argv[])130 main(int argc, const char *argv[]) {
131 ExitingErrorCode errorCode("genuts46");
132
133 // predefined base sets
134 icu::UnicodeSet unassignedSet(UNICODE_STRING_SIMPLE("[:Cn:]"), errorCode);
135
136 icu::UnicodeSet labelSeparators(
137 UNICODE_STRING_SIMPLE("[\\u002E\\u3002\\uFF0E\\uFF61]"), errorCode);
138
139 icu::UnicodeSet mappedSet(
140 UNICODE_STRING_SIMPLE("[:Changes_When_NFKC_Casefolded:]"), errorCode);
141 mappedSet.removeAll(labelSeparators); // simplifies checking of mapped characters
142
143 icu::UnicodeSet baseValidSet(icu::UnicodeString(
144 "[[[[:^Changes_When_NFKC_Casefolded:]"
145 "-[:C:]-[:Z:]"
146 "-[:Block=Ideographic_Description_Characters:]]"
147 "[:ascii:]]-[.]]", -1, US_INV), errorCode);
148
149 // Characters that are disallowed when STD3 rules are applied,
150 // but valid when STD3 rules are not applied.
151 icu::UnicodeSet disallowedSTD3Set(icu::UnicodeString(
152 "[[:ascii:]-[\\u002D.a-zA-Z0-9]]", -1, US_INV), errorCode);
153
154 icu::UnicodeSet deviationSet(
155 UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode);
156 errorCode.assertSuccess();
157
158 // derived sets
159 icu::LocalUStringPrepProfilePointer namePrep(usprep_openByType(USPREP_RFC3491_NAMEPREP, errorCode));
160 const icu::Normalizer2 *nfkc_cf=
161 icu::Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode);
162 errorCode.assertSuccess();
163
164 // HACK: The StringPrep API performs a BiDi check according to the data.
165 // We need to override that for this data generation, by resetting an internal flag.
166 namePrep->checkBiDi=FALSE;
167
168 icu::UnicodeSet baseExclusionSet;
169 icu::UnicodeString cString, mapping, namePrepResult;
170 for(UChar32 c=0; c<=0x10ffff; ++c) {
171 if(c==0xd800) {
172 c=0xe000;
173 }
174 int namePrepStatus=toIDNA2003(namePrep.getAlias(), c, namePrepResult);
175 if(namePrepStatus!=0) {
176 // get the UTS #46 base mapping value
177 switch(c) {
178 case 0xff0e:
179 case 0x3002:
180 case 0xff61:
181 mapping.setTo(0x2e);
182 break;
183 default:
184 cString.setTo(c);
185 nfkc_cf->normalize(cString, mapping, errorCode);
186 break;
187 }
188 if(
189 namePrepStatus>0 ?
190 // c is valid or mapped in IDNA2003
191 !labelSeparators.contains(c) && namePrepResult!=mapping :
192 // namePrepStatus<0: c is prohibited in IDNA2003
193 baseValidSet.contains(c) || (cString!=mapping && baseValidSet.containsAll(mapping))
194 ) {
195 baseExclusionSet.add(c);
196 }
197 }
198 }
199
200 icu::UnicodeSet disallowedSet(0, 0x10ffff);
201 disallowedSet.
202 removeAll(labelSeparators).
203 removeAll(deviationSet).
204 removeAll(mappedSet).
205 removeAll(baseValidSet).
206 addAll(baseExclusionSet).
207 addAll(unassignedSet);
208
209 const icu::Normalizer2 *nfd=
210 icu::Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode);
211 errorCode.assertSuccess();
212
213 icu::UnicodeSet ignoredSet; // will be a subset of mappedSet
214 icu::UnicodeSet removeSet;
215 icu::UnicodeString nfdString;
216 {
217 icu::UnicodeSetIterator iter(mappedSet);
218 while(iter.next()) {
219 UChar32 c=iter.getCodepoint();
220 cString.setTo(c);
221 nfkc_cf->normalize(cString, mapping, errorCode);
222 if(!baseValidSet.containsAll(mapping)) {
223 fprintf(stderr, "U+%04lX mapped -> disallowed: mapping not wholly in base valid set\n", (long)c);
224 disallowedSet.add(c);
225 removeSet.add(c);
226 } else if(mapping.isEmpty()) {
227 ignoredSet.add(c);
228 }
229 }
230 mappedSet.removeAll(removeSet);
231 }
232 errorCode.assertSuccess();
233
234 icu::UnicodeSet validSet(baseValidSet);
235 validSet.
236 removeAll(labelSeparators). // non-ASCII label separators will be mapped in the end
237 removeAll(deviationSet).
238 removeAll(disallowedSet).
239 removeAll(mappedSet).
240 add(0x2e); // not mapped, simply valid
241 UBool madeChange;
242 do {
243 madeChange=FALSE;
244 {
245 removeSet.clear();
246 icu::UnicodeSetIterator iter(validSet);
247 while(iter.next()) {
248 UChar32 c=iter.getCodepoint();
249 if(nfd->getDecomposition(c, nfdString) && !validSet.containsAll(nfdString)) {
250 fprintf(stderr, "U+%04lX valid -> disallowed: NFD not wholly valid\n", (long)c);
251 disallowedSet.add(c);
252 removeSet.add(c);
253 madeChange=TRUE;
254 }
255 }
256 validSet.removeAll(removeSet);
257 }
258 {
259 removeSet.clear();
260 icu::UnicodeSetIterator iter(mappedSet);
261 while(iter.next()) {
262 UChar32 c=iter.getCodepoint();
263 cString.setTo(c);
264 nfkc_cf->normalize(cString, mapping, errorCode);
265 nfd->normalize(mapping, nfdString, errorCode);
266 if(!validSet.containsAll(nfdString)) {
267 fprintf(stderr, "U+%04lX mapped -> disallowed: NFD of mapping not wholly valid\n", (long)c);
268 disallowedSet.add(c);
269 removeSet.add(c);
270 madeChange=TRUE;
271 }
272 }
273 mappedSet.removeAll(removeSet);
274 }
275 } while(madeChange);
276 errorCode.assertSuccess();
277
278 // finish up
279 labelSeparators.remove(0x2e).freeze(); // U+002E is simply valid
280 deviationSet.freeze();
281 ignoredSet.freeze();
282 validSet.freeze();
283 mappedSet.freeze();
284 disallowedSTD3Set.freeze();
285
286 // output
287 UChar32 prevStart=0, c=0;
288 Status prevStatus=DISALLOWED_STD3_VALID, status;
289 icu::UnicodeString prevMapping;
290 UVersionInfo prevAge={ 1, 1, 0, 0 }, age;
291
292 icu::UnicodeSetIterator iter(disallowedSet);
293 while(iter.nextRange()) {
294 UChar32 start=iter.getCodepoint();
295 while(c<start) {
296 mapping.remove();
297 if(labelSeparators.contains(c)) {
298 status=MAPPED;
299 mapping.setTo(0x2e);
300 } else if(deviationSet.contains(c)) {
301 status=DEVIATION;
302 cString.setTo(c);
303 nfkc_cf->normalize(cString, mapping, errorCode);
304 } else if(ignoredSet.contains(c)) {
305 status=IGNORED;
306 } else if(validSet.contains(c)) {
307 if(disallowedSTD3Set.contains(c)) {
308 fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: itself not STD3\n", (long)c);
309 status=DISALLOWED_STD3_VALID;
310 } else if( nfd->getDecomposition(c, nfdString) &&
311 disallowedSTD3Set.containsSome(nfdString)
312 ) {
313 fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: NFD not wholly STD3\n", (long)c);
314 status=DISALLOWED_STD3_VALID;
315 } else {
316 status=VALID;
317 }
318 } else if(mappedSet.contains(c)) {
319 cString.setTo(c);
320 nfkc_cf->normalize(cString, mapping, errorCode);
321 if(disallowedSTD3Set.containsSome(mapping)) {
322 fprintf(stderr, "U+%04lX mapped -> disallowed_STD3_mapped\n", (long)c);
323 status=DISALLOWED_STD3_MAPPED;
324 } else {
325 status=MAPPED;
326 }
327 } else {
328 fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c);
329 }
330 // Print a new line where the status, the mapping or
331 // the character age change.
332 getAgeIfAssigned(c, age);
333 if( prevStart<c &&
334 (status!=prevStatus || mapping!=prevMapping || 0!=memcmp(prevAge, age, 4))
335 ) {
336 printLine(prevStart, c-1, prevStatus, prevMapping);
337 prevStart=c;
338 prevStatus=status;
339 prevMapping=mapping;
340 memcpy(prevAge, age, 4);
341 }
342 ++c;
343 }
344 // c==start is disallowed
345 if(prevStart<c) {
346 printLine(prevStart, c-1, prevStatus, prevMapping);
347 }
348 prevStart=c;
349 prevStatus=DISALLOWED;
350 prevMapping.remove();
351 getAgeIfAssigned(c, prevAge);
352 UChar32 end=iter.getCodepointEnd();
353 while(++c<=end) {
354 getAgeIfAssigned(c, age);
355 if(prevStart<c && 0!=memcmp(prevAge, age, 4)) {
356 printLine(prevStart, c-1, prevStatus, prevMapping);
357 prevStart=c;
358 memcpy(prevAge, age, 4);
359 }
360 }
361 }
362 if(prevStart<c) {
363 printLine(prevStart, c-1, prevStatus, prevMapping);
364 }
365 return 0;
366 }
367