// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  genuts46.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2010mar02
*   created by: Markus W. Scherer
*
* quick & dirty tool to recreate the UTS #46 data table according to the spec
*/
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string>
22 #include <string.h>
23 #include "unicode/utypes.h"
24 #include "unicode/errorcode.h"
25 #include "unicode/normalizer2.h"
26 #include "unicode/uniset.h"
27 #include "unicode/unistr.h"
28 #include "unicode/usetiter.h"
29 #include "unicode/usprep.h"
30 #include "sprpimpl.h"  // HACK
31 
32 /**
33  * icu::ErrorCode subclass for easy UErrorCode handling.
34  * The destructor calls handleFailure() which calls exit(errorCode) when isFailure().
35  */
36 class ExitingErrorCode : public icu::ErrorCode {
37 public:
38     /**
39      * @param loc A short string describing where the ExitingErrorCode is used.
40      */
ExitingErrorCode(const char * loc)41     ExitingErrorCode(const char *loc) : location(loc) {}
42     virtual ~ExitingErrorCode();
43 protected:
44     virtual void handleFailure() const;
45 private:
46     const char *location;
47 };
48 
~ExitingErrorCode()49 ExitingErrorCode::~ExitingErrorCode() {
50     // Safe because our handleFailure() does not throw exceptions.
51     if(isFailure()) { handleFailure(); }
52 }
53 
handleFailure() const54 void ExitingErrorCode::handleFailure() const {
55     fprintf(stderr, "error at %s: %s\n", location, errorName());
56     exit(errorCode);
57 }
58 
59 static int
toIDNA2003(const UStringPrepProfile * prep,UChar32 c,icu::UnicodeString & destString)60 toIDNA2003(const UStringPrepProfile *prep, UChar32 c, icu::UnicodeString &destString) {
61     UChar src[2];
62     int32_t srcLength=0;
63     U16_APPEND_UNSAFE(src, srcLength, c);
64     UChar *dest;
65     int32_t destLength;
66     dest=destString.getBuffer(32);
67     if(dest==NULL) {
68         return FALSE;
69     }
70     UErrorCode errorCode=U_ZERO_ERROR;
71     destLength=usprep_prepare(prep, src, srcLength,
72                               dest, destString.getCapacity(),
73                               USPREP_DEFAULT, NULL, &errorCode);
74     destString.releaseBuffer(destLength);
75     if(errorCode==U_STRINGPREP_PROHIBITED_ERROR) {
76         return -1;
77     } else {
78         // Returns FALSE=0 for U_STRINGPREP_UNASSIGNED_ERROR and processing errors,
79         // TRUE=1 if c is valid or mapped.
80         return U_SUCCESS(errorCode);
81     }
82 }
83 
/**
 * Status of a code point in the generated table.
 * The enumerator values are used as indexes into statusNames[],
 * so the two lists must be kept in the same order.
 */
enum Status {
    DISALLOWED,
    IGNORED,
    MAPPED,
    DEVIATION,
    VALID,
    DISALLOWED_STD3_VALID,
    DISALLOWED_STD3_MAPPED
};

/** Name printed for each Status value, indexed by the enum value. */
static const char *const statusNames[]={
    "disallowed",
    "ignored",
    "mapped",
    "deviation",
    "valid",
    "disallowed_STD3_valid",
    "disallowed_STD3_mapped"
};
92 
93 static void
printLine(UChar32 start,UChar32 end,Status status,const icu::UnicodeString & mapping)94 printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &mapping) {
95     if(start==end) {
96         printf("%04lX          ", (long)start);
97     } else {
98         printf("%04lX..%04lX    ", (long)start, (long)end);
99     }
100     printf("; %s", statusNames[status]);
101     if(status==MAPPED || status==DEVIATION || !mapping.isEmpty()) {
102         printf(" ;");
103         const UChar *buffer=mapping.getBuffer();
104         int32_t length=mapping.length();
105         int32_t i=0;
106         UChar32 c;
107         while(i<length) {
108             U16_NEXT(buffer, i, length, c);
109             printf(" %04lX", (long)c);
110         }
111     }
112     puts("");
113 }
114 
115 static void
getAgeIfAssigned(UChar32 c,UVersionInfo age)116 getAgeIfAssigned(UChar32 c, UVersionInfo age) {
117     if(u_isdefined(c)) {
118         u_charAge(c, age);
119     } else if(U_IS_UNICODE_NONCHAR(c)) {
120         age[0]=0;
121         age[1]=0;
122         age[2]=0;
123         age[3]=1;
124     } else {
125         memset(age, 0, 4);
126     }
127 }
128 
129 extern int
main(int argc,const char * argv[])130 main(int argc, const char *argv[]) {
131     ExitingErrorCode errorCode("genuts46");
132 
133     // predefined base sets
134     icu::UnicodeSet unassignedSet(UNICODE_STRING_SIMPLE("[:Cn:]"), errorCode);
135 
136     icu::UnicodeSet labelSeparators(
137         UNICODE_STRING_SIMPLE("[\\u002E\\u3002\\uFF0E\\uFF61]"), errorCode);
138 
139     icu::UnicodeSet mappedSet(
140         UNICODE_STRING_SIMPLE("[:Changes_When_NFKC_Casefolded:]"), errorCode);
141     mappedSet.removeAll(labelSeparators);  // simplifies checking of mapped characters
142 
143     icu::UnicodeSet baseValidSet(icu::UnicodeString(
144         "[[[[:^Changes_When_NFKC_Casefolded:]"
145         "-[:C:]-[:Z:]"
146         "-[:Block=Ideographic_Description_Characters:]]"
147         "[:ascii:]]-[.]]", -1, US_INV), errorCode);
148 
149     // Characters that are disallowed when STD3 rules are applied,
150     // but valid when STD3 rules are not applied.
151     icu::UnicodeSet disallowedSTD3Set(icu::UnicodeString(
152         "[[:ascii:]-[\\u002D.a-zA-Z0-9]]", -1, US_INV), errorCode);
153 
154     icu::UnicodeSet deviationSet(
155         UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode);
156     errorCode.assertSuccess();
157 
158     // derived sets
159     icu::LocalUStringPrepProfilePointer namePrep(usprep_openByType(USPREP_RFC3491_NAMEPREP, errorCode));
160     const icu::Normalizer2 *nfkc_cf=
161         icu::Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode);
162     errorCode.assertSuccess();
163 
164     // HACK: The StringPrep API performs a BiDi check according to the data.
165     // We need to override that for this data generation, by resetting an internal flag.
166     namePrep->checkBiDi=FALSE;
167 
168     icu::UnicodeSet baseExclusionSet;
169     icu::UnicodeString cString, mapping, namePrepResult;
170     for(UChar32 c=0; c<=0x10ffff; ++c) {
171         if(c==0xd800) {
172             c=0xe000;
173         }
174         int namePrepStatus=toIDNA2003(namePrep.getAlias(), c, namePrepResult);
175         if(namePrepStatus!=0) {
176             // get the UTS #46 base mapping value
177             switch(c) {
178             case 0xff0e:
179             case 0x3002:
180             case 0xff61:
181                 mapping.setTo(0x2e);
182                 break;
183             default:
184                 cString.setTo(c);
185                 nfkc_cf->normalize(cString, mapping, errorCode);
186                 break;
187             }
188             if(
189                 namePrepStatus>0 ?
190                     // c is valid or mapped in IDNA2003
191                     !labelSeparators.contains(c) && namePrepResult!=mapping :
192                     // namePrepStatus<0: c is prohibited in IDNA2003
193                     baseValidSet.contains(c) || (cString!=mapping && baseValidSet.containsAll(mapping))
194             ) {
195                 baseExclusionSet.add(c);
196             }
197         }
198     }
199 
200     icu::UnicodeSet disallowedSet(0, 0x10ffff);
201     disallowedSet.
202         removeAll(labelSeparators).
203         removeAll(deviationSet).
204         removeAll(mappedSet).
205         removeAll(baseValidSet).
206         addAll(baseExclusionSet).
207         addAll(unassignedSet);
208 
209     const icu::Normalizer2 *nfd=
210         icu::Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode);
211     errorCode.assertSuccess();
212 
213     icu::UnicodeSet ignoredSet;  // will be a subset of mappedSet
214     icu::UnicodeSet removeSet;
215     icu::UnicodeString nfdString;
216     {
217         icu::UnicodeSetIterator iter(mappedSet);
218         while(iter.next()) {
219             UChar32 c=iter.getCodepoint();
220             cString.setTo(c);
221             nfkc_cf->normalize(cString, mapping, errorCode);
222             if(!baseValidSet.containsAll(mapping)) {
223                 fprintf(stderr, "U+%04lX mapped -> disallowed: mapping not wholly in base valid set\n", (long)c);
224                 disallowedSet.add(c);
225                 removeSet.add(c);
226             } else if(mapping.isEmpty()) {
227                 ignoredSet.add(c);
228             }
229         }
230         mappedSet.removeAll(removeSet);
231     }
232     errorCode.assertSuccess();
233 
234     icu::UnicodeSet validSet(baseValidSet);
235     validSet.
236         removeAll(labelSeparators).  // non-ASCII label separators will be mapped in the end
237         removeAll(deviationSet).
238         removeAll(disallowedSet).
239         removeAll(mappedSet).
240         add(0x2e);  // not mapped, simply valid
241     UBool madeChange;
242     do {
243         madeChange=FALSE;
244         {
245             removeSet.clear();
246             icu::UnicodeSetIterator iter(validSet);
247             while(iter.next()) {
248                 UChar32 c=iter.getCodepoint();
249                 if(nfd->getDecomposition(c, nfdString) && !validSet.containsAll(nfdString)) {
250                     fprintf(stderr, "U+%04lX valid -> disallowed: NFD not wholly valid\n", (long)c);
251                     disallowedSet.add(c);
252                     removeSet.add(c);
253                     madeChange=TRUE;
254                 }
255             }
256             validSet.removeAll(removeSet);
257         }
258         {
259             removeSet.clear();
260             icu::UnicodeSetIterator iter(mappedSet);
261             while(iter.next()) {
262                 UChar32 c=iter.getCodepoint();
263                 cString.setTo(c);
264                 nfkc_cf->normalize(cString, mapping, errorCode);
265                 nfd->normalize(mapping, nfdString, errorCode);
266                 if(!validSet.containsAll(nfdString)) {
267                     fprintf(stderr, "U+%04lX mapped -> disallowed: NFD of mapping not wholly valid\n", (long)c);
268                     disallowedSet.add(c);
269                     removeSet.add(c);
270                     madeChange=TRUE;
271                 }
272             }
273             mappedSet.removeAll(removeSet);
274         }
275     } while(madeChange);
276     errorCode.assertSuccess();
277 
278     // finish up
279     labelSeparators.remove(0x2e).freeze();  // U+002E is simply valid
280     deviationSet.freeze();
281     ignoredSet.freeze();
282     validSet.freeze();
283     mappedSet.freeze();
284     disallowedSTD3Set.freeze();
285 
286     // output
287     UChar32 prevStart=0, c=0;
288     Status prevStatus=DISALLOWED_STD3_VALID, status;
289     icu::UnicodeString prevMapping;
290     UVersionInfo prevAge={ 1, 1, 0, 0 }, age;
291 
292     icu::UnicodeSetIterator iter(disallowedSet);
293     while(iter.nextRange()) {
294         UChar32 start=iter.getCodepoint();
295         while(c<start) {
296             mapping.remove();
297             if(labelSeparators.contains(c)) {
298                 status=MAPPED;
299                 mapping.setTo(0x2e);
300             } else if(deviationSet.contains(c)) {
301                 status=DEVIATION;
302                 cString.setTo(c);
303                 nfkc_cf->normalize(cString, mapping, errorCode);
304             } else if(ignoredSet.contains(c)) {
305                 status=IGNORED;
306             } else if(validSet.contains(c)) {
307                 if(disallowedSTD3Set.contains(c)) {
308                     fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: itself not STD3\n", (long)c);
309                     status=DISALLOWED_STD3_VALID;
310                 } else if( nfd->getDecomposition(c, nfdString) &&
311                     disallowedSTD3Set.containsSome(nfdString)
312                 ) {
313                     fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: NFD not wholly STD3\n", (long)c);
314                     status=DISALLOWED_STD3_VALID;
315                 } else {
316                     status=VALID;
317                 }
318             } else if(mappedSet.contains(c)) {
319                 cString.setTo(c);
320                 nfkc_cf->normalize(cString, mapping, errorCode);
321                 if(disallowedSTD3Set.containsSome(mapping)) {
322                     fprintf(stderr, "U+%04lX mapped -> disallowed_STD3_mapped\n", (long)c);
323                     status=DISALLOWED_STD3_MAPPED;
324                 } else {
325                     status=MAPPED;
326                 }
327             } else {
328                 fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c);
329             }
330             // Print a new line where the status, the mapping or
331             // the character age change.
332             getAgeIfAssigned(c, age);
333             if( prevStart<c &&
334                 (status!=prevStatus || mapping!=prevMapping || 0!=memcmp(prevAge, age, 4))
335             ) {
336                 printLine(prevStart, c-1, prevStatus, prevMapping);
337                 prevStart=c;
338                 prevStatus=status;
339                 prevMapping=mapping;
340                 memcpy(prevAge, age, 4);
341             }
342             ++c;
343         }
344         // c==start is disallowed
345         if(prevStart<c) {
346             printLine(prevStart, c-1, prevStatus, prevMapping);
347         }
348         prevStart=c;
349         prevStatus=DISALLOWED;
350         prevMapping.remove();
351         getAgeIfAssigned(c, prevAge);
352         UChar32 end=iter.getCodepointEnd();
353         while(++c<=end) {
354             getAgeIfAssigned(c, age);
355             if(prevStart<c && 0!=memcmp(prevAge, age, 4)) {
356                 printLine(prevStart, c-1, prevStatus, prevMapping);
357                 prevStart=c;
358                 memcpy(prevAge, age, 4);
359             }
360         }
361     }
362     if(prevStart<c) {
363         printLine(prevStart, c-1, prevStatus, prevMapping);
364     }
365     return 0;
366 }
367