• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2010-2014, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  uts46test.cpp
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2010may05
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_IDNA
20 
21 #include <string.h>
22 #include "unicode/bytestream.h"
23 #include "unicode/idna.h"
24 #include "unicode/localpointer.h"
25 #include "unicode/std_string.h"
26 #include "unicode/stringpiece.h"
27 #include "unicode/uidna.h"
28 #include "unicode/unistr.h"
29 #include "charstr.h"
30 #include "cmemory.h"
31 #include "intltest.h"
32 #include "punycode.h"
33 #include "uparse.h"
34 
35 class UTS46Test : public IntlTest {
36 public:
UTS46Test()37     UTS46Test() : trans(nullptr), nontrans(nullptr) {}
38     virtual ~UTS46Test();
39 
40     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=nullptr) override;
41     void TestAPI();
42     void TestNotSTD3();
43     void TestInvalidPunycodeDigits();
44     void TestACELabelEdgeCases();
45     void TestTooLong();
46     void TestSomeCases();
47     void IdnaTest();
48 
49     void checkIdnaTestResult(const char *line, const char *type,
50                              const UnicodeString &expected, const UnicodeString &result,
51                              const char *status, const IDNAInfo &info);
52     void idnaTestOneLine(char *fields[][2], UErrorCode &errorCode);
53 
54 private:
55     IDNA *trans, *nontrans;
56 };
57 
createUTS46Test()58 extern IntlTest *createUTS46Test() {
59     return new UTS46Test();
60 }
61 
~UTS46Test()62 UTS46Test::~UTS46Test() {
63     delete trans;
64     delete nontrans;
65 }
66 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)67 void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
68     if(exec) {
69         logln("TestSuite UTS46Test: ");
70         if(trans==nullptr) {
71             IcuTestErrorCode errorCode(*this, "init/createUTS46Instance()");
72             uint32_t commonOptions=
73                 UIDNA_USE_STD3_RULES|UIDNA_CHECK_BIDI|
74                 UIDNA_CHECK_CONTEXTJ|UIDNA_CHECK_CONTEXTO;
75             trans=IDNA::createUTS46Instance(commonOptions, errorCode);
76             nontrans=IDNA::createUTS46Instance(
77                 commonOptions|
78                 UIDNA_NONTRANSITIONAL_TO_ASCII|UIDNA_NONTRANSITIONAL_TO_UNICODE,
79                 errorCode);
80             if(errorCode.errDataIfFailureAndReset("createUTS46Instance()")) {
81                 name="";
82                 return;
83             }
84         }
85     }
86     TESTCASE_AUTO_BEGIN;
87     TESTCASE_AUTO(TestAPI);
88     TESTCASE_AUTO(TestNotSTD3);
89     TESTCASE_AUTO(TestInvalidPunycodeDigits);
90     TESTCASE_AUTO(TestACELabelEdgeCases);
91     TESTCASE_AUTO(TestTooLong);
92     TESTCASE_AUTO(TestSomeCases);
93     TESTCASE_AUTO(IdnaTest);
94     TESTCASE_AUTO_END;
95 }
96 
97 const uint32_t severeErrors=
98     UIDNA_ERROR_LEADING_COMBINING_MARK|
99     UIDNA_ERROR_DISALLOWED|
100     UIDNA_ERROR_PUNYCODE|
101     UIDNA_ERROR_LABEL_HAS_DOT|
102     UIDNA_ERROR_INVALID_ACE_LABEL;
103 
isASCII(const UnicodeString & str)104 static UBool isASCII(const UnicodeString &str) {
105     const char16_t *s=str.getBuffer();
106     int32_t length=str.length();
107     for(int32_t i=0; i<length; ++i) {
108         if(s[i]>=0x80) {
109             return false;
110         }
111     }
112     return true;
113 }
114 
115 class TestCheckedArrayByteSink : public CheckedArrayByteSink {
116 public:
TestCheckedArrayByteSink(char * outbuf,int32_t capacity)117     TestCheckedArrayByteSink(char* outbuf, int32_t capacity)
118             : CheckedArrayByteSink(outbuf, capacity), calledFlush(false) {}
Reset()119     virtual CheckedArrayByteSink& Reset() override {
120         CheckedArrayByteSink::Reset();
121         calledFlush = false;
122         return *this;
123     }
Flush()124     virtual void Flush() override { calledFlush = true; }
125     UBool calledFlush;
126 };
127 
TestAPI()128 void UTS46Test::TestAPI() {
129     UErrorCode errorCode=U_ZERO_ERROR;
130     UnicodeString result;
131     IDNAInfo info;
132     UnicodeString input=UNICODE_STRING_SIMPLE("www.eXample.cOm");
133     UnicodeString expected=UNICODE_STRING_SIMPLE("www.example.com");
134     trans->nameToASCII(input, result, info, errorCode);
135     if(U_FAILURE(errorCode) || info.hasErrors() || result!=expected) {
136         errln("T.nameToASCII(www.example.com) info.errors=%04lx result matches=%d %s",
137               (long)info.getErrors(), result==expected, u_errorName(errorCode));
138     }
139     errorCode=U_USELESS_COLLATOR_ERROR;
140     trans->nameToUnicode(input, result, info, errorCode);
141     if(errorCode!=U_USELESS_COLLATOR_ERROR || !result.isBogus()) {
142         errln("T.nameToUnicode(U_FAILURE) did not preserve the errorCode "
143               "or not result.setToBogus() - %s",
144               u_errorName(errorCode));
145     }
146     errorCode=U_ZERO_ERROR;
147     input.setToBogus();
148     result=UNICODE_STRING_SIMPLE("quatsch");
149     nontrans->labelToASCII(input, result, info, errorCode);
150     if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || !result.isBogus()) {
151         errln("N.labelToASCII(bogus) did not set illegal-argument-error "
152               "or not result.setToBogus() - %s",
153               u_errorName(errorCode));
154     }
155     errorCode=U_ZERO_ERROR;
156     input=UNICODE_STRING_SIMPLE("xn--bcher.de-65a");
157     expected=UNICODE_STRING_SIMPLE("xn--bcher\\uFFFDde-65a").unescape();
158     nontrans->labelToASCII(input, result, info, errorCode);
159     if( U_FAILURE(errorCode) ||
160         info.getErrors()!=(UIDNA_ERROR_LABEL_HAS_DOT|UIDNA_ERROR_INVALID_ACE_LABEL) ||
161         result!=expected
162     ) {
163         errln("N.labelToASCII(label-with-dot) failed with errors %04lx - %s",
164               info.getErrors(), u_errorName(errorCode));
165     }
166     // UTF-8
167     char buffer[100];
168     TestCheckedArrayByteSink sink(buffer, UPRV_LENGTHOF(buffer));
169     errorCode=U_ZERO_ERROR;
170     nontrans->labelToUnicodeUTF8(StringPiece((const char *)nullptr, 5), sink, info, errorCode);
171     if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || sink.NumberOfBytesWritten()!=0) {
172         errln("N.labelToUnicodeUTF8(StringPiece(nullptr, 5)) did not set illegal-argument-error ",
173               "or did output something - %s",
174               u_errorName(errorCode));
175     }
176 
177     sink.Reset();
178     errorCode=U_ZERO_ERROR;
179     nontrans->nameToASCII_UTF8(StringPiece(), sink, info, errorCode);
180     if(U_FAILURE(errorCode) || sink.NumberOfBytesWritten()!=0 || !sink.calledFlush) {
181         errln("N.nameToASCII_UTF8(empty) failed - %s",
182               u_errorName(errorCode));
183     }
184 
185     static const char s[]={ 0x61, (char)0xc3, (char)0x9f };
186     sink.Reset();
187     errorCode=U_USELESS_COLLATOR_ERROR;
188     nontrans->nameToUnicodeUTF8(StringPiece(s, 3), sink, info, errorCode);
189     if(errorCode!=U_USELESS_COLLATOR_ERROR || sink.NumberOfBytesWritten()!=0) {
190         errln("N.nameToUnicode_UTF8(U_FAILURE) did not preserve the errorCode "
191               "or did output something - %s",
192               u_errorName(errorCode));
193     }
194 
195     sink.Reset();
196     errorCode=U_ZERO_ERROR;
197     trans->labelToUnicodeUTF8(StringPiece(s, 3), sink, info, errorCode);
198     if( U_FAILURE(errorCode) || sink.NumberOfBytesWritten()!=3 ||
199         buffer[0]!=0x61 || buffer[1]!=0x73 || buffer[2]!=0x73 ||
200         !sink.calledFlush
201     ) {
202         errln("T.labelToUnicodeUTF8(a sharp-s) failed - %s",
203               u_errorName(errorCode));
204     }
205 
206     sink.Reset();
207     errorCode=U_ZERO_ERROR;
208     // "eXampLe.cOm"
209     static const char eX[]={ 0x65, 0x58, 0x61, 0x6d, 0x70, 0x4c, 0x65, 0x2e, 0x63, 0x4f, 0x6d, 0 };
210     // "example.com"
211     static const char ex[]={ 0x65, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x2e, 0x63, 0x6f, 0x6d };
212     trans->nameToUnicodeUTF8(eX, sink, info, errorCode);
213     if( U_FAILURE(errorCode) || sink.NumberOfBytesWritten()!=11 ||
214         0!=memcmp(ex, buffer, 11) || !sink.calledFlush
215     ) {
216         errln("T.nameToUnicodeUTF8(eXampLe.cOm) failed - %s",
217               u_errorName(errorCode));
218     }
219 }
220 
TestNotSTD3()221 void UTS46Test::TestNotSTD3() {
222     IcuTestErrorCode errorCode(*this, "TestNotSTD3()");
223     char buffer[400];
224     LocalPointer<IDNA> not3(IDNA::createUTS46Instance(UIDNA_CHECK_BIDI, errorCode));
225     if(errorCode.isFailure()) {
226         return;
227     }
228     UnicodeString input=UNICODE_STRING_SIMPLE("\\u0000A_2+2=4\\u000A.e\\u00DFen.net").unescape();
229     UnicodeString result;
230     IDNAInfo info;
231     if( not3->nameToUnicode(input, result, info, errorCode)!=
232             UNICODE_STRING_SIMPLE("\\u0000a_2+2=4\\u000A.essen.net").unescape() ||
233         info.hasErrors()
234     ) {
235         prettify(result).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
236         errln("notSTD3.nameToUnicode(non-LDH ASCII) unexpected errors %04lx string %s",
237               (long)info.getErrors(), buffer);
238     }
239     // A space (BiDi class WS) is not allowed in a BiDi domain name.
240     input=UNICODE_STRING_SIMPLE("a z.xn--4db.edu");
241     not3->nameToASCII(input, result, info, errorCode);
242     if(result!=input || info.getErrors()!=UIDNA_ERROR_BIDI) {
243         errln("notSTD3.nameToASCII(ASCII-with-space.alef.edu) failed");
244     }
245 }
246 
TestInvalidPunycodeDigits()247 void UTS46Test::TestInvalidPunycodeDigits() {
248     IcuTestErrorCode errorCode(*this, "TestInvalidPunycodeDigits()");
249     LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
250     if(errorCode.isFailure()) {
251         return;
252     }
253     UnicodeString result;
254     {
255         IDNAInfo info;
256         idna->nameToUnicode(u"xn--pleP", result, info, errorCode);  // P=U+0050
257         assertFalse("nameToUnicode() should succeed",
258                     (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
259         assertEquals("normal result", u"ᔼᔴ", result);
260     }
261     {
262         IDNAInfo info;
263         idna->nameToUnicode(u"xn--pleѐ", result, info, errorCode);  // ends with non-ASCII U+0450
264         assertTrue("nameToUnicode() should detect non-ASCII",
265                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
266     }
267 
268     // Test with ASCII characters adjacent to LDH.
269     {
270         IDNAInfo info;
271         idna->nameToUnicode(u"xn--ple/", result, info, errorCode);
272         assertTrue("nameToUnicode() should detect '/'",
273                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
274     }
275 
276     {
277         IDNAInfo info;
278         idna->nameToUnicode(u"xn--ple:", result, info, errorCode);
279         assertTrue("nameToUnicode() should detect ':'",
280                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
281     }
282 
283     {
284         IDNAInfo info;
285         idna->nameToUnicode(u"xn--ple@", result, info, errorCode);
286         assertTrue("nameToUnicode() should detect '@'",
287                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
288     }
289 
290     {
291         IDNAInfo info;
292         idna->nameToUnicode(u"xn--ple[", result, info, errorCode);
293         assertTrue("nameToUnicode() should detect '['",
294                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
295     }
296 
297     {
298         IDNAInfo info;
299         idna->nameToUnicode(u"xn--ple`", result, info, errorCode);
300         assertTrue("nameToUnicode() should detect '`'",
301                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
302     }
303 
304     {
305         IDNAInfo info;
306         idna->nameToUnicode(u"xn--ple{", result, info, errorCode);
307         assertTrue("nameToUnicode() should detect '{'",
308                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
309     }
310 }
311 
TestACELabelEdgeCases()312 void UTS46Test::TestACELabelEdgeCases() {
313     // In IDNA2008, these labels fail the round-trip validation from comparing
314     // the ToUnicode input with the back-to-ToASCII output.
315     IcuTestErrorCode errorCode(*this, "TestACELabelEdgeCases()");
316     LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
317     if(errorCode.isFailure()) {
318         return;
319     }
320     UnicodeString result;
321     {
322         IDNAInfo info;
323         idna->labelToUnicode(u"xn--", result, info, errorCode);
324         assertTrue("empty xn--", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
325     }
326     {
327         IDNAInfo info;
328         idna->labelToUnicode(u"xN--ASCII-", result, info, errorCode);
329         assertTrue("nothing but ASCII", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
330     }
331     {
332         // Different error: The Punycode decoding procedure does not consume the last delimiter
333         // if it is right after the xn-- so the main decoding loop fails because the hyphen
334         // is not a valid Punycode digit.
335         IDNAInfo info;
336         idna->labelToUnicode(u"Xn---", result, info, errorCode);
337         assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
338     }
339 }
340 
TestTooLong()341 void UTS46Test::TestTooLong() {
342     // ICU-13727: Limit input length for n^2 algorithm
343     // where well-formed strings are at most 59 characters long.
344     int32_t count = 50000;
345     UnicodeString s(count, u'a', count);  // capacity, code point, count
346     char16_t dest[60000];
347     UErrorCode errorCode = U_ZERO_ERROR;
348     u_strToPunycode(s.getBuffer(), s.length(), dest, UPRV_LENGTHOF(dest), nullptr, &errorCode);
349     assertEquals("encode: expected an error for too-long input", U_INPUT_TOO_LONG_ERROR, errorCode);
350     errorCode = U_ZERO_ERROR;
351     u_strFromPunycode(s.getBuffer(), s.length(), dest, UPRV_LENGTHOF(dest), nullptr, &errorCode);
352     assertEquals("decode: expected an error for too-long input", U_INPUT_TOO_LONG_ERROR, errorCode);
353 }
354 
// One row of the static test table. Field order matters: rows are
// written as positional aggregate initializers { s, o, u, errors }.
struct TestCase {
    // Input string.
    const char *s;
    // Options string (Nontransitional/Transitional/Both).
    const char *o;
    // Expected Unicode result string.
    const char *u;
    // Expected error bits (0 if no errors are expected).
    uint32_t errors;
};
362 
363 static const TestCase testCases[]={
364     { "www.eXample.cOm", "B",  // all ASCII
365       "www.example.com", 0 },
366     { "B\\u00FCcher.de", "B",  // u-umlaut
367       "b\\u00FCcher.de", 0 },
368     { "\\u00D6BB", "B",  // O-umlaut
369       "\\u00F6bb", 0 },
370     { "fa\\u00DF.de", "N",  // sharp s
371       "fa\\u00DF.de", 0 },
372     { "fa\\u00DF.de", "T",  // sharp s
373       "fass.de", 0 },
374     { "XN--fA-hia.dE", "B",  // sharp s in Punycode
375       "fa\\u00DF.de", 0 },
376     { "\\u03B2\\u03CC\\u03BB\\u03BF\\u03C2.com", "N",  // Greek with final sigma
377       "\\u03B2\\u03CC\\u03BB\\u03BF\\u03C2.com", 0 },
378     { "\\u03B2\\u03CC\\u03BB\\u03BF\\u03C2.com", "T",  // Greek with final sigma
379       "\\u03B2\\u03CC\\u03BB\\u03BF\\u03C3.com", 0 },
380     { "xn--nxasmm1c", "B",  // Greek with final sigma in Punycode
381       "\\u03B2\\u03CC\\u03BB\\u03BF\\u03C2", 0 },
382     { "www.\\u0DC1\\u0DCA\\u200D\\u0DBB\\u0DD3.com", "N",  // "Sri" in "Sri Lanka" has a ZWJ
383       "www.\\u0DC1\\u0DCA\\u200D\\u0DBB\\u0DD3.com", 0 },
384     { "www.\\u0DC1\\u0DCA\\u200D\\u0DBB\\u0DD3.com", "T",  // "Sri" in "Sri Lanka" has a ZWJ
385       "www.\\u0DC1\\u0DCA\\u0DBB\\u0DD3.com", 0 },
386     { "www.xn--10cl1a0b660p.com", "B",  // "Sri" in Punycode
387       "www.\\u0DC1\\u0DCA\\u200D\\u0DBB\\u0DD3.com", 0 },
388     { "\\u0646\\u0627\\u0645\\u0647\\u200C\\u0627\\u06CC", "N",  // ZWNJ
389       "\\u0646\\u0627\\u0645\\u0647\\u200C\\u0627\\u06CC", 0 },
390     { "\\u0646\\u0627\\u0645\\u0647\\u200C\\u0627\\u06CC", "T",  // ZWNJ
391       "\\u0646\\u0627\\u0645\\u0647\\u0627\\u06CC", 0 },
392     { "xn--mgba3gch31f060k.com", "B",  // ZWNJ in Punycode
393       "\\u0646\\u0627\\u0645\\u0647\\u200C\\u0627\\u06CC.com", 0 },
394     { "a.b\\uFF0Ec\\u3002d\\uFF61", "B",
395       "a.b.c.d.", 0 },
396     { "U\\u0308.xn--tda", "B",  // U+umlaut.u-umlaut
397       "\\u00FC.\\u00FC", 0 },
398     { "xn--u-ccb", "B",  // u+umlaut in Punycode
399       "xn--u-ccb\\uFFFD", UIDNA_ERROR_INVALID_ACE_LABEL },
400     { "a\\u2488com", "B",  // contains 1-dot
401       "a\\uFFFDcom", UIDNA_ERROR_DISALLOWED },
402     { "xn--a-ecp.ru", "B",  // contains 1-dot in Punycode
403       "xn--a-ecp\\uFFFD.ru", UIDNA_ERROR_INVALID_ACE_LABEL },
404     { "xn--0.pt", "B",  // invalid Punycode
405       "xn--0\\uFFFD.pt", UIDNA_ERROR_PUNYCODE },
406     { "xn--a.pt", "B",  // U+0080
407       "xn--a\\uFFFD.pt", UIDNA_ERROR_INVALID_ACE_LABEL },
408     { "xn--a-\\u00C4.pt", "B",  // invalid Punycode
409       "xn--a-\\u00E4.pt", UIDNA_ERROR_PUNYCODE },
410     { "\\u65E5\\u672C\\u8A9E\\u3002\\uFF2A\\uFF30", "B",  // Japanese with fullwidth ".jp"
411       "\\u65E5\\u672C\\u8A9E.jp", 0 },
412     { "\\u2615", "B", "\\u2615", 0 },  // Unicode 4.0 HOT BEVERAGE
413     // many deviation characters, test the special mapping code
414     { "1.a\\u00DF\\u200C\\u200Db\\u200C\\u200Dc\\u00DF\\u00DF\\u00DF\\u00DFd"
415       "\\u03C2\\u03C3\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFe"
416       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFx"
417       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFy"
418       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u0302\\u00DFz", "N",
419       "1.a\\u00DF\\u200C\\u200Db\\u200C\\u200Dc\\u00DF\\u00DF\\u00DF\\u00DFd"
420       "\\u03C2\\u03C3\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFe"
421       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFx"
422       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFy"
423       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u0302\\u00DFz",
424       UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_CONTEXTJ },
425     { "1.a\\u00DF\\u200C\\u200Db\\u200C\\u200Dc\\u00DF\\u00DF\\u00DF\\u00DFd"
426       "\\u03C2\\u03C3\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFe"
427       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFx"
428       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFy"
429       "\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u0302\\u00DFz", "T",
430       "1.assbcssssssssd"
431       "\\u03C3\\u03C3sssssssssssssssse"
432       "ssssssssssssssssssssx"
433       "ssssssssssssssssssssy"
434       "sssssssssssssss\\u015Dssz", UIDNA_ERROR_LABEL_TOO_LONG },
435     // "xn--bss" with deviation characters
436     { "\\u200Cx\\u200Dn\\u200C-\\u200D-b\\u00DF", "N",
437       "\\u200Cx\\u200Dn\\u200C-\\u200D-b\\u00DF", UIDNA_ERROR_CONTEXTJ },
438     { "\\u200Cx\\u200Dn\\u200C-\\u200D-b\\u00DF", "T",
439       "\\u5919", 0 },
440     // "xn--bssffl" written as:
441     // 02E3 MODIFIER LETTER SMALL X
442     // 034F COMBINING GRAPHEME JOINER (ignored)
443     // 2115 DOUBLE-STRUCK CAPITAL N
444     // 200B ZERO WIDTH SPACE (ignored)
445     // FE63 SMALL HYPHEN-MINUS
446     // 00AD SOFT HYPHEN (ignored)
447     // FF0D FULLWIDTH HYPHEN-MINUS
448     // 180C MONGOLIAN FREE VARIATION SELECTOR TWO (ignored)
449     // 212C SCRIPT CAPITAL B
450     // FE00 VARIATION SELECTOR-1 (ignored)
451     // 017F LATIN SMALL LETTER LONG S
452     // 2064 INVISIBLE PLUS (ignored)
453     // 1D530 MATHEMATICAL FRAKTUR SMALL S
454     // E01EF VARIATION SELECTOR-256 (ignored)
455     // FB04 LATIN SMALL LIGATURE FFL
456     { "\\u02E3\\u034F\\u2115\\u200B\\uFE63\\u00AD\\uFF0D\\u180C"
457       "\\u212C\\uFE00\\u017F\\u2064\\U0001D530\\U000E01EF\\uFB04", "B",
458       "\\u5921\\u591E\\u591C\\u5919", 0 },
459     { "123456789012345678901234567890123456789012345678901234567890123."
460       "123456789012345678901234567890123456789012345678901234567890123."
461       "123456789012345678901234567890123456789012345678901234567890123."
462       "1234567890123456789012345678901234567890123456789012345678901", "B",
463       "123456789012345678901234567890123456789012345678901234567890123."
464       "123456789012345678901234567890123456789012345678901234567890123."
465       "123456789012345678901234567890123456789012345678901234567890123."
466       "1234567890123456789012345678901234567890123456789012345678901", 0 },
467     { "123456789012345678901234567890123456789012345678901234567890123."
468       "123456789012345678901234567890123456789012345678901234567890123."
469       "123456789012345678901234567890123456789012345678901234567890123."
470       "1234567890123456789012345678901234567890123456789012345678901.", "B",
471       "123456789012345678901234567890123456789012345678901234567890123."
472       "123456789012345678901234567890123456789012345678901234567890123."
473       "123456789012345678901234567890123456789012345678901234567890123."
474       "1234567890123456789012345678901234567890123456789012345678901.", 0 },
475     // Domain name >256 characters, forces slow path in UTF-8 processing.
476     { "123456789012345678901234567890123456789012345678901234567890123."
477       "123456789012345678901234567890123456789012345678901234567890123."
478       "123456789012345678901234567890123456789012345678901234567890123."
479       "123456789012345678901234567890123456789012345678901234567890123."
480       "12345678901234567890123456789012345678901234567890123456789012", "B",
481       "123456789012345678901234567890123456789012345678901234567890123."
482       "123456789012345678901234567890123456789012345678901234567890123."
483       "123456789012345678901234567890123456789012345678901234567890123."
484       "123456789012345678901234567890123456789012345678901234567890123."
485       "12345678901234567890123456789012345678901234567890123456789012",
486       UIDNA_ERROR_DOMAIN_NAME_TOO_LONG },
487     { "123456789012345678901234567890123456789012345678901234567890123."
488       "123456789012345678901234567890123456789012345678901234567890123."
489       "123456789012345678901234567890123456789012345678901234567890123."
490       "123456789012345678901234567890123456789012345678901234567890123."
491       "1234567890123456789012345678901234567890123456789\\u05D0", "B",
492       "123456789012345678901234567890123456789012345678901234567890123."
493       "123456789012345678901234567890123456789012345678901234567890123."
494       "123456789012345678901234567890123456789012345678901234567890123."
495       "123456789012345678901234567890123456789012345678901234567890123."
496       "1234567890123456789012345678901234567890123456789\\u05D0",
497       UIDNA_ERROR_DOMAIN_NAME_TOO_LONG|UIDNA_ERROR_BIDI },
498     { "123456789012345678901234567890123456789012345678901234567890123."
499       "1234567890123456789012345678901234567890123456789012345678901234."
500       "123456789012345678901234567890123456789012345678901234567890123."
501       "123456789012345678901234567890123456789012345678901234567890", "B",
502       "123456789012345678901234567890123456789012345678901234567890123."
503       "1234567890123456789012345678901234567890123456789012345678901234."
504       "123456789012345678901234567890123456789012345678901234567890123."
505       "123456789012345678901234567890123456789012345678901234567890",
506       UIDNA_ERROR_LABEL_TOO_LONG },
507     { "123456789012345678901234567890123456789012345678901234567890123."
508       "1234567890123456789012345678901234567890123456789012345678901234."
509       "123456789012345678901234567890123456789012345678901234567890123."
510       "123456789012345678901234567890123456789012345678901234567890.", "B",
511       "123456789012345678901234567890123456789012345678901234567890123."
512       "1234567890123456789012345678901234567890123456789012345678901234."
513       "123456789012345678901234567890123456789012345678901234567890123."
514       "123456789012345678901234567890123456789012345678901234567890.",
515       UIDNA_ERROR_LABEL_TOO_LONG },
516     { "123456789012345678901234567890123456789012345678901234567890123."
517       "1234567890123456789012345678901234567890123456789012345678901234."
518       "123456789012345678901234567890123456789012345678901234567890123."
519       "1234567890123456789012345678901234567890123456789012345678901", "B",
520       "123456789012345678901234567890123456789012345678901234567890123."
521       "1234567890123456789012345678901234567890123456789012345678901234."
522       "123456789012345678901234567890123456789012345678901234567890123."
523       "1234567890123456789012345678901234567890123456789012345678901",
524       UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG },
525     // label length 63: xn--1234567890123456789012345678901234567890123456789012345-9te
526     { "\\u00E41234567890123456789012345678901234567890123456789012345", "B",
527       "\\u00E41234567890123456789012345678901234567890123456789012345", 0 },
528     { "1234567890\\u00E41234567890123456789012345678901234567890123456", "B",
529       "1234567890\\u00E41234567890123456789012345678901234567890123456", UIDNA_ERROR_LABEL_TOO_LONG },
530     { "123456789012345678901234567890123456789012345678901234567890123."
531       "1234567890\\u00E4123456789012345678901234567890123456789012345."
532       "123456789012345678901234567890123456789012345678901234567890123."
533       "1234567890123456789012345678901234567890123456789012345678901", "B",
534       "123456789012345678901234567890123456789012345678901234567890123."
535       "1234567890\\u00E4123456789012345678901234567890123456789012345."
536       "123456789012345678901234567890123456789012345678901234567890123."
537       "1234567890123456789012345678901234567890123456789012345678901", 0 },
538     { "123456789012345678901234567890123456789012345678901234567890123."
539       "1234567890\\u00E4123456789012345678901234567890123456789012345."
540       "123456789012345678901234567890123456789012345678901234567890123."
541       "1234567890123456789012345678901234567890123456789012345678901.", "B",
542       "123456789012345678901234567890123456789012345678901234567890123."
543       "1234567890\\u00E4123456789012345678901234567890123456789012345."
544       "123456789012345678901234567890123456789012345678901234567890123."
545       "1234567890123456789012345678901234567890123456789012345678901.", 0 },
546     { "123456789012345678901234567890123456789012345678901234567890123."
547       "1234567890\\u00E4123456789012345678901234567890123456789012345."
548       "123456789012345678901234567890123456789012345678901234567890123."
549       "12345678901234567890123456789012345678901234567890123456789012", "B",
550       "123456789012345678901234567890123456789012345678901234567890123."
551       "1234567890\\u00E4123456789012345678901234567890123456789012345."
552       "123456789012345678901234567890123456789012345678901234567890123."
553       "12345678901234567890123456789012345678901234567890123456789012",
554       UIDNA_ERROR_DOMAIN_NAME_TOO_LONG },
555     { "123456789012345678901234567890123456789012345678901234567890123."
556       "1234567890\\u00E41234567890123456789012345678901234567890123456."
557       "123456789012345678901234567890123456789012345678901234567890123."
558       "123456789012345678901234567890123456789012345678901234567890", "B",
559       "123456789012345678901234567890123456789012345678901234567890123."
560       "1234567890\\u00E41234567890123456789012345678901234567890123456."
561       "123456789012345678901234567890123456789012345678901234567890123."
562       "123456789012345678901234567890123456789012345678901234567890",
563       UIDNA_ERROR_LABEL_TOO_LONG },
564     { "123456789012345678901234567890123456789012345678901234567890123."
565       "1234567890\\u00E41234567890123456789012345678901234567890123456."
566       "123456789012345678901234567890123456789012345678901234567890123."
567       "123456789012345678901234567890123456789012345678901234567890.", "B",
568       "123456789012345678901234567890123456789012345678901234567890123."
569       "1234567890\\u00E41234567890123456789012345678901234567890123456."
570       "123456789012345678901234567890123456789012345678901234567890123."
571       "123456789012345678901234567890123456789012345678901234567890.",
572       UIDNA_ERROR_LABEL_TOO_LONG },
573     { "123456789012345678901234567890123456789012345678901234567890123."
574       "1234567890\\u00E41234567890123456789012345678901234567890123456."
575       "123456789012345678901234567890123456789012345678901234567890123."
576       "1234567890123456789012345678901234567890123456789012345678901", "B",
577       "123456789012345678901234567890123456789012345678901234567890123."
578       "1234567890\\u00E41234567890123456789012345678901234567890123456."
579       "123456789012345678901234567890123456789012345678901234567890123."
580       "1234567890123456789012345678901234567890123456789012345678901",
581       UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG },
582     // hyphen errors and empty-label errors
583     // Ticket #10883: ToUnicode also checks for empty labels.
584     { ".", "B", ".", UIDNA_ERROR_EMPTY_LABEL },
585     { "\\uFF0E", "B", ".", UIDNA_ERROR_EMPTY_LABEL },
586     // "xn---q----jra"=="-q--a-umlaut-"
587     { "a.b..-q--a-.e", "B", "a.b..-q--a-.e",
588       UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|
589       UIDNA_ERROR_HYPHEN_3_4 },
590     { "a.b..-q--\\u00E4-.e", "B", "a.b..-q--\\u00E4-.e",
591       UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|
592       UIDNA_ERROR_HYPHEN_3_4 },
593     { "a.b..xn---q----jra.e", "B", "a.b..-q--\\u00E4-.e",
594       UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|
595       UIDNA_ERROR_HYPHEN_3_4 },
596     { "a..c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL },
597     { "a.xn--.c", "B", "a.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
598     { "a.-b.", "B", "a.-b.", UIDNA_ERROR_LEADING_HYPHEN },
599     { "a.b-.c", "B", "a.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
600     { "a.-.c", "B", "a.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },
601     { "a.bc--de.f", "B", "a.bc--de.f", UIDNA_ERROR_HYPHEN_3_4 },
602     { "\\u00E4.\\u00AD.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL },
603     { "\\u00E4.xn--.c", "B", "\\u00E4.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
604     { "\\u00E4.-b.", "B", "\\u00E4.-b.", UIDNA_ERROR_LEADING_HYPHEN },
605     { "\\u00E4.b-.c", "B", "\\u00E4.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
606     { "\\u00E4.-.c", "B", "\\u00E4.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },
607     { "\\u00E4.bc--de.f", "B", "\\u00E4.bc--de.f", UIDNA_ERROR_HYPHEN_3_4 },
608     { "a.b.\\u0308c.d", "B", "a.b.\\uFFFDc.d", UIDNA_ERROR_LEADING_COMBINING_MARK },
609     { "a.b.xn--c-bcb.d", "B",
610       "a.b.xn--c-bcb\\uFFFD.d", UIDNA_ERROR_LEADING_COMBINING_MARK|UIDNA_ERROR_INVALID_ACE_LABEL },
611     // BiDi
612     { "A0", "B", "a0", 0 },
613     { "0A", "B", "0a", 0 },  // all-LTR is ok to start with a digit (EN)
614     { "0A.\\u05D0", "B",  // ASCII label does not start with L/R/AL
615       "0a.\\u05D0", UIDNA_ERROR_BIDI },
616     { "c.xn--0-eha.xn--4db", "B",  // 2nd label does not start with L/R/AL
617       "c.0\\u00FC.\\u05D0", UIDNA_ERROR_BIDI },
618     { "b-.\\u05D0", "B",  // label does not end with L/EN
619       "b-.\\u05D0", UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI },
620     { "d.xn----dha.xn--4db", "B",  // 2nd label does not end with L/EN
621       "d.\\u00FC-.\\u05D0", UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI },
622     { "a\\u05D0", "B", "a\\u05D0", UIDNA_ERROR_BIDI },  // first dir != last dir
623     { "\\u05D0\\u05C7", "B", "\\u05D0\\u05C7", 0 },
624     { "\\u05D09\\u05C7", "B", "\\u05D09\\u05C7", 0 },
625     { "\\u05D0a\\u05C7", "B", "\\u05D0a\\u05C7", UIDNA_ERROR_BIDI },  // first dir != last dir
626     { "\\u05D0\\u05EA", "B", "\\u05D0\\u05EA", 0 },
627     { "\\u05D0\\u05F3\\u05EA", "B", "\\u05D0\\u05F3\\u05EA", 0 },
628     { "a\\u05D0Tz", "B", "a\\u05D0tz", UIDNA_ERROR_BIDI },  // mixed dir
629     { "\\u05D0T\\u05EA", "B", "\\u05D0t\\u05EA", UIDNA_ERROR_BIDI },  // mixed dir
630     { "\\u05D07\\u05EA", "B", "\\u05D07\\u05EA", 0 },
631     { "\\u05D0\\u0667\\u05EA", "B", "\\u05D0\\u0667\\u05EA", 0 },  // Arabic 7 in the middle
632     { "a7\\u0667z", "B", "a7\\u0667z", UIDNA_ERROR_BIDI },  // AN digit in LTR
633     { "a7\\u0667", "B", "a7\\u0667", UIDNA_ERROR_BIDI },  // AN digit in LTR
634     { "\\u05D07\\u0667\\u05EA", "B",  // mixed EN/AN digits in RTL
635       "\\u05D07\\u0667\\u05EA", UIDNA_ERROR_BIDI },
636     { "\\u05D07\\u0667", "B",  // mixed EN/AN digits in RTL
637       "\\u05D07\\u0667", UIDNA_ERROR_BIDI },
638     // ZWJ
639     { "\\u0BB9\\u0BCD\\u200D", "N", "\\u0BB9\\u0BCD\\u200D", 0 },  // Virama+ZWJ
640     { "\\u0BB9\\u200D", "N", "\\u0BB9\\u200D", UIDNA_ERROR_CONTEXTJ },  // no Virama
641     { "\\u200D", "N", "\\u200D", UIDNA_ERROR_CONTEXTJ },  // no Virama
642     // ZWNJ
643     { "\\u0BB9\\u0BCD\\u200C", "N", "\\u0BB9\\u0BCD\\u200C", 0 },  // Virama+ZWNJ
644     { "\\u0BB9\\u200C", "N", "\\u0BB9\\u200C", UIDNA_ERROR_CONTEXTJ },  // no Virama
645     { "\\u200C", "N", "\\u200C", UIDNA_ERROR_CONTEXTJ },  // no Virama
646     { "\\u0644\\u0670\\u200C\\u06ED\\u06EF", "N",  // Joining types D T ZWNJ T R
647       "\\u0644\\u0670\\u200C\\u06ED\\u06EF", 0 },
648     { "\\u0644\\u0670\\u200C\\u06EF", "N",  // D T ZWNJ R
649       "\\u0644\\u0670\\u200C\\u06EF", 0 },
650     { "\\u0644\\u200C\\u06ED\\u06EF", "N",  // D ZWNJ T R
651       "\\u0644\\u200C\\u06ED\\u06EF", 0 },
652     { "\\u0644\\u200C\\u06EF", "N",  // D ZWNJ R
653       "\\u0644\\u200C\\u06EF", 0 },
654     { "\\u0644\\u0670\\u200C\\u06ED", "N",  // D T ZWNJ T
655       "\\u0644\\u0670\\u200C\\u06ED", UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ },
656     { "\\u06EF\\u200C\\u06EF", "N",  // R ZWNJ R
657       "\\u06EF\\u200C\\u06EF", UIDNA_ERROR_CONTEXTJ },
658     { "\\u0644\\u200C", "N",  // D ZWNJ
659       "\\u0644\\u200C", UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ },
660     { "\\u0660\\u0661", "B",  // Arabic-Indic Digits alone
661       "\\u0660\\u0661", UIDNA_ERROR_BIDI },
662     { "\\u06F0\\u06F1", "B",  // Extended Arabic-Indic Digits alone
663       "\\u06F0\\u06F1", 0 },
664     { "\\u0660\\u06F1", "B",  // Mixed Arabic-Indic Digits
665       "\\u0660\\u06F1", UIDNA_ERROR_CONTEXTO_DIGITS|UIDNA_ERROR_BIDI },
666     // All of the CONTEXTO "Would otherwise have been DISALLOWED" characters
667     // in their correct contexts,
668     // then each in incorrect context.
669     { "l\\u00B7l\\u4E00\\u0375\\u03B1\\u05D0\\u05F3\\u05F4\\u30FB", "B",
670       "l\\u00B7l\\u4E00\\u0375\\u03B1\\u05D0\\u05F3\\u05F4\\u30FB", UIDNA_ERROR_BIDI },
671     { "l\\u00B7", "B",
672       "l\\u00B7", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
673     { "\\u00B7l", "B",
674       "\\u00B7l", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
675     { "\\u0375", "B",
676       "\\u0375", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
677     { "\\u03B1\\u05F3", "B",
678       "\\u03B1\\u05F3", UIDNA_ERROR_CONTEXTO_PUNCTUATION|UIDNA_ERROR_BIDI },
679     { "\\u05F4", "B",
680       "\\u05F4", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
681     { "l\\u30FB", "B",
682       "l\\u30FB", UIDNA_ERROR_CONTEXTO_PUNCTUATION },
683     // Ticket #8137: UTS #46 toUnicode() fails with non-ASCII labels that turn
684     // into 15 characters (UChars).
685     // The bug was in u_strFromPunycode() which did not write the last character
686     // if it just so fit into the end of the destination buffer.
687     // The UTS #46 code gives a default-capacity UnicodeString as the destination buffer,
688     // and the internal UnicodeString capacity is currently 15 UChars on 64-bit machines
689     // but 13 on 32-bit machines.
690     // Label with 15 UChars, for 64-bit-machine testing:
691     { "aaaaaaaaaaaaa\\u00FCa.de", "B", "aaaaaaaaaaaaa\\u00FCa.de", 0 },
692     { "xn--aaaaaaaaaaaaaa-ssb.de", "B", "aaaaaaaaaaaaa\\u00FCa.de", 0 },
693     { "abschlu\\u00DFpr\\u00FCfung.de", "N", "abschlu\\u00DFpr\\u00FCfung.de", 0 },
694     { "xn--abschluprfung-hdb15b.de", "B", "abschlu\\u00DFpr\\u00FCfung.de", 0 },
695     // Label with 13 UChars, for 32-bit-machine testing:
696     { "xn--aaaaaaaaaaaa-nlb.de", "B", "aaaaaaaaaaa\\u00FCa.de", 0 },
697     { "xn--schluprfung-z6a39a.de", "B", "schlu\\u00DFpr\\u00FCfung.de", 0 },
698     // { "", "B",
699     //   "", 0 },
700 };
701 
TestSomeCases()702 void UTS46Test::TestSomeCases() {
703     IcuTestErrorCode errorCode(*this, "TestSomeCases");
704     char buffer[400], buffer2[400];
705     int32_t i;
706     for(i=0; i<UPRV_LENGTHOF(testCases); ++i) {
707         const TestCase &testCase=testCases[i];
708         UnicodeString input(ctou(testCase.s));
709         UnicodeString expected(ctou(testCase.u));
710         // ToASCII/ToUnicode, transitional/nontransitional
711         UnicodeString aT, uT, aN, uN;
712         IDNAInfo aTInfo, uTInfo, aNInfo, uNInfo;
713         trans->nameToASCII(input, aT, aTInfo, errorCode);
714         trans->nameToUnicode(input, uT, uTInfo, errorCode);
715         nontrans->nameToASCII(input, aN, aNInfo, errorCode);
716         nontrans->nameToUnicode(input, uN, uNInfo, errorCode);
717         if(errorCode.errIfFailureAndReset("first-level processing [%d/%s] %s",
718                                           (int)i, testCase.o, testCase.s)
719         ) {
720             continue;
721         }
722         // ToUnicode does not set length-overflow errors.
723         uint32_t uniErrors=testCase.errors&~
724             (UIDNA_ERROR_LABEL_TOO_LONG|
725              UIDNA_ERROR_DOMAIN_NAME_TOO_LONG);
726         char mode=testCase.o[0];
727         if(mode=='B' || mode=='N') {
728             if(uNInfo.getErrors()!=uniErrors) {
729                 errln("N.nameToUnicode([%d] %s) unexpected errors %04lx",
730                       (int)i, testCase.s, (long)uNInfo.getErrors());
731                 continue;
732             }
733             if(uN!=expected) {
734                 prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
735                 errln("N.nameToUnicode([%d] %s) unexpected string %s",
736                       (int)i, testCase.s, buffer);
737                 continue;
738             }
739             if(aNInfo.getErrors()!=testCase.errors) {
740                 errln("N.nameToASCII([%d] %s) unexpected errors %04lx",
741                       (int)i, testCase.s, (long)aNInfo.getErrors());
742                 continue;
743             }
744         }
745         if(mode=='B' || mode=='T') {
746             if(uTInfo.getErrors()!=uniErrors) {
747                 errln("T.nameToUnicode([%d] %s) unexpected errors %04lx",
748                       (int)i, testCase.s, (long)uTInfo.getErrors());
749                 continue;
750             }
751             if(uT!=expected) {
752                 prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
753                 errln("T.nameToUnicode([%d] %s) unexpected string %s",
754                       (int)i, testCase.s, buffer);
755                 continue;
756             }
757             if(aTInfo.getErrors()!=testCase.errors) {
758                 errln("T.nameToASCII([%d] %s) unexpected errors %04lx",
759                       (int)i, testCase.s, (long)aTInfo.getErrors());
760                 continue;
761             }
762         }
763         // ToASCII is all-ASCII if no severe errors
764         if((aNInfo.getErrors()&severeErrors)==0 && !isASCII(aN)) {
765             prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
766             errln("N.nameToASCII([%d] %s) (errors %04lx) result is not ASCII %s",
767                   (int)i, testCase.s, aNInfo.getErrors(), buffer);
768             continue;
769         }
770         if((aTInfo.getErrors()&severeErrors)==0 && !isASCII(aT)) {
771             prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
772             errln("T.nameToASCII([%d] %s) (errors %04lx) result is not ASCII %s",
773                   (int)i, testCase.s, aTInfo.getErrors(), buffer);
774             continue;
775         }
776         if(verbose) {
777             char m= mode=='B' ? mode : 'N';
778             prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
779             logln("%c.nameToASCII([%d] %s) (errors %04lx) result string: %s",
780                   m, (int)i, testCase.s, aNInfo.getErrors(), buffer);
781             if(mode!='B') {
782                 prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
783                 logln("T.nameToASCII([%d] %s) (errors %04lx) result string: %s",
784                       (int)i, testCase.s, aTInfo.getErrors(), buffer);
785             }
786         }
787         // second-level processing
788         UnicodeString aTuN, uTaN, aNuN, uNaN;
789         IDNAInfo aTuNInfo, uTaNInfo, aNuNInfo, uNaNInfo;
790         nontrans->nameToUnicode(aT, aTuN, aTuNInfo, errorCode);
791         nontrans->nameToASCII(uT, uTaN, uTaNInfo, errorCode);
792         nontrans->nameToUnicode(aN, aNuN, aNuNInfo, errorCode);
793         nontrans->nameToASCII(uN, uNaN, uNaNInfo, errorCode);
794         if(errorCode.errIfFailureAndReset("second-level processing [%d/%s] %s",
795                                           (int)i, testCase.o, testCase.s)
796         ) {
797             continue;
798         }
799         if(aN!=uNaN) {
800             prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
801             prettify(uNaN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
802             errln("N.nameToASCII([%d] %s)!=N.nameToUnicode().N.nameToASCII() "
803                   "(errors %04lx) %s vs. %s",
804                   (int)i, testCase.s, aNInfo.getErrors(), buffer, buffer2);
805             continue;
806         }
807         if(aT!=uTaN) {
808             prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
809             prettify(uTaN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
810             errln("T.nameToASCII([%d] %s)!=T.nameToUnicode().N.nameToASCII() "
811                   "(errors %04lx) %s vs. %s",
812                   (int)i, testCase.s, aNInfo.getErrors(), buffer, buffer2);
813             continue;
814         }
815         if(uN!=aNuN) {
816             prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
817             prettify(aNuN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
818             errln("N.nameToUnicode([%d] %s)!=N.nameToASCII().N.nameToUnicode() "
819                   "(errors %04lx) %s vs. %s",
820                   (int)i, testCase.s, uNInfo.getErrors(), buffer, buffer2);
821             continue;
822         }
823         if(uT!=aTuN) {
824             prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
825             prettify(aTuN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
826             errln("T.nameToUnicode([%d] %s)!=T.nameToASCII().N.nameToUnicode() "
827                   "(errors %04lx) %s vs. %s",
828                   (int)i, testCase.s, uNInfo.getErrors(), buffer, buffer2);
829             continue;
830         }
831         // labelToUnicode
832         UnicodeString aTL, uTL, aNL, uNL;
833         IDNAInfo aTLInfo, uTLInfo, aNLInfo, uNLInfo;
834         trans->labelToASCII(input, aTL, aTLInfo, errorCode);
835         trans->labelToUnicode(input, uTL, uTLInfo, errorCode);
836         nontrans->labelToASCII(input, aNL, aNLInfo, errorCode);
837         nontrans->labelToUnicode(input, uNL, uNLInfo, errorCode);
838         if(errorCode.errIfFailureAndReset("labelToXYZ processing [%d/%s] %s",
839                                           (int)i, testCase.o, testCase.s)
840         ) {
841             continue;
842         }
843         if(aN.indexOf((char16_t)0x2e)<0) {
844             if(aN!=aNL || aNInfo.getErrors()!=aNLInfo.getErrors()) {
845                 prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
846                 prettify(aNL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
847                 errln("N.nameToASCII([%d] %s)!=N.labelToASCII() "
848                       "(errors %04lx vs %04lx) %s vs. %s",
849                       (int)i, testCase.s, aNInfo.getErrors(), aNLInfo.getErrors(), buffer, buffer2);
850                 continue;
851             }
852         } else {
853             if((aNLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
854                 errln("N.labelToASCII([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
855                       (int)i, testCase.s, (long)aNLInfo.getErrors());
856                 continue;
857             }
858         }
859         if(aT.indexOf((char16_t)0x2e)<0) {
860             if(aT!=aTL || aTInfo.getErrors()!=aTLInfo.getErrors()) {
861                 prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
862                 prettify(aTL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
863                 errln("T.nameToASCII([%d] %s)!=T.labelToASCII() "
864                       "(errors %04lx vs %04lx) %s vs. %s",
865                       (int)i, testCase.s, aTInfo.getErrors(), aTLInfo.getErrors(), buffer, buffer2);
866                 continue;
867             }
868         } else {
869             if((aTLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
870                 errln("T.labelToASCII([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
871                       (int)i, testCase.s, (long)aTLInfo.getErrors());
872                 continue;
873             }
874         }
875         if(uN.indexOf((char16_t)0x2e)<0) {
876             if(uN!=uNL || uNInfo.getErrors()!=uNLInfo.getErrors()) {
877                 prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
878                 prettify(uNL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
879                 errln("N.nameToUnicode([%d] %s)!=N.labelToUnicode() "
880                       "(errors %04lx vs %04lx) %s vs. %s",
881                       (int)i, testCase.s, uNInfo.getErrors(), uNLInfo.getErrors(), buffer, buffer2);
882                 continue;
883             }
884         } else {
885             if((uNLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
886                 errln("N.labelToUnicode([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
887                       (int)i, testCase.s, (long)uNLInfo.getErrors());
888                 continue;
889             }
890         }
891         if(uT.indexOf((char16_t)0x2e)<0) {
892             if(uT!=uTL || uTInfo.getErrors()!=uTLInfo.getErrors()) {
893                 prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
894                 prettify(uTL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
895                 errln("T.nameToUnicode([%d] %s)!=T.labelToUnicode() "
896                       "(errors %04lx vs %04lx) %s vs. %s",
897                       (int)i, testCase.s, uTInfo.getErrors(), uTLInfo.getErrors(), buffer, buffer2);
898                 continue;
899             }
900         } else {
901             if((uTLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
902                 errln("T.labelToUnicode([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
903                       (int)i, testCase.s, (long)uTLInfo.getErrors());
904                 continue;
905             }
906         }
907         // Differences between transitional and nontransitional processing
908         if(mode=='B') {
909             if( aNInfo.isTransitionalDifferent() ||
910                 aTInfo.isTransitionalDifferent() ||
911                 uNInfo.isTransitionalDifferent() ||
912                 uTInfo.isTransitionalDifferent() ||
913                 aNLInfo.isTransitionalDifferent() ||
914                 aTLInfo.isTransitionalDifferent() ||
915                 uNLInfo.isTransitionalDifferent() ||
916                 uTLInfo.isTransitionalDifferent()
917             ) {
918                 errln("B.process([%d] %s) isTransitionalDifferent()", (int)i, testCase.s);
919                 continue;
920             }
921             if( aN!=aT || uN!=uT || aNL!=aTL || uNL!=uTL ||
922                 aNInfo.getErrors()!=aTInfo.getErrors() || uNInfo.getErrors()!=uTInfo.getErrors() ||
923                 aNLInfo.getErrors()!=aTLInfo.getErrors() || uNLInfo.getErrors()!=uTLInfo.getErrors()
924             ) {
925                 errln("N.process([%d] %s) vs. T.process() different errors or result strings",
926                       (int)i, testCase.s);
927                 continue;
928             }
929         } else {
930             if( !aNInfo.isTransitionalDifferent() ||
931                 !aTInfo.isTransitionalDifferent() ||
932                 !uNInfo.isTransitionalDifferent() ||
933                 !uTInfo.isTransitionalDifferent() ||
934                 !aNLInfo.isTransitionalDifferent() ||
935                 !aTLInfo.isTransitionalDifferent() ||
936                 !uNLInfo.isTransitionalDifferent() ||
937                 !uTLInfo.isTransitionalDifferent()
938             ) {
939                 errln("%s.process([%d] %s) !isTransitionalDifferent()",
940                       testCase.o, (int)i, testCase.s);
941                 continue;
942             }
943             if(aN==aT || uN==uT || aNL==aTL || uNL==uTL) {
944                 errln("N.process([%d] %s) vs. T.process() same result strings",
945                       (int)i, testCase.s);
946                 continue;
947             }
948         }
949         // UTF-8
950         std::string input8, aT8, uT8, aN8, uN8;
951         StringByteSink<std::string> aT8Sink(&aT8), uT8Sink(&uT8), aN8Sink(&aN8), uN8Sink(&uN8);
952         IDNAInfo aT8Info, uT8Info, aN8Info, uN8Info;
953         input.toUTF8String(input8);
954         trans->nameToASCII_UTF8(input8, aT8Sink, aT8Info, errorCode);
955         trans->nameToUnicodeUTF8(input8, uT8Sink, uT8Info, errorCode);
956         nontrans->nameToASCII_UTF8(input8, aN8Sink, aN8Info, errorCode);
957         nontrans->nameToUnicodeUTF8(input8, uN8Sink, uN8Info, errorCode);
958         if(errorCode.errIfFailureAndReset("UTF-8 processing [%d/%s] %s",
959                                           (int)i, testCase.o, testCase.s)
960         ) {
961             continue;
962         }
963         UnicodeString aT16(UnicodeString::fromUTF8(aT8));
964         UnicodeString uT16(UnicodeString::fromUTF8(uT8));
965         UnicodeString aN16(UnicodeString::fromUTF8(aN8));
966         UnicodeString uN16(UnicodeString::fromUTF8(uN8));
967         if( aN8Info.getErrors()!=aNInfo.getErrors() ||
968             uN8Info.getErrors()!=uNInfo.getErrors()
969         ) {
970             errln("N.xyzUTF8([%d] %s) vs. UTF-16 processing different errors %04lx vs. %04lx",
971                   (int)i, testCase.s,
972                   (long)aN8Info.getErrors(), (long)aNInfo.getErrors());
973             continue;
974         }
975         if( aT8Info.getErrors()!=aTInfo.getErrors() ||
976             uT8Info.getErrors()!=uTInfo.getErrors()
977         ) {
978             errln("T.xyzUTF8([%d] %s) vs. UTF-16 processing different errors %04lx vs. %04lx",
979                   (int)i, testCase.s,
980                   (long)aT8Info.getErrors(), (long)aTInfo.getErrors());
981             continue;
982         }
983         if(aT16!=aT || uT16!=uT || aN16!=aN || uN16!=uN) {
984             errln("%s.xyzUTF8([%d] %s) vs. UTF-16 processing different string results",
985                   testCase.o, (int)i, testCase.s, (long)aTInfo.getErrors());
986             continue;
987         }
988         if( aT8Info.isTransitionalDifferent()!=aTInfo.isTransitionalDifferent() ||
989             uT8Info.isTransitionalDifferent()!=uTInfo.isTransitionalDifferent() ||
990             aN8Info.isTransitionalDifferent()!=aNInfo.isTransitionalDifferent() ||
991             uN8Info.isTransitionalDifferent()!=uNInfo.isTransitionalDifferent()
992         ) {
993             errln("%s.xyzUTF8([%d] %s) vs. UTF-16 processing different isTransitionalDifferent()",
994                   testCase.o, (int)i, testCase.s);
995             continue;
996         }
997     }
998 }
999 
1000 namespace {
1001 
1002 const int32_t kNumFields = 7;
1003 
1004 void U_CALLCONV
idnaTestLineFn(void * context,char * fields[][2],int32_t,UErrorCode * pErrorCode)1005 idnaTestLineFn(void *context,
1006                char *fields[][2], int32_t /* fieldCount */,
1007                UErrorCode *pErrorCode) {
1008     reinterpret_cast<UTS46Test *>(context)->idnaTestOneLine(fields, *pErrorCode);
1009 }
1010 
s16FromField(char * (& field)[2])1011 UnicodeString s16FromField(char *(&field)[2]) {
1012     int32_t length = (int32_t)(field[1] - field[0]);
1013     return UnicodeString::fromUTF8(StringPiece(field[0], length)).trim().unescape();
1014 }
1015 
statusFromField(char * (& field)[2])1016 std::string statusFromField(char *(&field)[2]) {
1017     const char *start = u_skipWhitespace(field[0]);
1018     std::string status;
1019     if (start != field[1]) {
1020         int32_t length = (int32_t)(field[1] - start);
1021         while (length > 0 && (start[length - 1] == u' ' || start[length - 1] == u'\t')) {
1022             --length;
1023         }
1024         status.assign(start, length);
1025     }
1026     return status;
1027 }
1028 
1029 }  // namespace
1030 
checkIdnaTestResult(const char * line,const char * type,const UnicodeString & expected,const UnicodeString & result,const char * status,const IDNAInfo & info)1031 void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
1032                                     const UnicodeString &expected, const UnicodeString &result,
1033                                     const char *status, const IDNAInfo &info) {
1034     // An error in toUnicode or toASCII is indicated by a value in square brackets,
1035     // such as "[B5 B6]".
1036     UBool expectedHasErrors = false;
1037     if (*status != 0) {
1038         if (*status != u'[') {
1039             errln("%s  status field does not start with '[': %s\n    %s", type, status, line);
1040         }
1041         if (strcmp(status, reinterpret_cast<const char*>(u8"[]")) != 0) {
1042             expectedHasErrors = true;
1043         }
1044     }
1045     if (expectedHasErrors != info.hasErrors()) {
1046         errln("%s  expected errors %s %d != %d = actual has errors: %04lx\n    %s",
1047               type, status, expectedHasErrors, info.hasErrors(), (long)info.getErrors(), line);
1048     }
1049     if (!expectedHasErrors && expected != result) {
1050         errln("%s  expected != actual\n    %s", type, line);
1051         errln(UnicodeString(u"    ") + expected);
1052         errln(UnicodeString(u"    ") + result);
1053     }
1054 }
1055 
idnaTestOneLine(char * fields[][2],UErrorCode & errorCode)1056 void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
1057     // IdnaTestV2.txt (since Unicode 11)
1058     // Column 1: source
1059     // The source string to be tested
1060     UnicodeString source = s16FromField(fields[0]);
1061 
1062     // Column 2: toUnicode
1063     // The result of applying toUnicode to the source, with Transitional_Processing=false.
1064     // A blank value means the same as the source value.
1065     UnicodeString toUnicode = s16FromField(fields[1]);
1066     if (toUnicode.isEmpty()) {
1067         toUnicode = source;
1068     }
1069 
1070     // Column 3: toUnicodeStatus
1071     // A set of status codes, each corresponding to a particular test.
1072     // A blank value means [].
1073     std::string toUnicodeStatus = statusFromField(fields[2]);
1074 
1075     // Column 4: toAsciiN
1076     // The result of applying toASCII to the source, with Transitional_Processing=false.
1077     // A blank value means the same as the toUnicode value.
1078     UnicodeString toAsciiN = s16FromField(fields[3]);
1079     if (toAsciiN.isEmpty()) {
1080         toAsciiN = toUnicode;
1081     }
1082 
1083     // Column 5: toAsciiNStatus
1084     // A set of status codes, each corresponding to a particular test.
1085     // A blank value means the same as the toUnicodeStatus value.
1086     std::string toAsciiNStatus = statusFromField(fields[4]);
1087     if (toAsciiNStatus.empty()) {
1088         toAsciiNStatus = toUnicodeStatus;
1089     }
1090 
1091     // Column 6: toAsciiT
1092     // The result of applying toASCII to the source, with Transitional_Processing=true.
1093     // A blank value means the same as the toAsciiN value.
1094     UnicodeString toAsciiT = s16FromField(fields[5]);
1095     if (toAsciiT.isEmpty()) {
1096         toAsciiT = toAsciiN;
1097     }
1098 
1099     // Column 7: toAsciiTStatus
1100     // A set of status codes, each corresponding to a particular test.
1101     // A blank value means the same as the toAsciiNStatus value.
1102     std::string toAsciiTStatus = statusFromField(fields[6]);
1103     if (toAsciiTStatus.empty()) {
1104         toAsciiTStatus = toAsciiNStatus;
1105     }
1106 
1107     // ToASCII/ToUnicode, transitional/nontransitional
1108     UnicodeString uN, aN, aT;
1109     IDNAInfo uNInfo, aNInfo, aTInfo;
1110     nontrans->nameToUnicode(source, uN, uNInfo, errorCode);
1111     checkIdnaTestResult(fields[0][0], "toUnicodeNontrans", toUnicode, uN,
1112                         toUnicodeStatus.c_str(), uNInfo);
1113     nontrans->nameToASCII(source, aN, aNInfo, errorCode);
1114     checkIdnaTestResult(fields[0][0], "toASCIINontrans", toAsciiN, aN,
1115                         toAsciiNStatus.c_str(), aNInfo);
1116     trans->nameToASCII(source, aT, aTInfo, errorCode);
1117     checkIdnaTestResult(fields[0][0], "toASCIITrans", toAsciiT, aT,
1118                         toAsciiTStatus.c_str(), aTInfo);
1119 }
1120 
1121 // TODO: de-duplicate
1122 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
1123 
1124 // http://www.unicode.org/Public/idna/latest/IdnaTest.txt
IdnaTest()1125 void UTS46Test::IdnaTest() {
1126     IcuTestErrorCode errorCode(*this, "IdnaTest");
1127     const char *sourceTestDataPath = getSourceTestData(errorCode);
1128     if (errorCode.errIfFailureAndReset("unable to find the source/test/testdata "
1129                                        "folder (getSourceTestData())")) {
1130         return;
1131     }
1132     CharString path(sourceTestDataPath, errorCode);
1133     path.appendPathPart("IdnaTestV2.txt", errorCode);
1134     LocalStdioFilePointer idnaTestFile(fopen(path.data(), "r"));
1135     if (idnaTestFile.isNull()) {
1136         errln("unable to open %s", path.data());
1137         return;
1138     }
1139 
1140     // Columns (c1, c2,...) are separated by semicolons.
1141     // Leading and trailing spaces and tabs in each column are ignored.
1142     // Comments are indicated with hash marks.
1143     char *fields[kNumFields][2];
1144     u_parseDelimitedFile(path.data(), ';', fields, kNumFields, idnaTestLineFn, this, errorCode);
1145     if (errorCode.errIfFailureAndReset("error parsing IdnaTest.txt")) {
1146         return;
1147     }
1148 }
1149 
1150 #endif  // UCONFIG_NO_IDNA
1151