// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 2010-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  uts46.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2010mar09
*   created by: Markus W. Scherer
*/
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_IDNA
20 
21 #include "unicode/bytestream.h"
22 #include "unicode/idna.h"
23 #include "unicode/normalizer2.h"
24 #include "unicode/uscript.h"
25 #include "unicode/ustring.h"
26 #include "unicode/utf16.h"
27 #include "bytesinkutil.h"
28 #include "cmemory.h"
29 #include "cstring.h"
30 #include "punycode.h"
31 #include "ubidi_props.h"
32 
// Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:
//
// The domain name length limit is 255 octets in an internal DNS representation
// where the last ("root") label is the empty label
// represented by length byte 0 alone.
// In a conventional string, this translates to 253 characters, or 254
// if there is a trailing dot for the root label.
40 
41 U_NAMESPACE_BEGIN
42 
43 // Severe errors which usually result in a U+FFFD replacement character in the result string.
44 const uint32_t severeErrors=
45     UIDNA_ERROR_LEADING_COMBINING_MARK|
46     UIDNA_ERROR_DISALLOWED|
47     UIDNA_ERROR_PUNYCODE|
48     UIDNA_ERROR_LABEL_HAS_DOT|
49     UIDNA_ERROR_INVALID_ACE_LABEL;
50 
51 static inline UBool
isASCIIString(const UnicodeString & dest)52 isASCIIString(const UnicodeString &dest) {
53     const char16_t *s=dest.getBuffer();
54     const char16_t *limit=s+dest.length();
55     while(s<limit) {
56         if(*s++>0x7f) {
57             return false;
58         }
59     }
60     return true;
61 }
62 
63 static UBool
64 isASCIIOkBiDi(const char16_t *s, int32_t length);
65 
66 static UBool
67 isASCIIOkBiDi(const char *s, int32_t length);
68 
// IDNA class default implementations -------------------------------------- ***
70 
~IDNA()71 IDNA::~IDNA() {}
72 
73 void
labelToASCII_UTF8(StringPiece label,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const74 IDNA::labelToASCII_UTF8(StringPiece label, ByteSink &dest,
75                         IDNAInfo &info, UErrorCode &errorCode) const {
76     if(U_SUCCESS(errorCode)) {
77         UnicodeString destString;
78         labelToASCII(UnicodeString::fromUTF8(label), destString,
79                      info, errorCode).toUTF8(dest);
80     }
81 }
82 
83 void
labelToUnicodeUTF8(StringPiece label,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const84 IDNA::labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
85                          IDNAInfo &info, UErrorCode &errorCode) const {
86     if(U_SUCCESS(errorCode)) {
87         UnicodeString destString;
88         labelToUnicode(UnicodeString::fromUTF8(label), destString,
89                        info, errorCode).toUTF8(dest);
90     }
91 }
92 
93 void
nameToASCII_UTF8(StringPiece name,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const94 IDNA::nameToASCII_UTF8(StringPiece name, ByteSink &dest,
95                        IDNAInfo &info, UErrorCode &errorCode) const {
96     if(U_SUCCESS(errorCode)) {
97         UnicodeString destString;
98         nameToASCII(UnicodeString::fromUTF8(name), destString,
99                     info, errorCode).toUTF8(dest);
100     }
101 }
102 
103 void
nameToUnicodeUTF8(StringPiece name,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const104 IDNA::nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
105                         IDNAInfo &info, UErrorCode &errorCode) const {
106     if(U_SUCCESS(errorCode)) {
107         UnicodeString destString;
108         nameToUnicode(UnicodeString::fromUTF8(name), destString,
109                       info, errorCode).toUTF8(dest);
110     }
111 }
112 
// UTS46 class declaration ------------------------------------------------- ***
114 
115 class UTS46 : public IDNA {
116 public:
117     UTS46(uint32_t options, UErrorCode &errorCode);
118     virtual ~UTS46();
119 
120     virtual UnicodeString &
121     labelToASCII(const UnicodeString &label, UnicodeString &dest,
122                  IDNAInfo &info, UErrorCode &errorCode) const override;
123 
124     virtual UnicodeString &
125     labelToUnicode(const UnicodeString &label, UnicodeString &dest,
126                    IDNAInfo &info, UErrorCode &errorCode) const override;
127 
128     virtual UnicodeString &
129     nameToASCII(const UnicodeString &name, UnicodeString &dest,
130                 IDNAInfo &info, UErrorCode &errorCode) const override;
131 
132     virtual UnicodeString &
133     nameToUnicode(const UnicodeString &name, UnicodeString &dest,
134                   IDNAInfo &info, UErrorCode &errorCode) const override;
135 
136     virtual void
137     labelToASCII_UTF8(StringPiece label, ByteSink &dest,
138                       IDNAInfo &info, UErrorCode &errorCode) const override;
139 
140     virtual void
141     labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
142                        IDNAInfo &info, UErrorCode &errorCode) const override;
143 
144     virtual void
145     nameToASCII_UTF8(StringPiece name, ByteSink &dest,
146                      IDNAInfo &info, UErrorCode &errorCode) const override;
147 
148     virtual void
149     nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
150                       IDNAInfo &info, UErrorCode &errorCode) const override;
151 
152 private:
153     UnicodeString &
154     process(const UnicodeString &src,
155             UBool isLabel, UBool toASCII,
156             UnicodeString &dest,
157             IDNAInfo &info, UErrorCode &errorCode) const;
158 
159     void
160     processUTF8(StringPiece src,
161                 UBool isLabel, UBool toASCII,
162                 ByteSink &dest,
163                 IDNAInfo &info, UErrorCode &errorCode) const;
164 
165     UnicodeString &
166     processUnicode(const UnicodeString &src,
167                    int32_t labelStart, int32_t mappingStart,
168                    UBool isLabel, UBool toASCII,
169                    UnicodeString &dest,
170                    IDNAInfo &info, UErrorCode &errorCode) const;
171 
172     // returns the new dest.length()
173     int32_t
174     mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
175                 UErrorCode &errorCode) const;
176 
177     // returns the new label length
178     int32_t
179     processLabel(UnicodeString &dest,
180                  int32_t labelStart, int32_t labelLength,
181                  UBool toASCII,
182                  IDNAInfo &info, UErrorCode &errorCode) const;
183     int32_t
184     markBadACELabel(UnicodeString &dest,
185                     int32_t labelStart, int32_t labelLength,
186                     UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const;
187 
188     void
189     checkLabelBiDi(const char16_t *label, int32_t labelLength, IDNAInfo &info) const;
190 
191     UBool
192     isLabelOkContextJ(const char16_t *label, int32_t labelLength) const;
193 
194     void
195     checkLabelContextO(const char16_t *label, int32_t labelLength, IDNAInfo &info) const;
196 
197     const Normalizer2 &uts46Norm2;  // uts46.nrm
198     uint32_t options;
199 };
200 
201 IDNA *
createUTS46Instance(uint32_t options,UErrorCode & errorCode)202 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) {
203     if(U_SUCCESS(errorCode)) {
204         IDNA *idna=new UTS46(options, errorCode);
205         if(idna==nullptr) {
206             errorCode=U_MEMORY_ALLOCATION_ERROR;
207         } else if(U_FAILURE(errorCode)) {
208             delete idna;
209             idna=nullptr;
210         }
211         return idna;
212     } else {
213         return nullptr;
214     }
215 }
216 
// UTS46 implementation ---------------------------------------------------- ***
218 
UTS46(uint32_t opt,UErrorCode & errorCode)219 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode)
220         : uts46Norm2(*Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, errorCode)),
221           options(opt) {}
222 
~UTS46()223 UTS46::~UTS46() {}
224 
225 UnicodeString &
labelToASCII(const UnicodeString & label,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const226 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest,
227                     IDNAInfo &info, UErrorCode &errorCode) const {
228     return process(label, true, true, dest, info, errorCode);
229 }
230 
231 UnicodeString &
labelToUnicode(const UnicodeString & label,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const232 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest,
233                       IDNAInfo &info, UErrorCode &errorCode) const {
234     return process(label, true, false, dest, info, errorCode);
235 }
236 
237 UnicodeString &
nameToASCII(const UnicodeString & name,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const238 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest,
239                    IDNAInfo &info, UErrorCode &errorCode) const {
240     process(name, false, true, dest, info, errorCode);
241     if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 &&
242         isASCIIString(dest) &&
243         (dest.length()>254 || dest[253]!=0x2e)
244     ) {
245         info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
246     }
247     return dest;
248 }
249 
250 UnicodeString &
nameToUnicode(const UnicodeString & name,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const251 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest,
252                      IDNAInfo &info, UErrorCode &errorCode) const {
253     return process(name, false, false, dest, info, errorCode);
254 }
255 
256 void
labelToASCII_UTF8(StringPiece label,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const257 UTS46::labelToASCII_UTF8(StringPiece label, ByteSink &dest,
258                          IDNAInfo &info, UErrorCode &errorCode) const {
259     processUTF8(label, true, true, dest, info, errorCode);
260 }
261 
262 void
labelToUnicodeUTF8(StringPiece label,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const263 UTS46::labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
264                           IDNAInfo &info, UErrorCode &errorCode) const {
265     processUTF8(label, true, false, dest, info, errorCode);
266 }
267 
268 void
nameToASCII_UTF8(StringPiece name,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const269 UTS46::nameToASCII_UTF8(StringPiece name, ByteSink &dest,
270                         IDNAInfo &info, UErrorCode &errorCode) const {
271     processUTF8(name, false, true, dest, info, errorCode);
272 }
273 
274 void
nameToUnicodeUTF8(StringPiece name,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const275 UTS46::nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
276                          IDNAInfo &info, UErrorCode &errorCode) const {
277     processUTF8(name, false, false, dest, info, errorCode);
278 }
279 
// UTS #46 data for ASCII characters.
// The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
// and passes through all other ASCII characters.
// If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed
// using this data.
// The ASCII fastpath also uses this data.
// Indexed by the ASCII code point (0..0x7f), 16 entries per row.
// Values: -1=disallowed  0==valid  1==mapped (lowercase)
static const int8_t asciiData[128]={
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};
301 
302 UnicodeString &
process(const UnicodeString & src,UBool isLabel,UBool toASCII,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const303 UTS46::process(const UnicodeString &src,
304                UBool isLabel, UBool toASCII,
305                UnicodeString &dest,
306                IDNAInfo &info, UErrorCode &errorCode) const {
307     // uts46Norm2.normalize() would do all of this error checking and setup,
308     // but with the ASCII fastpath we do not always call it, and do not
309     // call it first.
310     if(U_FAILURE(errorCode)) {
311         dest.setToBogus();
312         return dest;
313     }
314     const char16_t *srcArray=src.getBuffer();
315     if(&dest==&src || srcArray==nullptr) {
316         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
317         dest.setToBogus();
318         return dest;
319     }
320     // Arguments are fine, reset output values.
321     dest.remove();
322     info.reset();
323     int32_t srcLength=src.length();
324     if(srcLength==0) {
325         info.errors|=UIDNA_ERROR_EMPTY_LABEL;
326         return dest;
327     }
328     char16_t *destArray=dest.getBuffer(srcLength);
329     if(destArray==nullptr) {
330         errorCode=U_MEMORY_ALLOCATION_ERROR;
331         return dest;
332     }
333     // ASCII fastpath
334     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
335     int32_t labelStart=0;
336     int32_t i;
337     for(i=0;; ++i) {
338         if(i==srcLength) {
339             if(toASCII) {
340                 if((i-labelStart)>63) {
341                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
342                 }
343                 // There is a trailing dot if labelStart==i.
344                 if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
345                     info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
346                 }
347             }
348             info.errors|=info.labelErrors;
349             dest.releaseBuffer(i);
350             return dest;
351         }
352         char16_t c=srcArray[i];
353         if(c>0x7f) {
354             break;
355         }
356         int cData=asciiData[c];
357         if(cData>0) {
358             destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
359         } else if(cData<0 && disallowNonLDHDot) {
360             break;  // Replacing with U+FFFD can be complicated for toASCII.
361         } else {
362             destArray[i]=c;
363             if(c==0x2d) {  // hyphen
364                 if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
365                     // "??--..." is Punycode or forbidden.
366                     ++i;  // '-' was copied to dest already
367                     break;
368                 }
369                 if(i==labelStart) {
370                     // label starts with "-"
371                     info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
372                 }
373                 if((i+1)==srcLength || srcArray[i+1]==0x2e) {
374                     // label ends with "-"
375                     info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
376                 }
377             } else if(c==0x2e) {  // dot
378                 if(isLabel) {
379                     // Replacing with U+FFFD can be complicated for toASCII.
380                     ++i;  // '.' was copied to dest already
381                     break;
382                 }
383                 if(i==labelStart) {
384                     info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
385                 }
386                 if(toASCII && (i-labelStart)>63) {
387                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
388                 }
389                 info.errors|=info.labelErrors;
390                 info.labelErrors=0;
391                 labelStart=i+1;
392             }
393         }
394     }
395     info.errors|=info.labelErrors;
396     dest.releaseBuffer(i);
397     processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
398     if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
399         (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
400     ) {
401         info.errors|=UIDNA_ERROR_BIDI;
402     }
403     return dest;
404 }
405 
406 void
processUTF8(StringPiece src,UBool isLabel,UBool toASCII,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const407 UTS46::processUTF8(StringPiece src,
408                    UBool isLabel, UBool toASCII,
409                    ByteSink &dest,
410                    IDNAInfo &info, UErrorCode &errorCode) const {
411     if(U_FAILURE(errorCode)) {
412         return;
413     }
414     const char *srcArray=src.data();
415     int32_t srcLength=src.length();
416     if(srcArray==nullptr && srcLength!=0) {
417         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
418         return;
419     }
420     // Arguments are fine, reset output values.
421     info.reset();
422     if(srcLength==0) {
423         info.errors|=UIDNA_ERROR_EMPTY_LABEL;
424         dest.Flush();
425         return;
426     }
427     UnicodeString destString;
428     int32_t labelStart=0;
429     if(srcLength<=256) {  // length of stackArray[]
430         // ASCII fastpath
431         char stackArray[256];
432         int32_t destCapacity;
433         char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
434                                              stackArray, UPRV_LENGTHOF(stackArray), &destCapacity);
435         UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
436         int32_t i;
437         for(i=0;; ++i) {
438             if(i==srcLength) {
439                 if(toASCII) {
440                     if((i-labelStart)>63) {
441                         info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
442                     }
443                     // There is a trailing dot if labelStart==i.
444                     if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
445                         info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
446                     }
447                 }
448                 info.errors|=info.labelErrors;
449                 dest.Append(destArray, i);
450                 dest.Flush();
451                 return;
452             }
453             char c=srcArray[i];
454             if((int8_t)c<0) {  // (uint8_t)c>0x7f
455                 break;
456             }
457             int cData=asciiData[(int)c];  // Cast: gcc warns about indexing with a char.
458             if(cData>0) {
459                 destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
460             } else if(cData<0 && disallowNonLDHDot) {
461                 break;  // Replacing with U+FFFD can be complicated for toASCII.
462             } else {
463                 destArray[i]=c;
464                 if(c==0x2d) {  // hyphen
465                     if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
466                         // "??--..." is Punycode or forbidden.
467                         break;
468                     }
469                     if(i==labelStart) {
470                         // label starts with "-"
471                         info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
472                     }
473                     if((i+1)==srcLength || srcArray[i+1]==0x2e) {
474                         // label ends with "-"
475                         info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
476                     }
477                 } else if(c==0x2e) {  // dot
478                     if(isLabel) {
479                         break;  // Replacing with U+FFFD can be complicated for toASCII.
480                     }
481                     if(i==labelStart) {
482                         info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
483                     }
484                     if(toASCII && (i-labelStart)>63) {
485                         info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
486                     }
487                     info.errors|=info.labelErrors;
488                     info.labelErrors=0;
489                     labelStart=i+1;
490                 }
491             }
492         }
493         info.errors|=info.labelErrors;
494         // Convert the processed ASCII prefix of the current label to UTF-16.
495         int32_t mappingStart=i-labelStart;
496         destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
497         // Output the previous ASCII labels and process the rest of src in UTF-16.
498         dest.Append(destArray, labelStart);
499         processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
500                        isLabel, toASCII,
501                        destString, info, errorCode);
502     } else {
503         // src is too long for the ASCII fastpath implementation.
504         processUnicode(UnicodeString::fromUTF8(src), 0, 0,
505                        isLabel, toASCII,
506                        destString, info, errorCode);
507     }
508     destString.toUTF8(dest);  // calls dest.Flush()
509     if(toASCII && !isLabel) {
510         // length==labelStart==254 means that there is a trailing dot (ok) and
511         // destString is empty (do not index at 253-labelStart).
512         int32_t length=labelStart+destString.length();
513         if( length>=254 && isASCIIString(destString) &&
514             (length>254 ||
515              (labelStart<254 && destString[253-labelStart]!=0x2e))
516         ) {
517             info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
518         }
519     }
520     if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
521         (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))
522     ) {
523         info.errors|=UIDNA_ERROR_BIDI;
524     }
525 }
526 
527 UnicodeString &
processUnicode(const UnicodeString & src,int32_t labelStart,int32_t mappingStart,UBool isLabel,UBool toASCII,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const528 UTS46::processUnicode(const UnicodeString &src,
529                       int32_t labelStart, int32_t mappingStart,
530                       UBool isLabel, UBool toASCII,
531                       UnicodeString &dest,
532                       IDNAInfo &info, UErrorCode &errorCode) const {
533     if(mappingStart==0) {
534         uts46Norm2.normalize(src, dest, errorCode);
535     } else {
536         uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
537     }
538     if(U_FAILURE(errorCode)) {
539         return dest;
540     }
541     UBool doMapDevChars=
542         toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
543                   (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
544     const char16_t *destArray=dest.getBuffer();
545     int32_t destLength=dest.length();
546     int32_t labelLimit=labelStart;
547     while(labelLimit<destLength) {
548         char16_t c=destArray[labelLimit];
549         if(c==0x2e && !isLabel) {
550             int32_t labelLength=labelLimit-labelStart;
551             int32_t newLength=processLabel(dest, labelStart, labelLength,
552                                             toASCII, info, errorCode);
553             info.errors|=info.labelErrors;
554             info.labelErrors=0;
555             if(U_FAILURE(errorCode)) {
556                 return dest;
557             }
558             destArray=dest.getBuffer();
559             destLength+=newLength-labelLength;
560             labelLimit=labelStart+=newLength+1;
561             continue;
562         } else if(c<0xdf) {
563             // pass
564         } else if(c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
565             info.isTransDiff=true;
566             if(doMapDevChars) {
567                 destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
568                 if(U_FAILURE(errorCode)) {
569                     return dest;
570                 }
571                 destArray=dest.getBuffer();
572                 // All deviation characters have been mapped, no need to check for them again.
573                 doMapDevChars=false;
574                 // Do not increment labelLimit in case c was removed.
575                 continue;
576             }
577         } else if(U16_IS_SURROGATE(c)) {
578             if(U16_IS_SURROGATE_LEAD(c) ?
579                     (labelLimit+1)==destLength || !U16_IS_TRAIL(destArray[labelLimit+1]) :
580                     labelLimit==labelStart || !U16_IS_LEAD(destArray[labelLimit-1])) {
581                 // Map an unpaired surrogate to U+FFFD before normalization so that when
582                 // that removes characters we do not turn two unpaired ones into a pair.
583                 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
584                 dest.setCharAt(labelLimit, 0xfffd);
585                 destArray=dest.getBuffer();
586             }
587         }
588         ++labelLimit;
589     }
590     // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
591     // but not an empty label elsewhere nor a completely empty domain name.
592     // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
593     if(0==labelStart || labelStart<labelLimit) {
594         processLabel(dest, labelStart, labelLimit-labelStart,
595                       toASCII, info, errorCode);
596         info.errors|=info.labelErrors;
597     }
598     return dest;
599 }
600 
601 int32_t
mapDevChars(UnicodeString & dest,int32_t labelStart,int32_t mappingStart,UErrorCode & errorCode) const602 UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
603                    UErrorCode &errorCode) const {
604     if(U_FAILURE(errorCode)) {
605         return 0;
606     }
607     int32_t length=dest.length();
608     char16_t *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);
609     if(s==nullptr) {
610         errorCode=U_MEMORY_ALLOCATION_ERROR;
611         return length;
612     }
613     int32_t capacity=dest.getCapacity();
614     UBool didMapDevChars=false;
615     int32_t readIndex=mappingStart, writeIndex=mappingStart;
616     do {
617         char16_t c=s[readIndex++];
618         switch(c) {
619         case 0xdf:
620             // Map sharp s to ss.
621             didMapDevChars=true;
622             s[writeIndex++]=0x73;  // Replace sharp s with first s.
623             // Insert second s and account for possible buffer reallocation.
624             if(writeIndex==readIndex) {
625                 if(length==capacity) {
626                     dest.releaseBuffer(length);
627                     s=dest.getBuffer(length+1);
628                     if(s==nullptr) {
629                         errorCode=U_MEMORY_ALLOCATION_ERROR;
630                         return length;
631                     }
632                     capacity=dest.getCapacity();
633                 }
634                 u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);
635                 ++readIndex;
636             }
637             s[writeIndex++]=0x73;
638             ++length;
639             break;
640         case 0x3c2:  // Map final sigma to nonfinal sigma.
641             didMapDevChars=true;
642             s[writeIndex++]=0x3c3;
643             break;
644         case 0x200c:  // Ignore/remove ZWNJ.
645         case 0x200d:  // Ignore/remove ZWJ.
646             didMapDevChars=true;
647             --length;
648             break;
649         default:
650             // Only really necessary if writeIndex was different from readIndex.
651             s[writeIndex++]=c;
652             break;
653         }
654     } while(writeIndex<length);
655     dest.releaseBuffer(length);
656     if(didMapDevChars) {
657         // Mapping deviation characters might have resulted in an un-NFC string.
658         // We could use either the NFC or the UTS #46 normalizer.
659         // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
660         UnicodeString normalized;
661         uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
662         if(U_SUCCESS(errorCode)) {
663             dest.replace(labelStart, 0x7fffffff, normalized);
664             if(dest.isBogus()) {
665                 errorCode=U_MEMORY_ALLOCATION_ERROR;
666             }
667             return dest.length();
668         }
669     }
670     return length;
671 }
672 
673 // Replace the label in dest with the label string, if the label was modified.
674 // If &label==&dest then the label was modified in-place and labelLength
675 // is the new label length, different from label.length().
676 // If &label!=&dest then labelLength==label.length().
677 // Returns labelLength (= the new label length).
678 static int32_t
replaceLabel(UnicodeString & dest,int32_t destLabelStart,int32_t destLabelLength,const UnicodeString & label,int32_t labelLength,UErrorCode & errorCode)679 replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,
680              const UnicodeString &label, int32_t labelLength, UErrorCode &errorCode) {
681     if(U_FAILURE(errorCode)) {
682         return 0;
683     }
684     if(&label!=&dest) {
685         dest.replace(destLabelStart, destLabelLength, label);
686         if(dest.isBogus()) {
687             errorCode=U_MEMORY_ALLOCATION_ERROR;
688             return 0;
689         }
690     }
691     return labelLength;
692 }
693 
// Validates and converts one label of dest in place.
// The label occupies labelLength code units of dest starting at labelStart.
// - A label that starts with "xn--" is decoded with u_strFromPunycode() and then
//   checked with uts46Norm2.isNormalized(). If it is malformed, fails to decode or
//   is not normalized, the matching error bit is set in info.labelErrors and the
//   label is handed to markBadACELabel().
// - Hyphen positions, dots, non-LDH ASCII (only with UIDNA_USE_STD3_RULES), U+FFFD
//   and a leading combining mark are reported in info.labelErrors.
//   Dots, disallowed ASCII and a leading combining mark are overwritten with U+FFFD.
// - If no severe error was found, the BiDi/CONTEXTJ/CONTEXTO checks selected in
//   options are run, and for toASCII a label with non-ASCII code units is
//   re-encoded as "xn--" followed by the u_strToPunycode() output.
// Returns the result of replaceLabel() or markBadACELabel(), or a label length
// on the paths that return early; returns 0 if errorCode is already a failure on entry.
694 int32_t
processLabel(UnicodeString & dest,int32_t labelStart,int32_t labelLength,UBool toASCII,IDNAInfo & info,UErrorCode & errorCode) const695 UTS46::processLabel(UnicodeString &dest,
696                     int32_t labelStart, int32_t labelLength,
697                     UBool toASCII,
698                     IDNAInfo &info, UErrorCode &errorCode) const {
699     if(U_FAILURE(errorCode)) {
700         return 0;
701     }
702     UnicodeString fromPunycode;
703     UnicodeString *labelString;
704     const char16_t *label=dest.getBuffer()+labelStart;
    // Remember where the label lives in dest:
    // label/labelStart/labelLength are redirected to fromPunycode below if the label gets decoded.
705     int32_t destLabelStart=labelStart;
706     int32_t destLabelLength=labelLength;
707     UBool wasPunycode;
708     if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
709         // Label starts with "xn--", try to un-Punycode it.
710         // In IDNA2008, labels like "xn--" (decodes to an empty string) and
711         // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
712         // comparing the ToUnicode input with the back-to-ToASCII output.
713         // They are alternate encodings of the respective ASCII labels.
714         // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
715         // the round-trip verification.
716         if(labelLength==4 || (labelLength>5 && label[labelLength-1]==u'-')) {
717             info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
718             return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
719         }
720         wasPunycode=true;
721         char16_t *unicodeBuffer=fromPunycode.getBuffer(-1);  // capacity==-1: most labels should fit
722         if(unicodeBuffer==nullptr) {
723             // Should never occur if we used capacity==-1 which uses the internal buffer.
724             errorCode=U_MEMORY_ALLOCATION_ERROR;
725             return labelLength;
726         }
        // Decode with a private error code so that a Punycode failure is reported
        // via info.labelErrors rather than via errorCode.
727         UErrorCode punycodeErrorCode=U_ZERO_ERROR;
728         int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
729                                                 unicodeBuffer, fromPunycode.getCapacity(),
730                                                 nullptr, &punycodeErrorCode);
        // The first buffer was too small: retry once with a buffer
        // sized to the unicodeLength reported by the first call.
731         if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
732             fromPunycode.releaseBuffer(0);
733             unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
734             if(unicodeBuffer==nullptr) {
735                 errorCode=U_MEMORY_ALLOCATION_ERROR;
736                 return labelLength;
737             }
738             punycodeErrorCode=U_ZERO_ERROR;
739             unicodeLength=u_strFromPunycode(label+4, labelLength-4,
740                                             unicodeBuffer, fromPunycode.getCapacity(),
741                                             nullptr, &punycodeErrorCode);
742         }
743         fromPunycode.releaseBuffer(unicodeLength);
744         if(U_FAILURE(punycodeErrorCode)) {
745             info.labelErrors|=UIDNA_ERROR_PUNYCODE;
746             return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
747         }
748         // Check for NFC, and for characters that are not
749         // valid or deviation characters according to the normalizer.
750         // If there is something wrong, then the string will change.
751         // Note that the normalizer passes through non-LDH ASCII and deviation characters.
752         // Deviation characters are ok in Punycode even in transitional processing.
753         // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
754         // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
755         UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
756         if(U_FAILURE(errorCode)) {
757             return labelLength;
758         }
759         if(!isValid) {
760             info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
761             return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
762         }
        // From here on the checks look at the decoded label;
        // destLabelStart/destLabelLength still describe the original ACE label in dest.
763         labelString=&fromPunycode;
764         label=fromPunycode.getBuffer();
765         labelStart=0;
766         labelLength=fromPunycode.length();
767     } else {
768         wasPunycode=false;
769         labelString=&dest;
770     }
771     // Validity check
772     if(labelLength==0) {
773         info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
774         return replaceLabel(dest, destLabelStart, destLabelLength,
775                             *labelString, labelLength, errorCode);
776     }
777     // labelLength>0
778     if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
779         // label starts with "??--"
780         info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
781     }
782     if(label[0]==0x2d) {
783         // label starts with "-"
784         info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
785     }
786     if(label[labelLength-1]==0x2d) {
787         // label ends with "-"
788         info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
789     }
790     // If the label was not a Punycode label, then it was the result of
791     // mapping, normalization and label segmentation.
792     // If the label was in Punycode, then we mapped it again above
793     // and checked its validity.
794     // Now we handle the STD3 restriction to LDH characters (if set)
795     // and we look for U+FFFD which indicates disallowed characters
796     // in a non-Punycode label or U+FFFD itself in a Punycode label.
797     // We also check for dots which can come from the input to a single-label function.
798     // Ok to cast away const because we own the UnicodeString.
799     char16_t *s=(char16_t *)label;
800     const char16_t *limit=label+labelLength;
    // Bitwise OR of all non-ASCII code units in the label.
    // Used further below to skip checks that cannot apply and to detect an all-ASCII label.
801     char16_t oredChars=0;
802     // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
803     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    // A do-while loop is fine here because labelLength>0.
804     do {
805         char16_t c=*s;
806         if(c<=0x7f) {
807             if(c==0x2e) {
808                 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
809                 *s=0xfffd;
810             } else if(disallowNonLDHDot && asciiData[c]<0) {
811                 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
812                 *s=0xfffd;
813             }
814         } else {
815             oredChars|=c;
816             if(c==0xfffd) {
817                 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
818             }
819         }
820         ++s;
821     } while(s<limit);
822     // Check for a leading combining mark after other validity checks
823     // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
824     UChar32 c;
825     int32_t cpLength=0;
826     // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
827     U16_NEXT_UNSAFE(label, cpLength, c);
828     if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
829         info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK;
830         labelString->replace(labelStart, cpLength, (char16_t)0xfffd);
        // Fetch the buffer pointer again after modifying the string.
831         label=labelString->getBuffer()+labelStart;
        // cpLength code units were replaced with the single code unit U+FFFD.
832         labelLength+=1-cpLength;
833         if(labelString==&dest) {
834             destLabelLength=labelLength;
835         }
836     }
837     if((info.labelErrors&severeErrors)==0) {
838         // Do contextual checks only if we do not have U+FFFD from a severe error
839         // because U+FFFD can make these checks fail.
840         if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
841             checkLabelBiDi(label, labelLength, info);
842         }
        // U+200C and U+200D both contain all bits of 0x200c, so the CONTEXTJ scan
        // is only needed if all of these bits showed up in oredChars.
843         if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
844             !isLabelOkContextJ(label, labelLength)
845         ) {
846             info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
847         }
        // checkLabelContextO() does not look at anything below U+00B7.
848         if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
849             checkLabelContextO(label, labelLength, info);
850         }
851         if(toASCII) {
852             if(wasPunycode) {
853                 // Leave a Punycode label unchanged if it has no severe errors.
854                 if(destLabelLength>63) {
855                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
856                 }
857                 return destLabelLength;
858             } else if(oredChars>=0x80) {
859                 // Contains non-ASCII characters.
860                 UnicodeString punycode;
861                 char16_t *buffer=punycode.getBuffer(63);  // 63==maximum DNS label length
862                 if(buffer==nullptr) {
863                     errorCode=U_MEMORY_ALLOCATION_ERROR;
864                     return destLabelLength;
865                 }
866                 buffer[0]=0x78;  // Write "xn--".
867                 buffer[1]=0x6e;
868                 buffer[2]=0x2d;
869                 buffer[3]=0x2d;
870                 int32_t punycodeLength=u_strToPunycode(label, labelLength,
871                                                       buffer+4, punycode.getCapacity()-4,
872                                                       nullptr, &errorCode);
                // The encoded label did not fit: retry once with room for the 4-unit prefix
                // plus the punycodeLength reported by the first call.
873                 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
874                     errorCode=U_ZERO_ERROR;
875                     punycode.releaseBuffer(4);
876                     buffer=punycode.getBuffer(4+punycodeLength);
877                     if(buffer==nullptr) {
878                         errorCode=U_MEMORY_ALLOCATION_ERROR;
879                         return destLabelLength;
880                     }
881                     punycodeLength=u_strToPunycode(label, labelLength,
882                                                   buffer+4, punycode.getCapacity()-4,
883                                                   nullptr, &errorCode);
884                 }
                // Account for the "xn--" prefix.
885                 punycodeLength+=4;
886                 punycode.releaseBuffer(punycodeLength);
887                 if(U_FAILURE(errorCode)) {
888                     return destLabelLength;
889                 }
890                 if(punycodeLength>63) {
891                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
892                 }
893                 return replaceLabel(dest, destLabelStart, destLabelLength,
894                                     punycode, punycodeLength, errorCode);
895             } else {
896                 // all-ASCII label
897                 if(labelLength>63) {
898                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
899                 }
900             }
901         }
902     } else {
903         // If a Punycode label has severe errors,
904         // then leave it but make sure it does not look valid.
905         if(wasPunycode) {
906             info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
907             return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info, errorCode);
908         }
909     }
    // Remaining cases: hand the current label (in dest itself or decoded in fromPunycode)
    // to replaceLabel() for the original label's range in dest.
910     return replaceLabel(dest, destLabelStart, destLabelLength,
911                         *labelString, labelLength, errorCode);
912 }
913 
914 // Make sure an ACE label does not look valid.
915 // Append U+FFFD if the label has only LDH characters.
916 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD.
917 int32_t
markBadACELabel(UnicodeString & dest,int32_t labelStart,int32_t labelLength,UBool toASCII,IDNAInfo & info,UErrorCode & errorCode) const918 UTS46::markBadACELabel(UnicodeString &dest,
919                        int32_t labelStart, int32_t labelLength,
920                        UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const {
921     if(U_FAILURE(errorCode)) {
922         return 0;
923     }
924     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
925     UBool isASCII=true;
926     UBool onlyLDH=true;
927     const char16_t *label=dest.getBuffer()+labelStart;
928     const char16_t *limit=label+labelLength;
929     // Start after the initial "xn--".
930     // Ok to cast away const because we own the UnicodeString.
931     for(char16_t *s=const_cast<char16_t *>(label+4); s<limit; ++s) {
932         char16_t c=*s;
933         if(c<=0x7f) {
934             if(c==0x2e) {
935                 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
936                 *s=0xfffd;
937                 isASCII=onlyLDH=false;
938             } else if(asciiData[c]<0) {
939                 onlyLDH=false;
940                 if(disallowNonLDHDot) {
941                     *s=0xfffd;
942                     isASCII=false;
943                 }
944             }
945         } else {
946             isASCII=onlyLDH=false;
947         }
948     }
949     if(onlyLDH) {
950         dest.insert(labelStart+labelLength, (char16_t)0xfffd);
951         if(dest.isBogus()) {
952             errorCode=U_MEMORY_ALLOCATION_ERROR;
953             return 0;
954         }
955         ++labelLength;
956     } else {
957         if(toASCII && isASCII && labelLength>63) {
958             info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
959         }
960     }
961     return labelLength;
962 }
963 
964 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);
965 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC);
966 const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK;
967 
968 const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER);
969 
970 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER);
971 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
972 const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER);
973 
974 const uint32_t ES_CS_ET_ON_BN_NSM_MASK=
975     U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)|
976     U_MASK(U_COMMON_NUMBER_SEPARATOR)|
977     U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)|
978     U_MASK(U_OTHER_NEUTRAL)|
979     U_MASK(U_BOUNDARY_NEUTRAL)|
980     U_MASK(U_DIR_NON_SPACING_MARK);
981 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
982 const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
983 
984 // We scan the whole label and check both for whether it contains RTL characters
985 // and whether it passes the BiDi Rule.
986 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
987 // that a domain name is a BiDi domain name (has an RTL label) only after
988 // processing several earlier labels.
989 void
checkLabelBiDi(const char16_t * label,int32_t labelLength,IDNAInfo & info) const990 UTS46::checkLabelBiDi(const char16_t *label, int32_t labelLength, IDNAInfo &info) const {
991     // IDNA2008 BiDi rule
992     // Get the directionality of the first character.
993     UChar32 c;
994     int32_t i=0;
995     U16_NEXT_UNSAFE(label, i, c);
996     uint32_t firstMask=U_MASK(u_charDirection(c));
997     // 1. The first character must be a character with BIDI property L, R
998     // or AL.  If it has the R or AL property, it is an RTL label; if it
999     // has the L property, it is an LTR label.
1000     if((firstMask&~L_R_AL_MASK)!=0) {
1001         info.isOkBiDi=false;
1002     }
1003     // Get the directionality of the last non-NSM character.
1004     uint32_t lastMask;
1005     for(;;) {
1006         if(i>=labelLength) {
1007             lastMask=firstMask;
1008             break;
1009         }
1010         U16_PREV_UNSAFE(label, labelLength, c);
1011         UCharDirection dir=u_charDirection(c);
1012         if(dir!=U_DIR_NON_SPACING_MARK) {
1013             lastMask=U_MASK(dir);
1014             break;
1015         }
1016     }
1017     // 3. In an RTL label, the end of the label must be a character with
1018     // BIDI property R, AL, EN or AN, followed by zero or more
1019     // characters with BIDI property NSM.
1020     // 6. In an LTR label, the end of the label must be a character with
1021     // BIDI property L or EN, followed by zero or more characters with
1022     // BIDI property NSM.
1023     if( (firstMask&L_MASK)!=0 ?
1024             (lastMask&~L_EN_MASK)!=0 :
1025             (lastMask&~R_AL_EN_AN_MASK)!=0
1026     ) {
1027         info.isOkBiDi=false;
1028     }
1029     // Add the directionalities of the intervening characters.
1030     uint32_t mask=firstMask|lastMask;
1031     while(i<labelLength) {
1032         U16_NEXT_UNSAFE(label, i, c);
1033         mask|=U_MASK(u_charDirection(c));
1034     }
1035     if(firstMask&L_MASK) {
1036         // 5. In an LTR label, only characters with the BIDI properties L, EN,
1037         // ES, CS, ET, ON, BN and NSM are allowed.
1038         if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
1039             info.isOkBiDi=false;
1040         }
1041     } else {
1042         // 2. In an RTL label, only characters with the BIDI properties R, AL,
1043         // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
1044         if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
1045             info.isOkBiDi=false;
1046         }
1047         // 4. In an RTL label, if an EN is present, no AN may be present, and
1048         // vice versa.
1049         if((mask&EN_AN_MASK)==EN_AN_MASK) {
1050             info.isOkBiDi=false;
1051         }
1052     }
1053     // An RTL label is a label that contains at least one character of type
1054     // R, AL or AN. [...]
1055     // A "BIDI domain name" is a domain name that contains at least one RTL
1056     // label. [...]
1057     // The following rule, consisting of six conditions, applies to labels
1058     // in BIDI domain names.
1059     if((mask&R_AL_AN_MASK)!=0) {
1060         info.isBiDi=true;
1061     }
1062 }
1063 
1064 // Special code for the ASCII prefix of a BiDi domain name.
1065 // The ASCII prefix is all-LTR.
1066 
1067 // IDNA2008 BiDi rule, parts relevant to ASCII labels:
1068 // 1. The first character must be a character with BIDI property L [...]
1069 // 5. In an LTR label, only characters with the BIDI properties L, EN,
1070 // ES, CS, ET, ON, BN and NSM are allowed.
1071 // 6. In an LTR label, the end of the label must be a character with
1072 // BIDI property L or EN [...]
1073 
1074 // UTF-16 version, called for mapped ASCII prefix.
1075 // Cannot contain uppercase A-Z.
1076 // s[length-1] must be the trailing dot.
1077 static UBool
isASCIIOkBiDi(const char16_t * s,int32_t length)1078 isASCIIOkBiDi(const char16_t *s, int32_t length) {
1079     int32_t labelStart=0;
1080     for(int32_t i=0; i<length; ++i) {
1081         char16_t c=s[i];
1082         if(c==0x2e) {  // dot
1083             if(i>labelStart) {
1084                 c=s[i-1];
1085                 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) {
1086                     // Last character in the label is not an L or EN.
1087                     return false;
1088                 }
1089             }
1090             labelStart=i+1;
1091         } else if(i==labelStart) {
1092             if(!(0x61<=c && c<=0x7a)) {
1093                 // First character in the label is not an L.
1094                 return false;
1095             }
1096         } else {
1097             if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
1098                 // Intermediate character in the label is a B, S or WS.
1099                 return false;
1100             }
1101         }
1102     }
1103     return true;
1104 }
1105 
1106 // UTF-8 version, called for source ASCII prefix.
1107 // Can contain uppercase A-Z.
1108 // s[length-1] must be the trailing dot.
1109 static UBool
isASCIIOkBiDi(const char * s,int32_t length)1110 isASCIIOkBiDi(const char *s, int32_t length) {
1111     int32_t labelStart=0;
1112     for(int32_t i=0; i<length; ++i) {
1113         char c=s[i];
1114         if(c==0x2e) {  // dot
1115             if(i>labelStart) {
1116                 c=s[i-1];
1117                 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) {
1118                     // Last character in the label is not an L or EN.
1119                     return false;
1120                 }
1121             }
1122             labelStart=i+1;
1123         } else if(i==labelStart) {
1124             if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) {
1125                 // First character in the label is not an L.
1126                 return false;
1127             }
1128         } else {
1129             if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
1130                 // Intermediate character in the label is a B, S or WS.
1131                 return false;
1132             }
1133         }
1134     }
1135     return true;
1136 }
1137 
1138 UBool
isLabelOkContextJ(const char16_t * label,int32_t labelLength) const1139 UTS46::isLabelOkContextJ(const char16_t *label, int32_t labelLength) const {
1140     // [IDNA2008-Tables]
1141     // 200C..200D  ; CONTEXTJ    # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
1142     for(int32_t i=0; i<labelLength; ++i) {
1143         if(label[i]==0x200c) {
1144             // Appendix A.1. ZERO WIDTH NON-JOINER
1145             // Rule Set:
1146             //  False;
1147             //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
1148             //  If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
1149             //     (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
1150             if(i==0) {
1151                 return false;
1152             }
1153             UChar32 c;
1154             int32_t j=i;
1155             U16_PREV_UNSAFE(label, j, c);
1156             if(uts46Norm2.getCombiningClass(c)==9) {
1157                 continue;
1158             }
1159             // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
1160             for(;;) {
1161                 UJoiningType type=ubidi_getJoiningType(c);
1162                 if(type==U_JT_TRANSPARENT) {
1163                     if(j==0) {
1164                         return false;
1165                     }
1166                     U16_PREV_UNSAFE(label, j, c);
1167                 } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) {
1168                     break;  // precontext fulfilled
1169                 } else {
1170                     return false;
1171                 }
1172             }
1173             // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
1174             for(j=i+1;;) {
1175                 if(j==labelLength) {
1176                     return false;
1177                 }
1178                 U16_NEXT_UNSAFE(label, j, c);
1179                 UJoiningType type=ubidi_getJoiningType(c);
1180                 if(type==U_JT_TRANSPARENT) {
1181                     // just skip this character
1182                 } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) {
1183                     break;  // postcontext fulfilled
1184                 } else {
1185                     return false;
1186                 }
1187             }
1188         } else if(label[i]==0x200d) {
1189             // Appendix A.2. ZERO WIDTH JOINER (U+200D)
1190             // Rule Set:
1191             //  False;
1192             //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
1193             if(i==0) {
1194                 return false;
1195             }
1196             UChar32 c;
1197             int32_t j=i;
1198             U16_PREV_UNSAFE(label, j, c);
1199             if(uts46Norm2.getCombiningClass(c)!=9) {
1200                 return false;
1201             }
1202         }
1203     }
1204     return true;
1205 }
1206 
1207 void
checkLabelContextO(const char16_t * label,int32_t labelLength,IDNAInfo & info) const1208 UTS46::checkLabelContextO(const char16_t *label, int32_t labelLength, IDNAInfo &info) const {
1209     int32_t labelEnd=labelLength-1;  // inclusive
1210     int32_t arabicDigits=0;  // -1 for 066x, +1 for 06Fx
1211     for(int32_t i=0; i<=labelEnd; ++i) {
1212         UChar32 c=label[i];
1213         if(c<0xb7) {
1214             // ASCII fastpath
1215         } else if(c<=0x6f9) {
1216             if(c==0xb7) {
1217                 // Appendix A.3. MIDDLE DOT (U+00B7)
1218                 // Rule Set:
1219                 //  False;
1220                 //  If Before(cp) .eq.  U+006C And
1221                 //     After(cp) .eq.  U+006C Then True;
1222                 if(!(0<i && label[i-1]==0x6c &&
1223                      i<labelEnd && label[i+1]==0x6c)) {
1224                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1225                 }
1226             } else if(c==0x375) {
1227                 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
1228                 // Rule Set:
1229                 //  False;
1230                 //  If Script(After(cp)) .eq.  Greek Then True;
1231                 UScriptCode script=USCRIPT_INVALID_CODE;
1232                 if(i<labelEnd) {
1233                     UErrorCode errorCode=U_ZERO_ERROR;
1234                     int32_t j=i+1;
1235                     U16_NEXT(label, j, labelLength, c);
1236                     script=uscript_getScript(c, &errorCode);
1237                 }
1238                 if(script!=USCRIPT_GREEK) {
1239                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1240                 }
1241             } else if(c==0x5f3 || c==0x5f4) {
1242                 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
1243                 // Rule Set:
1244                 //  False;
1245                 //  If Script(Before(cp)) .eq.  Hebrew Then True;
1246                 //
1247                 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
1248                 // Rule Set:
1249                 //  False;
1250                 //  If Script(Before(cp)) .eq.  Hebrew Then True;
1251                 UScriptCode script=USCRIPT_INVALID_CODE;
1252                 if(0<i) {
1253                     UErrorCode errorCode=U_ZERO_ERROR;
1254                     int32_t j=i;
1255                     U16_PREV(label, 0, j, c);
1256                     script=uscript_getScript(c, &errorCode);
1257                 }
1258                 if(script!=USCRIPT_HEBREW) {
1259                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1260                 }
1261             } else if(0x660<=c /* && c<=0x6f9 */) {
1262                 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
1263                 // Rule Set:
1264                 //  True;
1265                 //  For All Characters:
1266                 //    If cp .in. 06F0..06F9 Then False;
1267                 //  End For;
1268                 //
1269                 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
1270                 // Rule Set:
1271                 //  True;
1272                 //  For All Characters:
1273                 //    If cp .in. 0660..0669 Then False;
1274                 //  End For;
1275                 if(c<=0x669) {
1276                     if(arabicDigits>0) {
1277                         info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
1278                     }
1279                     arabicDigits=-1;
1280                 } else if(0x6f0<=c) {
1281                     if(arabicDigits<0) {
1282                         info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
1283                     }
1284                     arabicDigits=1;
1285                 }
1286             }
1287         } else if(c==0x30fb) {
1288             // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
1289             // Rule Set:
1290             //  False;
1291             //  For All Characters:
1292             //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
1293             //  End For;
1294             UErrorCode errorCode=U_ZERO_ERROR;
1295             for(int j=0;;) {
1296                 if(j>labelEnd) {
1297                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1298                     break;
1299                 }
1300                 U16_NEXT(label, j, labelLength, c);
1301                 UScriptCode script=uscript_getScript(c, &errorCode);
1302                 if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) {
1303                     break;
1304                 }
1305             }
1306         }
1307     }
1308 }
1309 
1310 U_NAMESPACE_END
1311 
1312 // C API ------------------------------------------------------------------- ***
1313 
1314 U_NAMESPACE_USE
1315 
1316 U_CAPI UIDNA * U_EXPORT2
uidna_openUTS46(uint32_t options,UErrorCode * pErrorCode)1317 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) {
1318     return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode));
1319 }
1320 
1321 U_CAPI void U_EXPORT2
uidna_close(UIDNA * idna)1322 uidna_close(UIDNA *idna) {
1323     delete reinterpret_cast<IDNA *>(idna);
1324 }
1325 
1326 static UBool
checkArgs(const void * label,int32_t length,void * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1327 checkArgs(const void *label, int32_t length,
1328           void *dest, int32_t capacity,
1329           UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1330     if(U_FAILURE(*pErrorCode)) {
1331         return false;
1332     }
1333     // sizeof(UIDNAInfo)=16 in the first API version.
1334     if(pInfo==nullptr || pInfo->size<16) {
1335         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1336         return false;
1337     }
1338     if( (label==nullptr ? length!=0 : length<-1) ||
1339         (dest==nullptr ? capacity!=0 : capacity<0) ||
1340         (dest==label && label!=nullptr)
1341     ) {
1342         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1343         return false;
1344     }
1345     // Set all *pInfo bytes to 0 except for the size field itself.
1346     uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size));
1347     return true;
1348 }
1349 
1350 static void
idnaInfoToStruct(IDNAInfo & info,UIDNAInfo * pInfo)1351 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) {
1352     pInfo->isTransitionalDifferent=info.isTransitionalDifferent();
1353     pInfo->errors=info.getErrors();
1354 }
1355 
1356 U_CAPI int32_t U_EXPORT2
uidna_labelToASCII(const UIDNA * idna,const char16_t * label,int32_t length,char16_t * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1357 uidna_labelToASCII(const UIDNA *idna,
1358                    const char16_t *label, int32_t length,
1359                    char16_t *dest, int32_t capacity,
1360                    UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1361     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1362         return 0;
1363     }
1364     UnicodeString src((UBool)(length<0), label, length);
1365     UnicodeString destString(dest, 0, capacity);
1366     IDNAInfo info;
1367     reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode);
1368     idnaInfoToStruct(info, pInfo);
1369     return destString.extract(dest, capacity, *pErrorCode);
1370 }
1371 
1372 U_CAPI int32_t U_EXPORT2
uidna_labelToUnicode(const UIDNA * idna,const char16_t * label,int32_t length,char16_t * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1373 uidna_labelToUnicode(const UIDNA *idna,
1374                      const char16_t *label, int32_t length,
1375                      char16_t *dest, int32_t capacity,
1376                      UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1377     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1378         return 0;
1379     }
1380     UnicodeString src((UBool)(length<0), label, length);
1381     UnicodeString destString(dest, 0, capacity);
1382     IDNAInfo info;
1383     reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode);
1384     idnaInfoToStruct(info, pInfo);
1385     return destString.extract(dest, capacity, *pErrorCode);
1386 }
1387 
1388 U_CAPI int32_t U_EXPORT2
uidna_nameToASCII(const UIDNA * idna,const char16_t * name,int32_t length,char16_t * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1389 uidna_nameToASCII(const UIDNA *idna,
1390                   const char16_t *name, int32_t length,
1391                   char16_t *dest, int32_t capacity,
1392                   UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1393     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1394         return 0;
1395     }
1396     UnicodeString src((UBool)(length<0), name, length);
1397     UnicodeString destString(dest, 0, capacity);
1398     IDNAInfo info;
1399     reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pErrorCode);
1400     idnaInfoToStruct(info, pInfo);
1401     return destString.extract(dest, capacity, *pErrorCode);
1402 }
1403 
1404 U_CAPI int32_t U_EXPORT2
uidna_nameToUnicode(const UIDNA * idna,const char16_t * name,int32_t length,char16_t * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1405 uidna_nameToUnicode(const UIDNA *idna,
1406                     const char16_t *name, int32_t length,
1407                     char16_t *dest, int32_t capacity,
1408                     UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1409     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1410         return 0;
1411     }
1412     UnicodeString src((UBool)(length<0), name, length);
1413     UnicodeString destString(dest, 0, capacity);
1414     IDNAInfo info;
1415     reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode);
1416     idnaInfoToStruct(info, pInfo);
1417     return destString.extract(dest, capacity, *pErrorCode);
1418 }
1419 
1420 U_CAPI int32_t U_EXPORT2
uidna_labelToASCII_UTF8(const UIDNA * idna,const char * label,int32_t length,char * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1421 uidna_labelToASCII_UTF8(const UIDNA *idna,
1422                         const char *label, int32_t length,
1423                         char *dest, int32_t capacity,
1424                         UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1425     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1426         return 0;
1427     }
1428     StringPiece src(label, length<0 ? static_cast<int32_t>(uprv_strlen(label)) : length);
1429     return ByteSinkUtil::viaByteSinkToTerminatedChars(
1430         dest, capacity,
1431         [&](ByteSink& sink, UErrorCode& status) {
1432             IDNAInfo info;
1433             reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, status);
1434             idnaInfoToStruct(info, pInfo);
1435         },
1436         *pErrorCode);
1437 }
1438 
1439 U_CAPI int32_t U_EXPORT2
uidna_labelToUnicodeUTF8(const UIDNA * idna,const char * label,int32_t length,char * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1440 uidna_labelToUnicodeUTF8(const UIDNA *idna,
1441                          const char *label, int32_t length,
1442                          char *dest, int32_t capacity,
1443                          UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1444     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1445         return 0;
1446     }
1447     StringPiece src(label, length<0 ? static_cast<int32_t>(uprv_strlen(label)) : length);
1448     return ByteSinkUtil::viaByteSinkToTerminatedChars(
1449         dest, capacity,
1450         [&](ByteSink& sink, UErrorCode& status) {
1451             IDNAInfo info;
1452             reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, status);
1453             idnaInfoToStruct(info, pInfo);
1454         },
1455         *pErrorCode);
1456 }
1457 
1458 U_CAPI int32_t U_EXPORT2
uidna_nameToASCII_UTF8(const UIDNA * idna,const char * name,int32_t length,char * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1459 uidna_nameToASCII_UTF8(const UIDNA *idna,
1460                        const char *name, int32_t length,
1461                        char *dest, int32_t capacity,
1462                        UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1463     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1464         return 0;
1465     }
1466     StringPiece src(name, length<0 ? static_cast<int32_t>(uprv_strlen(name)) : length);
1467     return ByteSinkUtil::viaByteSinkToTerminatedChars(
1468         dest, capacity,
1469         [&](ByteSink& sink, UErrorCode& status) {
1470             IDNAInfo info;
1471             reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, status);
1472             idnaInfoToStruct(info, pInfo);
1473         },
1474         *pErrorCode);
1475 }
1476 
1477 U_CAPI int32_t U_EXPORT2
uidna_nameToUnicodeUTF8(const UIDNA * idna,const char * name,int32_t length,char * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1478 uidna_nameToUnicodeUTF8(const UIDNA *idna,
1479                         const char *name, int32_t length,
1480                         char *dest, int32_t capacity,
1481                         UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1482     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1483         return 0;
1484     }
1485     StringPiece src(name, length<0 ? static_cast<int32_t>(uprv_strlen(name)) : length);
1486     return ByteSinkUtil::viaByteSinkToTerminatedChars(
1487         dest, capacity,
1488         [&](ByteSink& sink, UErrorCode& status) {
1489             IDNAInfo info;
1490             reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, status);
1491             idnaInfoToStruct(info, pInfo);
1492         },
1493         *pErrorCode);
1494 }
1495 
1496 #endif  // UCONFIG_NO_IDNA
1497