/*
*******************************************************************************
*   Copyright (C) 2010-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  uts46.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2010mar09
*   created by: Markus W. Scherer
*/

15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_IDNA
18 
19 #include "unicode/idna.h"
20 #include "unicode/normalizer2.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ustring.h"
23 #include "unicode/utf16.h"
24 #include "cmemory.h"
25 #include "cstring.h"
26 #include "punycode.h"
27 #include "ubidi_props.h"
28 #include "ustr_imp.h"
29 
// Number of elements of a real array (not a pointer), as an int32_t.
// The whole expansion is parenthesized so that it stays a single operand
// wherever the macro is used.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

// Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:
//
// The domain name length limit is 255 octets in an internal DNS representation
// where the last ("root") label is the empty label
// represented by length byte 0 alone.
// In a conventional string, this translates to 253 characters, or 254
// if there is a trailing dot for the root label.

40 U_NAMESPACE_BEGIN
41 
42 // Severe errors which usually result in a U+FFFD replacement character in the result string.
43 const uint32_t severeErrors=
44     UIDNA_ERROR_LEADING_COMBINING_MARK|
45     UIDNA_ERROR_DISALLOWED|
46     UIDNA_ERROR_PUNYCODE|
47     UIDNA_ERROR_LABEL_HAS_DOT|
48     UIDNA_ERROR_INVALID_ACE_LABEL;
49 
50 static inline UBool
isASCIIString(const UnicodeString & dest)51 isASCIIString(const UnicodeString &dest) {
52     const UChar *s=dest.getBuffer();
53     const UChar *limit=s+dest.length();
54     while(s<limit) {
55         if(*s++>0x7f) {
56             return FALSE;
57         }
58     }
59     return TRUE;
60 }
61 
62 static UBool
63 isASCIIOkBiDi(const UChar *s, int32_t length);
64 
65 static UBool
66 isASCIIOkBiDi(const char *s, int32_t length);
67 
68 // IDNA class default implementations -------------------------------------- ***
69 
~IDNA()70 IDNA::~IDNA() {}
71 
72 void
labelToASCII_UTF8(const StringPiece & label,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const73 IDNA::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
74                         IDNAInfo &info, UErrorCode &errorCode) const {
75     if(U_SUCCESS(errorCode)) {
76         UnicodeString destString;
77         labelToASCII(UnicodeString::fromUTF8(label), destString,
78                      info, errorCode).toUTF8(dest);
79     }
80 }
81 
82 void
labelToUnicodeUTF8(const StringPiece & label,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const83 IDNA::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
84                          IDNAInfo &info, UErrorCode &errorCode) const {
85     if(U_SUCCESS(errorCode)) {
86         UnicodeString destString;
87         labelToUnicode(UnicodeString::fromUTF8(label), destString,
88                        info, errorCode).toUTF8(dest);
89     }
90 }
91 
92 void
nameToASCII_UTF8(const StringPiece & name,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const93 IDNA::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
94                        IDNAInfo &info, UErrorCode &errorCode) const {
95     if(U_SUCCESS(errorCode)) {
96         UnicodeString destString;
97         nameToASCII(UnicodeString::fromUTF8(name), destString,
98                     info, errorCode).toUTF8(dest);
99     }
100 }
101 
102 void
nameToUnicodeUTF8(const StringPiece & name,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const103 IDNA::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
104                         IDNAInfo &info, UErrorCode &errorCode) const {
105     if(U_SUCCESS(errorCode)) {
106         UnicodeString destString;
107         nameToUnicode(UnicodeString::fromUTF8(name), destString,
108                       info, errorCode).toUTF8(dest);
109     }
110 }
111 
112 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(IDNA)
113 
114 // UTS46 class declaration ------------------------------------------------- ***
115 
116 class UTS46 : public IDNA {
117 public:
118     UTS46(uint32_t options, UErrorCode &errorCode);
119     virtual ~UTS46();
120 
121     virtual UnicodeString &
122     labelToASCII(const UnicodeString &label, UnicodeString &dest,
123                  IDNAInfo &info, UErrorCode &errorCode) const;
124 
125     virtual UnicodeString &
126     labelToUnicode(const UnicodeString &label, UnicodeString &dest,
127                    IDNAInfo &info, UErrorCode &errorCode) const;
128 
129     virtual UnicodeString &
130     nameToASCII(const UnicodeString &name, UnicodeString &dest,
131                 IDNAInfo &info, UErrorCode &errorCode) const;
132 
133     virtual UnicodeString &
134     nameToUnicode(const UnicodeString &name, UnicodeString &dest,
135                   IDNAInfo &info, UErrorCode &errorCode) const;
136 
137     virtual void
138     labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
139                       IDNAInfo &info, UErrorCode &errorCode) const;
140 
141     virtual void
142     labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
143                        IDNAInfo &info, UErrorCode &errorCode) const;
144 
145     virtual void
146     nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
147                      IDNAInfo &info, UErrorCode &errorCode) const;
148 
149     virtual void
150     nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
151                       IDNAInfo &info, UErrorCode &errorCode) const;
152 
153 private:
154     UnicodeString &
155     process(const UnicodeString &src,
156             UBool isLabel, UBool toASCII,
157             UnicodeString &dest,
158             IDNAInfo &info, UErrorCode &errorCode) const;
159 
160     void
161     processUTF8(const StringPiece &src,
162                 UBool isLabel, UBool toASCII,
163                 ByteSink &dest,
164                 IDNAInfo &info, UErrorCode &errorCode) const;
165 
166     UnicodeString &
167     processUnicode(const UnicodeString &src,
168                    int32_t labelStart, int32_t mappingStart,
169                    UBool isLabel, UBool toASCII,
170                    UnicodeString &dest,
171                    IDNAInfo &info, UErrorCode &errorCode) const;
172 
173     // returns the new dest.length()
174     int32_t
175     mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
176                 UErrorCode &errorCode) const;
177 
178     // returns the new label length
179     int32_t
180     processLabel(UnicodeString &dest,
181                  int32_t labelStart, int32_t labelLength,
182                  UBool toASCII,
183                  IDNAInfo &info, UErrorCode &errorCode) const;
184     int32_t
185     markBadACELabel(UnicodeString &dest,
186                     int32_t labelStart, int32_t labelLength,
187                     UBool toASCII, IDNAInfo &info) const;
188 
189     void
190     checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const;
191 
192     UBool
193     isLabelOkContextJ(const UChar *label, int32_t labelLength) const;
194 
195     void
196     checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const;
197 
198     const Normalizer2 &uts46Norm2;  // uts46.nrm
199     uint32_t options;
200 };
201 
202 IDNA *
createUTS46Instance(uint32_t options,UErrorCode & errorCode)203 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) {
204     if(U_SUCCESS(errorCode)) {
205         IDNA *idna=new UTS46(options, errorCode);
206         if(idna==NULL) {
207             errorCode=U_MEMORY_ALLOCATION_ERROR;
208         } else if(U_FAILURE(errorCode)) {
209             delete idna;
210             idna=NULL;
211         }
212         return idna;
213     } else {
214         return NULL;
215     }
216 }
217 
218 // UTS46 implementation ---------------------------------------------------- ***
219 
UTS46(uint32_t opt,UErrorCode & errorCode)220 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode)
221         : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)),
222           options(opt) {}
223 
~UTS46()224 UTS46::~UTS46() {}
225 
226 UnicodeString &
labelToASCII(const UnicodeString & label,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const227 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest,
228                     IDNAInfo &info, UErrorCode &errorCode) const {
229     return process(label, TRUE, TRUE, dest, info, errorCode);
230 }
231 
232 UnicodeString &
labelToUnicode(const UnicodeString & label,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const233 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest,
234                       IDNAInfo &info, UErrorCode &errorCode) const {
235     return process(label, TRUE, FALSE, dest, info, errorCode);
236 }
237 
238 UnicodeString &
nameToASCII(const UnicodeString & name,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const239 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest,
240                    IDNAInfo &info, UErrorCode &errorCode) const {
241     process(name, FALSE, TRUE, dest, info, errorCode);
242     if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 &&
243         isASCIIString(dest) &&
244         (dest.length()>254 || dest[253]!=0x2e)
245     ) {
246         info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
247     }
248     return dest;
249 }
250 
251 UnicodeString &
nameToUnicode(const UnicodeString & name,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const252 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest,
253                      IDNAInfo &info, UErrorCode &errorCode) const {
254     return process(name, FALSE, FALSE, dest, info, errorCode);
255 }
256 
257 void
labelToASCII_UTF8(const StringPiece & label,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const258 UTS46::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
259                          IDNAInfo &info, UErrorCode &errorCode) const {
260     processUTF8(label, TRUE, TRUE, dest, info, errorCode);
261 }
262 
263 void
labelToUnicodeUTF8(const StringPiece & label,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const264 UTS46::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
265                           IDNAInfo &info, UErrorCode &errorCode) const {
266     processUTF8(label, TRUE, FALSE, dest, info, errorCode);
267 }
268 
269 void
nameToASCII_UTF8(const StringPiece & name,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const270 UTS46::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
271                         IDNAInfo &info, UErrorCode &errorCode) const {
272     processUTF8(name, FALSE, TRUE, dest, info, errorCode);
273 }
274 
275 void
nameToUnicodeUTF8(const StringPiece & name,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const276 UTS46::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
277                          IDNAInfo &info, UErrorCode &errorCode) const {
278     processUTF8(name, FALSE, FALSE, dest, info, errorCode);
279 }
280 
// UTS #46 data for ASCII characters.
// The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
// and passes through all other ASCII characters.
// If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed
// using this data.
// The ASCII fastpath also uses this data.
// Indexed by the ASCII code point 0..0x7f, 16 entries per row.
// Values: -1=disallowed  0==valid  1==mapped (lowercase)
static const int8_t asciiData[128]={
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};

303 UnicodeString &
process(const UnicodeString & src,UBool isLabel,UBool toASCII,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const304 UTS46::process(const UnicodeString &src,
305                UBool isLabel, UBool toASCII,
306                UnicodeString &dest,
307                IDNAInfo &info, UErrorCode &errorCode) const {
308     // uts46Norm2.normalize() would do all of this error checking and setup,
309     // but with the ASCII fastpath we do not always call it, and do not
310     // call it first.
311     if(U_FAILURE(errorCode)) {
312         dest.setToBogus();
313         return dest;
314     }
315     const UChar *srcArray=src.getBuffer();
316     if(&dest==&src || srcArray==NULL) {
317         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
318         dest.setToBogus();
319         return dest;
320     }
321     // Arguments are fine, reset output values.
322     dest.remove();
323     info.reset();
324     int32_t srcLength=src.length();
325     if(srcLength==0) {
326         if(toASCII) {
327             info.errors|=UIDNA_ERROR_EMPTY_LABEL;
328         }
329         return dest;
330     }
331     UChar *destArray=dest.getBuffer(srcLength);
332     if(destArray==NULL) {
333         errorCode=U_MEMORY_ALLOCATION_ERROR;
334         return dest;
335     }
336     // ASCII fastpath
337     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
338     int32_t labelStart=0;
339     int32_t i;
340     for(i=0;; ++i) {
341         if(i==srcLength) {
342             if(toASCII) {
343                 if((i-labelStart)>63) {
344                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
345                 }
346                 // There is a trailing dot if labelStart==i.
347                 if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
348                     info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
349                 }
350             }
351             info.errors|=info.labelErrors;
352             dest.releaseBuffer(i);
353             return dest;
354         }
355         UChar c=srcArray[i];
356         if(c>0x7f) {
357             break;
358         }
359         int cData=asciiData[c];
360         if(cData>0) {
361             destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
362         } else if(cData<0 && disallowNonLDHDot) {
363             break;  // Replacing with U+FFFD can be complicated for toASCII.
364         } else {
365             destArray[i]=c;
366             if(c==0x2d) {  // hyphen
367                 if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
368                     // "??--..." is Punycode or forbidden.
369                     ++i;  // '-' was copied to dest already
370                     break;
371                 }
372                 if(i==labelStart) {
373                     // label starts with "-"
374                     info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
375                 }
376                 if((i+1)==srcLength || srcArray[i+1]==0x2e) {
377                     // label ends with "-"
378                     info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
379                 }
380             } else if(c==0x2e) {  // dot
381                 if(isLabel) {
382                     // Replacing with U+FFFD can be complicated for toASCII.
383                     ++i;  // '.' was copied to dest already
384                     break;
385                 }
386                 if(toASCII) {
387                     // Permit an empty label at the end but not elsewhere.
388                     if(i==labelStart && i<(srcLength-1)) {
389                         info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
390                     } else if((i-labelStart)>63) {
391                         info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
392                     }
393                 }
394                 info.errors|=info.labelErrors;
395                 info.labelErrors=0;
396                 labelStart=i+1;
397             }
398         }
399     }
400     info.errors|=info.labelErrors;
401     dest.releaseBuffer(i);
402     processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
403     if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
404         (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
405     ) {
406         info.errors|=UIDNA_ERROR_BIDI;
407     }
408     return dest;
409 }
410 
411 void
processUTF8(const StringPiece & src,UBool isLabel,UBool toASCII,ByteSink & dest,IDNAInfo & info,UErrorCode & errorCode) const412 UTS46::processUTF8(const StringPiece &src,
413                    UBool isLabel, UBool toASCII,
414                    ByteSink &dest,
415                    IDNAInfo &info, UErrorCode &errorCode) const {
416     if(U_FAILURE(errorCode)) {
417         return;
418     }
419     const char *srcArray=src.data();
420     int32_t srcLength=src.length();
421     if(srcArray==NULL && srcLength!=0) {
422         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
423         return;
424     }
425     // Arguments are fine, reset output values.
426     info.reset();
427     if(srcLength==0) {
428         if(toASCII) {
429             info.errors|=UIDNA_ERROR_EMPTY_LABEL;
430         }
431         dest.Flush();
432         return;
433     }
434     UnicodeString destString;
435     int32_t labelStart=0;
436     if(srcLength<=256) {  // length of stackArray[]
437         // ASCII fastpath
438         char stackArray[256];
439         int32_t destCapacity;
440         char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
441                                              stackArray, LENGTHOF(stackArray), &destCapacity);
442         UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
443         int32_t i;
444         for(i=0;; ++i) {
445             if(i==srcLength) {
446                 if(toASCII) {
447                     if((i-labelStart)>63) {
448                         info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
449                     }
450                     // There is a trailing dot if labelStart==i.
451                     if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
452                         info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
453                     }
454                 }
455                 info.errors|=info.labelErrors;
456                 dest.Append(destArray, i);
457                 dest.Flush();
458                 return;
459             }
460             char c=srcArray[i];
461             if((int8_t)c<0) {  // (uint8_t)c>0x7f
462                 break;
463             }
464             int cData=asciiData[(int)c];  // Cast: gcc warns about indexing with a char.
465             if(cData>0) {
466                 destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
467             } else if(cData<0 && disallowNonLDHDot) {
468                 break;  // Replacing with U+FFFD can be complicated for toASCII.
469             } else {
470                 destArray[i]=c;
471                 if(c==0x2d) {  // hyphen
472                     if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
473                         // "??--..." is Punycode or forbidden.
474                         break;
475                     }
476                     if(i==labelStart) {
477                         // label starts with "-"
478                         info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
479                     }
480                     if((i+1)==srcLength || srcArray[i+1]==0x2e) {
481                         // label ends with "-"
482                         info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
483                     }
484                 } else if(c==0x2e) {  // dot
485                     if(isLabel) {
486                         break;  // Replacing with U+FFFD can be complicated for toASCII.
487                     }
488                     if(toASCII) {
489                         // Permit an empty label at the end but not elsewhere.
490                         if(i==labelStart && i<(srcLength-1)) {
491                             info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
492                         } else if((i-labelStart)>63) {
493                             info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
494                         }
495                     }
496                     info.errors|=info.labelErrors;
497                     info.labelErrors=0;
498                     labelStart=i+1;
499                 }
500             }
501         }
502         info.errors|=info.labelErrors;
503         // Convert the processed ASCII prefix of the current label to UTF-16.
504         int32_t mappingStart=i-labelStart;
505         destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
506         // Output the previous ASCII labels and process the rest of src in UTF-16.
507         dest.Append(destArray, labelStart);
508         processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
509                        isLabel, toASCII,
510                        destString, info, errorCode);
511     } else {
512         // src is too long for the ASCII fastpath implementation.
513         processUnicode(UnicodeString::fromUTF8(src), 0, 0,
514                        isLabel, toASCII,
515                        destString, info, errorCode);
516     }
517     destString.toUTF8(dest);  // calls dest.Flush()
518     if(toASCII && !isLabel) {
519         // length==labelStart==254 means that there is a trailing dot (ok) and
520         // destString is empty (do not index at 253-labelStart).
521         int32_t length=labelStart+destString.length();
522         if( length>=254 && isASCIIString(destString) &&
523             (length>254 ||
524              (labelStart<254 && destString[253-labelStart]!=0x2e))
525         ) {
526             info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
527         }
528     }
529     if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
530         (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))
531     ) {
532         info.errors|=UIDNA_ERROR_BIDI;
533     }
534 }
535 
536 UnicodeString &
processUnicode(const UnicodeString & src,int32_t labelStart,int32_t mappingStart,UBool isLabel,UBool toASCII,UnicodeString & dest,IDNAInfo & info,UErrorCode & errorCode) const537 UTS46::processUnicode(const UnicodeString &src,
538                       int32_t labelStart, int32_t mappingStart,
539                       UBool isLabel, UBool toASCII,
540                       UnicodeString &dest,
541                       IDNAInfo &info, UErrorCode &errorCode) const {
542     if(mappingStart==0) {
543         uts46Norm2.normalize(src, dest, errorCode);
544     } else {
545         uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
546     }
547     if(U_FAILURE(errorCode)) {
548         return dest;
549     }
550     UBool doMapDevChars=
551         toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
552                   (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
553     const UChar *destArray=dest.getBuffer();
554     int32_t destLength=dest.length();
555     int32_t labelLimit=labelStart;
556     while(labelLimit<destLength) {
557         UChar c=destArray[labelLimit];
558         if(c==0x2e && !isLabel) {
559             int32_t labelLength=labelLimit-labelStart;
560             int32_t newLength=processLabel(dest, labelStart, labelLength,
561                                             toASCII, info, errorCode);
562             info.errors|=info.labelErrors;
563             info.labelErrors=0;
564             if(U_FAILURE(errorCode)) {
565                 return dest;
566             }
567             destArray=dest.getBuffer();
568             destLength+=newLength-labelLength;
569             labelLimit=labelStart+=newLength+1;
570         } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
571             info.isTransDiff=TRUE;
572             if(doMapDevChars) {
573                 destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
574                 if(U_FAILURE(errorCode)) {
575                     return dest;
576                 }
577                 destArray=dest.getBuffer();
578                 // Do not increment labelLimit in case c was removed.
579                 // All deviation characters have been mapped, no need to check for them again.
580                 doMapDevChars=FALSE;
581             } else {
582                 ++labelLimit;
583             }
584         } else {
585             ++labelLimit;
586         }
587     }
588     // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
589     // but not an empty label elsewhere nor a completely empty domain name.
590     // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
591     if(0==labelStart || labelStart<labelLimit) {
592         processLabel(dest, labelStart, labelLimit-labelStart,
593                       toASCII, info, errorCode);
594         info.errors|=info.labelErrors;
595     }
596     return dest;
597 }
598 
599 int32_t
mapDevChars(UnicodeString & dest,int32_t labelStart,int32_t mappingStart,UErrorCode & errorCode) const600 UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
601                    UErrorCode &errorCode) const {
602     int32_t length=dest.length();
603     UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);
604     if(s==NULL) {
605         errorCode=U_MEMORY_ALLOCATION_ERROR;
606         return length;
607     }
608     int32_t capacity=dest.getCapacity();
609     UBool didMapDevChars=FALSE;
610     int32_t readIndex=mappingStart, writeIndex=mappingStart;
611     do {
612         UChar c=s[readIndex++];
613         switch(c) {
614         case 0xdf:
615             // Map sharp s to ss.
616             didMapDevChars=TRUE;
617             s[writeIndex++]=0x73;  // Replace sharp s with first s.
618             // Insert second s and account for possible buffer reallocation.
619             if(writeIndex==readIndex) {
620                 if(length==capacity) {
621                     dest.releaseBuffer(length);
622                     s=dest.getBuffer(length+1);
623                     if(s==NULL) {
624                         errorCode=U_MEMORY_ALLOCATION_ERROR;
625                         return length;
626                     }
627                     capacity=dest.getCapacity();
628                 }
629                 u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);
630                 ++readIndex;
631             }
632             s[writeIndex++]=0x73;
633             ++length;
634             break;
635         case 0x3c2:  // Map final sigma to nonfinal sigma.
636             didMapDevChars=TRUE;
637             s[writeIndex++]=0x3c3;
638             break;
639         case 0x200c:  // Ignore/remove ZWNJ.
640         case 0x200d:  // Ignore/remove ZWJ.
641             didMapDevChars=TRUE;
642             --length;
643             break;
644         default:
645             // Only really necessary if writeIndex was different from readIndex.
646             s[writeIndex++]=c;
647             break;
648         }
649     } while(writeIndex<length);
650     dest.releaseBuffer(length);
651     if(didMapDevChars) {
652         // Mapping deviation characters might have resulted in an un-NFC string.
653         // We could use either the NFC or the UTS #46 normalizer.
654         // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
655         UnicodeString normalized;
656         uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
657         if(U_SUCCESS(errorCode)) {
658             dest.replace(labelStart, 0x7fffffff, normalized);
659             return dest.length();
660         }
661     }
662     return length;
663 }
664 
665 // Some non-ASCII characters are equivalent to sequences with
666 // non-LDH ASCII characters. To find them:
667 // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
668 static inline UBool
isNonASCIIDisallowedSTD3Valid(UChar32 c)669 isNonASCIIDisallowedSTD3Valid(UChar32 c) {
670     return c==0x2260 || c==0x226E || c==0x226F;
671 }
672 
673 // Replace the label in dest with the label string, if the label was modified.
674 // If &label==&dest then the label was modified in-place and labelLength
675 // is the new label length, different from label.length().
676 // If &label!=&dest then labelLength==label.length().
677 // Returns labelLength (= the new label length).
678 static int32_t
replaceLabel(UnicodeString & dest,int32_t destLabelStart,int32_t destLabelLength,const UnicodeString & label,int32_t labelLength)679 replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,
680              const UnicodeString &label, int32_t labelLength) {
681     if(&label!=&dest) {
682         dest.replace(destLabelStart, destLabelLength, label);
683     }
684     return labelLength;
685 }
686 
687 int32_t
processLabel(UnicodeString & dest,int32_t labelStart,int32_t labelLength,UBool toASCII,IDNAInfo & info,UErrorCode & errorCode) const688 UTS46::processLabel(UnicodeString &dest,
689                     int32_t labelStart, int32_t labelLength,
690                     UBool toASCII,
691                     IDNAInfo &info, UErrorCode &errorCode) const {
692     UnicodeString fromPunycode;
693     UnicodeString *labelString;
694     const UChar *label=dest.getBuffer()+labelStart;
695     int32_t destLabelStart=labelStart;
696     int32_t destLabelLength=labelLength;
697     UBool wasPunycode;
698     if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
699         // Label starts with "xn--", try to un-Punycode it.
700         wasPunycode=TRUE;
701         UChar *unicodeBuffer=fromPunycode.getBuffer(-1);  // capacity==-1: most labels should fit
702         if(unicodeBuffer==NULL) {
703             // Should never occur if we used capacity==-1 which uses the internal buffer.
704             errorCode=U_MEMORY_ALLOCATION_ERROR;
705             return labelLength;
706         }
707         UErrorCode punycodeErrorCode=U_ZERO_ERROR;
708         int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
709                                                 unicodeBuffer, fromPunycode.getCapacity(),
710                                                 NULL, &punycodeErrorCode);
711         if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
712             fromPunycode.releaseBuffer(0);
713             unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
714             if(unicodeBuffer==NULL) {
715                 errorCode=U_MEMORY_ALLOCATION_ERROR;
716                 return labelLength;
717             }
718             punycodeErrorCode=U_ZERO_ERROR;
719             unicodeLength=u_strFromPunycode(label+4, labelLength-4,
720                                             unicodeBuffer, fromPunycode.getCapacity(),
721                                             NULL, &punycodeErrorCode);
722         }
723         fromPunycode.releaseBuffer(unicodeLength);
724         if(U_FAILURE(punycodeErrorCode)) {
725             info.labelErrors|=UIDNA_ERROR_PUNYCODE;
726             return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
727         }
728         // Check for NFC, and for characters that are not
729         // valid or deviation characters according to the normalizer.
730         // If there is something wrong, then the string will change.
731         // Note that the normalizer passes through non-LDH ASCII and deviation characters.
732         // Deviation characters are ok in Punycode even in transitional processing.
733         // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
734         // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
735         UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
736         if(U_FAILURE(errorCode)) {
737             return labelLength;
738         }
739         if(!isValid) {
740             info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
741             return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
742         }
743         labelString=&fromPunycode;
744         label=fromPunycode.getBuffer();
745         labelStart=0;
746         labelLength=fromPunycode.length();
747     } else {
748         wasPunycode=FALSE;
749         labelString=&dest;
750     }
751     // Validity check
752     if(labelLength==0) {
753         if(toASCII) {
754             info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
755         }
756         return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
757     }
758     // labelLength>0
759     if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
760         // label starts with "??--"
761         info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
762     }
763     if(label[0]==0x2d) {
764         // label starts with "-"
765         info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
766     }
767     if(label[labelLength-1]==0x2d) {
768         // label ends with "-"
769         info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
770     }
771     // If the label was not a Punycode label, then it was the result of
772     // mapping, normalization and label segmentation.
773     // If the label was in Punycode, then we mapped it again above
774     // and checked its validity.
775     // Now we handle the STD3 restriction to LDH characters (if set)
776     // and we look for U+FFFD which indicates disallowed characters
777     // in a non-Punycode label or U+FFFD itself in a Punycode label.
778     // We also check for dots which can come from the input to a single-label function.
779     // Ok to cast away const because we own the UnicodeString.
780     UChar *s=(UChar *)label;
781     const UChar *limit=label+labelLength;
782     UChar oredChars=0;
783     // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
784     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
785     do {
786         UChar c=*s;
787         if(c<=0x7f) {
788             if(c==0x2e) {
789                 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
790                 *s=0xfffd;
791             } else if(disallowNonLDHDot && asciiData[c]<0) {
792                 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
793                 *s=0xfffd;
794             }
795         } else {
796             oredChars|=c;
797             if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
798                 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
799                 *s=0xfffd;
800             } else if(c==0xfffd) {
801                 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
802             }
803         }
804         ++s;
805     } while(s<limit);
806     // Check for a leading combining mark after other validity checks
807     // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
808     UChar32 c;
809     int32_t cpLength=0;
810     // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
811     U16_NEXT_UNSAFE(label, cpLength, c);
812     if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
813         info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK;
814         labelString->replace(labelStart, cpLength, (UChar)0xfffd);
815         label=labelString->getBuffer()+labelStart;
816         labelLength+=1-cpLength;
817         if(labelString==&dest) {
818             destLabelLength=labelLength;
819         }
820     }
821     if((info.labelErrors&severeErrors)==0) {
822         // Do contextual checks only if we do not have U+FFFD from a severe error
823         // because U+FFFD can make these checks fail.
824         if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
825             checkLabelBiDi(label, labelLength, info);
826         }
827         if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
828             !isLabelOkContextJ(label, labelLength)
829         ) {
830             info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
831         }
832         if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
833             checkLabelContextO(label, labelLength, info);
834         }
835         if(toASCII) {
836             if(wasPunycode) {
837                 // Leave a Punycode label unchanged if it has no severe errors.
838                 if(destLabelLength>63) {
839                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
840                 }
841                 return destLabelLength;
842             } else if(oredChars>=0x80) {
843                 // Contains non-ASCII characters.
844                 UnicodeString punycode;
845                 UChar *buffer=punycode.getBuffer(63);  // 63==maximum DNS label length
846                 if(buffer==NULL) {
847                     errorCode=U_MEMORY_ALLOCATION_ERROR;
848                     return destLabelLength;
849                 }
850                 buffer[0]=0x78;  // Write "xn--".
851                 buffer[1]=0x6e;
852                 buffer[2]=0x2d;
853                 buffer[3]=0x2d;
854                 int32_t punycodeLength=u_strToPunycode(label, labelLength,
855                                                       buffer+4, punycode.getCapacity()-4,
856                                                       NULL, &errorCode);
857                 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
858                     errorCode=U_ZERO_ERROR;
859                     punycode.releaseBuffer(4);
860                     buffer=punycode.getBuffer(4+punycodeLength);
861                     if(buffer==NULL) {
862                         errorCode=U_MEMORY_ALLOCATION_ERROR;
863                         return destLabelLength;
864                     }
865                     punycodeLength=u_strToPunycode(label, labelLength,
866                                                   buffer+4, punycode.getCapacity()-4,
867                                                   NULL, &errorCode);
868                 }
869                 punycodeLength+=4;
870                 punycode.releaseBuffer(punycodeLength);
871                 if(U_FAILURE(errorCode)) {
872                     return destLabelLength;
873                 }
874                 if(punycodeLength>63) {
875                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
876                 }
877                 return replaceLabel(dest, destLabelStart, destLabelLength,
878                                     punycode, punycodeLength);
879             } else {
880                 // all-ASCII label
881                 if(labelLength>63) {
882                     info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
883                 }
884             }
885         }
886     } else {
887         // If a Punycode label has severe errors,
888         // then leave it but make sure it does not look valid.
889         if(wasPunycode) {
890             info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
891             return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
892         }
893     }
894     return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
895 }
896 
897 // Make sure an ACE label does not look valid.
898 // Append U+FFFD if the label has only LDH characters.
899 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD.
900 int32_t
markBadACELabel(UnicodeString & dest,int32_t labelStart,int32_t labelLength,UBool toASCII,IDNAInfo & info) const901 UTS46::markBadACELabel(UnicodeString &dest,
902                        int32_t labelStart, int32_t labelLength,
903                        UBool toASCII, IDNAInfo &info) const {
904     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
905     UBool isASCII=TRUE;
906     UBool onlyLDH=TRUE;
907     const UChar *label=dest.getBuffer()+labelStart;
908     // Ok to cast away const because we own the UnicodeString.
909     UChar *s=(UChar *)label+4;  // After the initial "xn--".
910     const UChar *limit=label+labelLength;
911     do {
912         UChar c=*s;
913         if(c<=0x7f) {
914             if(c==0x2e) {
915                 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
916                 *s=0xfffd;
917                 isASCII=onlyLDH=FALSE;
918             } else if(asciiData[c]<0) {
919                 onlyLDH=FALSE;
920                 if(disallowNonLDHDot) {
921                     *s=0xfffd;
922                     isASCII=FALSE;
923                 }
924             }
925         } else {
926             isASCII=onlyLDH=FALSE;
927         }
928     } while(++s<limit);
929     if(onlyLDH) {
930         dest.insert(labelStart+labelLength, (UChar)0xfffd);
931         ++labelLength;
932     } else {
933         if(toASCII && isASCII && labelLength>63) {
934             info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
935         }
936     }
937     return labelLength;
938 }
939 
940 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);
941 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC);
942 const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK;
943 
944 const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER);
945 
946 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER);
947 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
948 const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER);
949 
950 const uint32_t ES_CS_ET_ON_BN_NSM_MASK=
951     U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)|
952     U_MASK(U_COMMON_NUMBER_SEPARATOR)|
953     U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)|
954     U_MASK(U_OTHER_NEUTRAL)|
955     U_MASK(U_BOUNDARY_NEUTRAL)|
956     U_MASK(U_DIR_NON_SPACING_MARK);
957 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
958 const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
959 
960 // We scan the whole label and check both for whether it contains RTL characters
961 // and whether it passes the BiDi Rule.
962 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
963 // that a domain name is a BiDi domain name (has an RTL label) only after
964 // processing several earlier labels.
965 void
checkLabelBiDi(const UChar * label,int32_t labelLength,IDNAInfo & info) const966 UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const {
967     // IDNA2008 BiDi rule
968     // Get the directionality of the first character.
969     UChar32 c;
970     int32_t i=0;
971     U16_NEXT_UNSAFE(label, i, c);
972     uint32_t firstMask=U_MASK(u_charDirection(c));
973     // 1. The first character must be a character with BIDI property L, R
974     // or AL.  If it has the R or AL property, it is an RTL label; if it
975     // has the L property, it is an LTR label.
976     if((firstMask&~L_R_AL_MASK)!=0) {
977         info.isOkBiDi=FALSE;
978     }
979     // Get the directionality of the last non-NSM character.
980     uint32_t lastMask;
981     for(;;) {
982         if(i>=labelLength) {
983             lastMask=firstMask;
984             break;
985         }
986         U16_PREV_UNSAFE(label, labelLength, c);
987         UCharDirection dir=u_charDirection(c);
988         if(dir!=U_DIR_NON_SPACING_MARK) {
989             lastMask=U_MASK(dir);
990             break;
991         }
992     }
993     // 3. In an RTL label, the end of the label must be a character with
994     // BIDI property R, AL, EN or AN, followed by zero or more
995     // characters with BIDI property NSM.
996     // 6. In an LTR label, the end of the label must be a character with
997     // BIDI property L or EN, followed by zero or more characters with
998     // BIDI property NSM.
999     if( (firstMask&L_MASK)!=0 ?
1000             (lastMask&~L_EN_MASK)!=0 :
1001             (lastMask&~R_AL_EN_AN_MASK)!=0
1002     ) {
1003         info.isOkBiDi=FALSE;
1004     }
1005     // Get the directionalities of the intervening characters.
1006     uint32_t mask=0;
1007     while(i<labelLength) {
1008         U16_NEXT_UNSAFE(label, i, c);
1009         mask|=U_MASK(u_charDirection(c));
1010     }
1011     if(firstMask&L_MASK) {
1012         // 5. In an LTR label, only characters with the BIDI properties L, EN,
1013         // ES, CS, ET, ON, BN and NSM are allowed.
1014         if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
1015             info.isOkBiDi=FALSE;
1016         }
1017     } else {
1018         // 2. In an RTL label, only characters with the BIDI properties R, AL,
1019         // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
1020         if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
1021             info.isOkBiDi=FALSE;
1022         }
1023         // 4. In an RTL label, if an EN is present, no AN may be present, and
1024         // vice versa.
1025         if((mask&EN_AN_MASK)==EN_AN_MASK) {
1026             info.isOkBiDi=FALSE;
1027         }
1028     }
1029     // An RTL label is a label that contains at least one character of type
1030     // R, AL or AN. [...]
1031     // A "BIDI domain name" is a domain name that contains at least one RTL
1032     // label. [...]
1033     // The following rule, consisting of six conditions, applies to labels
1034     // in BIDI domain names.
1035     if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) {
1036         info.isBiDi=TRUE;
1037     }
1038 }
1039 
1040 // Special code for the ASCII prefix of a BiDi domain name.
1041 // The ASCII prefix is all-LTR.
1042 
1043 // IDNA2008 BiDi rule, parts relevant to ASCII labels:
1044 // 1. The first character must be a character with BIDI property L [...]
1045 // 5. In an LTR label, only characters with the BIDI properties L, EN,
1046 // ES, CS, ET, ON, BN and NSM are allowed.
1047 // 6. In an LTR label, the end of the label must be a character with
1048 // BIDI property L or EN [...]
1049 
1050 // UTF-16 version, called for mapped ASCII prefix.
1051 // Cannot contain uppercase A-Z.
1052 // s[length-1] must be the trailing dot.
1053 static UBool
isASCIIOkBiDi(const UChar * s,int32_t length)1054 isASCIIOkBiDi(const UChar *s, int32_t length) {
1055     int32_t labelStart=0;
1056     for(int32_t i=0; i<length; ++i) {
1057         UChar c=s[i];
1058         if(c==0x2e) {  // dot
1059             if(i>labelStart) {
1060                 c=s[i-1];
1061                 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) {
1062                     // Last character in the label is not an L or EN.
1063                     return FALSE;
1064                 }
1065             }
1066             labelStart=i+1;
1067         } else if(i==labelStart) {
1068             if(!(0x61<=c && c<=0x7a)) {
1069                 // First character in the label is not an L.
1070                 return FALSE;
1071             }
1072         } else {
1073             if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
1074                 // Intermediate character in the label is a B, S or WS.
1075                 return FALSE;
1076             }
1077         }
1078     }
1079     return TRUE;
1080 }
1081 
1082 // UTF-8 version, called for source ASCII prefix.
1083 // Can contain uppercase A-Z.
1084 // s[length-1] must be the trailing dot.
1085 static UBool
isASCIIOkBiDi(const char * s,int32_t length)1086 isASCIIOkBiDi(const char *s, int32_t length) {
1087     int32_t labelStart=0;
1088     for(int32_t i=0; i<length; ++i) {
1089         char c=s[i];
1090         if(c==0x2e) {  // dot
1091             if(i>labelStart) {
1092                 c=s[i-1];
1093                 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) {
1094                     // Last character in the label is not an L or EN.
1095                     return FALSE;
1096                 }
1097             }
1098             labelStart=i+1;
1099         } else if(i==labelStart) {
1100             if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) {
1101                 // First character in the label is not an L.
1102                 return FALSE;
1103             }
1104         } else {
1105             if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
1106                 // Intermediate character in the label is a B, S or WS.
1107                 return FALSE;
1108             }
1109         }
1110     }
1111     return TRUE;
1112 }
1113 
1114 UBool
isLabelOkContextJ(const UChar * label,int32_t labelLength) const1115 UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const {
1116     const UBiDiProps *bdp=ubidi_getSingleton();
1117     // [IDNA2008-Tables]
1118     // 200C..200D  ; CONTEXTJ    # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
1119     for(int32_t i=0; i<labelLength; ++i) {
1120         if(label[i]==0x200c) {
1121             // Appendix A.1. ZERO WIDTH NON-JOINER
1122             // Rule Set:
1123             //  False;
1124             //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
1125             //  If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
1126             //     (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
1127             if(i==0) {
1128                 return FALSE;
1129             }
1130             UChar32 c;
1131             int32_t j=i;
1132             U16_PREV_UNSAFE(label, j, c);
1133             if(uts46Norm2.getCombiningClass(c)==9) {
1134                 continue;
1135             }
1136             // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
1137             for(;;) {
1138                 UJoiningType type=ubidi_getJoiningType(bdp, c);
1139                 if(type==U_JT_TRANSPARENT) {
1140                     if(j==0) {
1141                         return FALSE;
1142                     }
1143                     U16_PREV_UNSAFE(label, j, c);
1144                 } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) {
1145                     break;  // precontext fulfilled
1146                 } else {
1147                     return FALSE;
1148                 }
1149             }
1150             // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
1151             for(j=i+1;;) {
1152                 if(j==labelLength) {
1153                     return FALSE;
1154                 }
1155                 U16_NEXT_UNSAFE(label, j, c);
1156                 UJoiningType type=ubidi_getJoiningType(bdp, c);
1157                 if(type==U_JT_TRANSPARENT) {
1158                     // just skip this character
1159                 } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) {
1160                     break;  // postcontext fulfilled
1161                 } else {
1162                     return FALSE;
1163                 }
1164             }
1165         } else if(label[i]==0x200d) {
1166             // Appendix A.2. ZERO WIDTH JOINER (U+200D)
1167             // Rule Set:
1168             //  False;
1169             //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
1170             if(i==0) {
1171                 return FALSE;
1172             }
1173             UChar32 c;
1174             int32_t j=i;
1175             U16_PREV_UNSAFE(label, j, c);
1176             if(uts46Norm2.getCombiningClass(c)!=9) {
1177                 return FALSE;
1178             }
1179         }
1180     }
1181     return TRUE;
1182 }
1183 
1184 void
checkLabelContextO(const UChar * label,int32_t labelLength,IDNAInfo & info) const1185 UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const {
1186     int32_t labelEnd=labelLength-1;  // inclusive
1187     int32_t arabicDigits=0;  // -1 for 066x, +1 for 06Fx
1188     for(int32_t i=0; i<=labelEnd; ++i) {
1189         UChar32 c=label[i];
1190         if(c<0xb7) {
1191             // ASCII fastpath
1192         } else if(c<=0x6f9) {
1193             if(c==0xb7) {
1194                 // Appendix A.3. MIDDLE DOT (U+00B7)
1195                 // Rule Set:
1196                 //  False;
1197                 //  If Before(cp) .eq.  U+006C And
1198                 //     After(cp) .eq.  U+006C Then True;
1199                 if(!(0<i && label[i-1]==0x6c &&
1200                      i<labelEnd && label[i+1]==0x6c)) {
1201                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1202                 }
1203             } else if(c==0x375) {
1204                 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
1205                 // Rule Set:
1206                 //  False;
1207                 //  If Script(After(cp)) .eq.  Greek Then True;
1208                 UScriptCode script=USCRIPT_INVALID_CODE;
1209                 if(i<labelEnd) {
1210                     UErrorCode errorCode=U_ZERO_ERROR;
1211                     int32_t j=i+1;
1212                     U16_NEXT(label, j, labelLength, c);
1213                     script=uscript_getScript(c, &errorCode);
1214                 }
1215                 if(script!=USCRIPT_GREEK) {
1216                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1217                 }
1218             } else if(c==0x5f3 || c==0x5f4) {
1219                 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
1220                 // Rule Set:
1221                 //  False;
1222                 //  If Script(Before(cp)) .eq.  Hebrew Then True;
1223                 //
1224                 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
1225                 // Rule Set:
1226                 //  False;
1227                 //  If Script(Before(cp)) .eq.  Hebrew Then True;
1228                 UScriptCode script=USCRIPT_INVALID_CODE;
1229                 if(0<i) {
1230                     UErrorCode errorCode=U_ZERO_ERROR;
1231                     int32_t j=i;
1232                     U16_PREV(label, 0, j, c);
1233                     script=uscript_getScript(c, &errorCode);
1234                 }
1235                 if(script!=USCRIPT_HEBREW) {
1236                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1237                 }
1238             } else if(0x660<=c /* && c<=0x6f9 */) {
1239                 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
1240                 // Rule Set:
1241                 //  True;
1242                 //  For All Characters:
1243                 //    If cp .in. 06F0..06F9 Then False;
1244                 //  End For;
1245                 //
1246                 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
1247                 // Rule Set:
1248                 //  True;
1249                 //  For All Characters:
1250                 //    If cp .in. 0660..0669 Then False;
1251                 //  End For;
1252                 if(c<=0x669) {
1253                     if(arabicDigits>0) {
1254                         info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
1255                     }
1256                     arabicDigits=-1;
1257                 } else if(0x6f0<=c) {
1258                     if(arabicDigits<0) {
1259                         info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
1260                     }
1261                     arabicDigits=1;
1262                 }
1263             }
1264         } else if(c==0x30fb) {
1265             // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
1266             // Rule Set:
1267             //  False;
1268             //  For All Characters:
1269             //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
1270             //  End For;
1271             UErrorCode errorCode=U_ZERO_ERROR;
1272             for(int j=0;;) {
1273                 if(j>labelEnd) {
1274                     info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
1275                     break;
1276                 }
1277                 U16_NEXT(label, j, labelLength, c);
1278                 UScriptCode script=uscript_getScript(c, &errorCode);
1279                 if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) {
1280                     break;
1281                 }
1282             }
1283         }
1284     }
1285 }
1286 
1287 U_NAMESPACE_END
1288 
1289 // C API ------------------------------------------------------------------- ***
1290 
1291 U_NAMESPACE_USE
1292 
1293 U_CAPI UIDNA * U_EXPORT2
uidna_openUTS46(uint32_t options,UErrorCode * pErrorCode)1294 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) {
1295     return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode));
1296 }
1297 
1298 U_CAPI void U_EXPORT2
uidna_close(UIDNA * idna)1299 uidna_close(UIDNA *idna) {
1300     delete reinterpret_cast<IDNA *>(idna);
1301 }
1302 
1303 static UBool
checkArgs(const void * label,int32_t length,void * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1304 checkArgs(const void *label, int32_t length,
1305           void *dest, int32_t capacity,
1306           UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1307     if(U_FAILURE(*pErrorCode)) {
1308         return FALSE;
1309     }
1310     // sizeof(UIDNAInfo)=16 in the first API version.
1311     if(pInfo==NULL || pInfo->size<16) {
1312         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1313         return FALSE;
1314     }
1315     if( (label==NULL ? length!=0 : length<-1) ||
1316         (dest==NULL ? capacity!=0 : capacity<0) ||
1317         (dest==label && label!=NULL)
1318     ) {
1319         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1320         return FALSE;
1321     }
1322     // Set all *pInfo bytes to 0 except for the size field itself.
1323     uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size));
1324     return TRUE;
1325 }
1326 
1327 static void
idnaInfoToStruct(IDNAInfo & info,UIDNAInfo * pInfo)1328 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) {
1329     pInfo->isTransitionalDifferent=info.isTransitionalDifferent();
1330     pInfo->errors=info.getErrors();
1331 }
1332 
1333 U_CAPI int32_t U_EXPORT2
uidna_labelToASCII(const UIDNA * idna,const UChar * label,int32_t length,UChar * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1334 uidna_labelToASCII(const UIDNA *idna,
1335                    const UChar *label, int32_t length,
1336                    UChar *dest, int32_t capacity,
1337                    UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1338     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1339         return 0;
1340     }
1341     UnicodeString src((UBool)(length<0), label, length);
1342     UnicodeString destString(dest, 0, capacity);
1343     IDNAInfo info;
1344     reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode);
1345     idnaInfoToStruct(info, pInfo);
1346     return destString.extract(dest, capacity, *pErrorCode);
1347 }
1348 
1349 U_CAPI int32_t U_EXPORT2
uidna_labelToUnicode(const UIDNA * idna,const UChar * label,int32_t length,UChar * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1350 uidna_labelToUnicode(const UIDNA *idna,
1351                      const UChar *label, int32_t length,
1352                      UChar *dest, int32_t capacity,
1353                      UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1354     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1355         return 0;
1356     }
1357     UnicodeString src((UBool)(length<0), label, length);
1358     UnicodeString destString(dest, 0, capacity);
1359     IDNAInfo info;
1360     reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode);
1361     idnaInfoToStruct(info, pInfo);
1362     return destString.extract(dest, capacity, *pErrorCode);
1363 }
1364 
1365 U_CAPI int32_t U_EXPORT2
uidna_nameToASCII(const UIDNA * idna,const UChar * name,int32_t length,UChar * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1366 uidna_nameToASCII(const UIDNA *idna,
1367                   const UChar *name, int32_t length,
1368                   UChar *dest, int32_t capacity,
1369                   UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1370     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1371         return 0;
1372     }
1373     UnicodeString src((UBool)(length<0), name, length);
1374     UnicodeString destString(dest, 0, capacity);
1375     IDNAInfo info;
1376     reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pErrorCode);
1377     idnaInfoToStruct(info, pInfo);
1378     return destString.extract(dest, capacity, *pErrorCode);
1379 }
1380 
1381 U_CAPI int32_t U_EXPORT2
uidna_nameToUnicode(const UIDNA * idna,const UChar * name,int32_t length,UChar * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1382 uidna_nameToUnicode(const UIDNA *idna,
1383                     const UChar *name, int32_t length,
1384                     UChar *dest, int32_t capacity,
1385                     UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1386     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1387         return 0;
1388     }
1389     UnicodeString src((UBool)(length<0), name, length);
1390     UnicodeString destString(dest, 0, capacity);
1391     IDNAInfo info;
1392     reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode);
1393     idnaInfoToStruct(info, pInfo);
1394     return destString.extract(dest, capacity, *pErrorCode);
1395 }
1396 
1397 U_CAPI int32_t U_EXPORT2
uidna_labelToASCII_UTF8(const UIDNA * idna,const char * label,int32_t length,char * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1398 uidna_labelToASCII_UTF8(const UIDNA *idna,
1399                         const char *label, int32_t length,
1400                         char *dest, int32_t capacity,
1401                         UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1402     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1403         return 0;
1404     }
1405     StringPiece src(label, length<0 ? uprv_strlen(label) : length);
1406     CheckedArrayByteSink sink(dest, capacity);
1407     IDNAInfo info;
1408     reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode);
1409     idnaInfoToStruct(info, pInfo);
1410     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1411 }
1412 
1413 U_CAPI int32_t U_EXPORT2
uidna_labelToUnicodeUTF8(const UIDNA * idna,const char * label,int32_t length,char * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1414 uidna_labelToUnicodeUTF8(const UIDNA *idna,
1415                          const char *label, int32_t length,
1416                          char *dest, int32_t capacity,
1417                          UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1418     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
1419         return 0;
1420     }
1421     StringPiece src(label, length<0 ? uprv_strlen(label) : length);
1422     CheckedArrayByteSink sink(dest, capacity);
1423     IDNAInfo info;
1424     reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode);
1425     idnaInfoToStruct(info, pInfo);
1426     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1427 }
1428 
1429 U_CAPI int32_t U_EXPORT2
uidna_nameToASCII_UTF8(const UIDNA * idna,const char * name,int32_t length,char * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1430 uidna_nameToASCII_UTF8(const UIDNA *idna,
1431                        const char *name, int32_t length,
1432                        char *dest, int32_t capacity,
1433                        UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1434     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1435         return 0;
1436     }
1437     StringPiece src(name, length<0 ? uprv_strlen(name) : length);
1438     CheckedArrayByteSink sink(dest, capacity);
1439     IDNAInfo info;
1440     reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode);
1441     idnaInfoToStruct(info, pInfo);
1442     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1443 }
1444 
1445 U_CAPI int32_t U_EXPORT2
uidna_nameToUnicodeUTF8(const UIDNA * idna,const char * name,int32_t length,char * dest,int32_t capacity,UIDNAInfo * pInfo,UErrorCode * pErrorCode)1446 uidna_nameToUnicodeUTF8(const UIDNA *idna,
1447                         const char *name, int32_t length,
1448                         char *dest, int32_t capacity,
1449                         UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
1450     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
1451         return 0;
1452     }
1453     StringPiece src(name, length<0 ? uprv_strlen(name) : length);
1454     CheckedArrayByteSink sink(dest, capacity);
1455     IDNAInfo info;
1456     reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode);
1457     idnaInfoToStruct(info, pInfo);
1458     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
1459 }
1460 
1461 #endif  // UCONFIG_NO_IDNA
1462