• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 *******************************************************************************
6 * Copyright (C) 2010-2014, International Business Machines
7 * Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 */
10 package ohos.global.icu.impl;
11 
12 import java.util.EnumSet;
13 
14 import ohos.global.icu.impl.Normalizer2Impl.UTF16Plus;
15 import ohos.global.icu.lang.UCharacter;
16 import ohos.global.icu.lang.UCharacterCategory;
17 import ohos.global.icu.lang.UCharacterDirection;
18 import ohos.global.icu.lang.UScript;
19 import ohos.global.icu.text.IDNA;
20 import ohos.global.icu.text.Normalizer2;
21 import ohos.global.icu.text.StringPrepParseException;
22 import ohos.global.icu.util.ICUException;
23 
24 // Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG:
25 //
26 // The domain name length limit is 255 octets in an internal DNS representation
27 // where the last ("root") label is the empty label
28 // represented by length byte 0 alone.
29 // In a conventional string, this translates to 253 characters, or 254
30 // if there is a trailing dot for the root label.
31 
32 /**
33  * UTS #46 (IDNA2008) implementation.
34  * @author Markus Scherer
35  * @hide exposed on OHOS
36  */
37 public final class UTS46 extends IDNA {
/**
 * Creates a UTS #46 processor with the given option bits.
 *
 * @param options bit set that is stored unchanged; the methods of this class test it
 *                against the inherited flag constants such as USE_STD3_RULES,
 *                CHECK_BIDI, CHECK_CONTEXTJ, CHECK_CONTEXTO,
 *                NONTRANSITIONAL_TO_ASCII and NONTRANSITIONAL_TO_UNICODE
 */
public UTS46(int options) {
    this.options = options;
}
41 
42     @Override
labelToASCII(CharSequence label, StringBuilder dest, Info info)43     public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) {
44         return process(label, true, true, dest, info);
45     }
46 
47     @Override
labelToUnicode(CharSequence label, StringBuilder dest, Info info)48     public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) {
49         return process(label, true, false, dest, info);
50     }
51 
52     @Override
nameToASCII(CharSequence name, StringBuilder dest, Info info)53     public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) {
54         process(name, false, true, dest, info);
55         if( dest.length()>=254 && !info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG) &&
56             isASCIIString(dest) &&
57             (dest.length()>254 || dest.charAt(253)!='.')
58         ) {
59             addError(info, Error.DOMAIN_NAME_TOO_LONG);
60         }
61         return dest;
62     }
63 
64     @Override
nameToUnicode(CharSequence name, StringBuilder dest, Info info)65     public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) {
66         return process(name, false, false, dest, info);
67     }
68 
69     private static final Normalizer2 uts46Norm2=
70         Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE);  // uts46.nrm
71     final int options;
72 
73     // Severe errors which usually result in a U+FFFD replacement character in the result string.
74     private static final EnumSet<Error> severeErrors=EnumSet.of(
75         Error.LEADING_COMBINING_MARK,
76         Error.DISALLOWED,
77         Error.PUNYCODE,
78         Error.LABEL_HAS_DOT,
79         Error.INVALID_ACE_LABEL);
80 
81     private static boolean
isASCIIString(CharSequence dest)82     isASCIIString(CharSequence dest) {
83         int length=dest.length();
84         for(int i=0; i<length; ++i) {
85             if(dest.charAt(i)>0x7f) {
86                 return false;
87             }
88         }
89         return true;
90     }
91 
92     // UTS #46 data for ASCII characters.
93     // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
94     // and passes through all other ASCII characters.
95     // If USE_STD3_RULES is set, then non-LDH characters are disallowed
96     // using this data.
97     // The ASCII fastpath also uses this data.
98     // Values: -1=disallowed  0==valid  1==mapped (lowercase)
99     private static final byte asciiData[]={
100         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
101         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
102         // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
103         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
104         // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
105          0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
106         // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
107         -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
108          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
109         // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
110         -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
111          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
112     };
113 
114     private StringBuilder
process(CharSequence src, boolean isLabel, boolean toASCII, StringBuilder dest, Info info)115     process(CharSequence src,
116             boolean isLabel, boolean toASCII,
117             StringBuilder dest,
118             Info info) {
119         // uts46Norm2.normalize() would do all of this error checking and setup,
120         // but with the ASCII fastpath we do not always call it, and do not
121         // call it first.
122         if(dest==src) {
123             throw new IllegalArgumentException();
124         }
125         // Arguments are fine, reset output values.
126         dest.delete(0, 0x7fffffff);
127         resetInfo(info);
128         int srcLength=src.length();
129         if(srcLength==0) {
130             addError(info, Error.EMPTY_LABEL);
131             return dest;
132         }
133         // ASCII fastpath
134         boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
135         int labelStart=0;
136         int i;
137         for(i=0;; ++i) {
138             if(i==srcLength) {
139                 if(toASCII) {
140                     if((i-labelStart)>63) {
141                         addLabelError(info, Error.LABEL_TOO_LONG);
142                     }
143                     // There is a trailing dot if labelStart==i.
144                     if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
145                         addError(info, Error.DOMAIN_NAME_TOO_LONG);
146                     }
147                 }
148                 promoteAndResetLabelErrors(info);
149                 return dest;
150             }
151             char c=src.charAt(i);
152             if(c>0x7f) {
153                 break;
154             }
155             int cData=asciiData[c];
156             if(cData>0) {
157                 dest.append((char)(c+0x20));  // Lowercase an uppercase ASCII letter.
158             } else if(cData<0 && disallowNonLDHDot) {
159                 break;  // Replacing with U+FFFD can be complicated for toASCII.
160             } else {
161                 dest.append(c);
162                 if(c=='-') {  // hyphen
163                     if(i==(labelStart+3) && src.charAt(i-1)=='-') {
164                         // "??--..." is Punycode or forbidden.
165                         ++i;  // '-' was copied to dest already
166                         break;
167                     }
168                     if(i==labelStart) {
169                         // label starts with "-"
170                         addLabelError(info, Error.LEADING_HYPHEN);
171                     }
172                     if((i+1)==srcLength || src.charAt(i+1)=='.') {
173                         // label ends with "-"
174                         addLabelError(info, Error.TRAILING_HYPHEN);
175                     }
176                 } else if(c=='.') {  // dot
177                     if(isLabel) {
178                         // Replacing with U+FFFD can be complicated for toASCII.
179                         ++i;  // '.' was copied to dest already
180                         break;
181                     }
182                     if(i==labelStart) {
183                         addLabelError(info, Error.EMPTY_LABEL);
184                     }
185                     if(toASCII && (i-labelStart)>63) {
186                         addLabelError(info, Error.LABEL_TOO_LONG);
187                     }
188                     promoteAndResetLabelErrors(info);
189                     labelStart=i+1;
190                 }
191             }
192         }
193         promoteAndResetLabelErrors(info);
194         processUnicode(src, labelStart, i, isLabel, toASCII, dest, info);
195         if( isBiDi(info) && !hasCertainErrors(info, severeErrors) &&
196             (!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart)))
197         ) {
198             addError(info, Error.BIDI);
199         }
200         return dest;
201     }
202 
203     private StringBuilder
processUnicode(CharSequence src, int labelStart, int mappingStart, boolean isLabel, boolean toASCII, StringBuilder dest, Info info)204     processUnicode(CharSequence src,
205                    int labelStart, int mappingStart,
206                    boolean isLabel, boolean toASCII,
207                    StringBuilder dest,
208                    Info info) {
209         if(mappingStart==0) {
210             uts46Norm2.normalize(src, dest);
211         } else {
212             uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length()));
213         }
214         boolean doMapDevChars=
215             toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 :
216                       (options&NONTRANSITIONAL_TO_UNICODE)==0;
217         int destLength=dest.length();
218         int labelLimit=labelStart;
219         while(labelLimit<destLength) {
220             char c=dest.charAt(labelLimit);
221             if(c=='.' && !isLabel) {
222                 int labelLength=labelLimit-labelStart;
223                 int newLength=processLabel(dest, labelStart, labelLength,
224                                                 toASCII, info);
225                 promoteAndResetLabelErrors(info);
226                 destLength+=newLength-labelLength;
227                 labelLimit=labelStart+=newLength+1;
228                 continue;
229             } else if(c<0xdf) {
230                 // pass
231             } else if(c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
232                 setTransitionalDifferent(info);
233                 if(doMapDevChars) {
234                     destLength=mapDevChars(dest, labelStart, labelLimit);
235                     // All deviation characters have been mapped, no need to check for them again.
236                     doMapDevChars=false;
237                     // Do not increment labelLimit in case c was removed.
238                     continue;
239                 }
240             } else if(Character.isSurrogate(c)) {
241                 if(UTF16Plus.isSurrogateLead(c) ?
242                         (labelLimit+1)==destLength ||
243                             !Character.isLowSurrogate(dest.charAt(labelLimit+1)) :
244                         labelLimit==labelStart ||
245                             !Character.isHighSurrogate(dest.charAt(labelLimit-1))) {
246                     // Map an unpaired surrogate to U+FFFD before normalization so that when
247                     // that removes characters we do not turn two unpaired ones into a pair.
248                     addLabelError(info, Error.DISALLOWED);
249                     dest.setCharAt(labelLimit, '\ufffd');
250                 }
251             }
252             ++labelLimit;
253         }
254         // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
255         // but not an empty label elsewhere nor a completely empty domain name.
256         // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
257         if(0==labelStart || labelStart<labelLimit) {
258             processLabel(dest, labelStart, labelLimit-labelStart, toASCII, info);
259             promoteAndResetLabelErrors(info);
260         }
261         return dest;
262     }
263 
264     // returns the new dest.length()
265     private int
mapDevChars(StringBuilder dest, int labelStart, int mappingStart)266     mapDevChars(StringBuilder dest, int labelStart, int mappingStart) {
267         int length=dest.length();
268         boolean didMapDevChars=false;
269         for(int i=mappingStart; i<length;) {
270             char c=dest.charAt(i);
271             switch(c) {
272             case 0xdf:
273                 // Map sharp s to ss.
274                 didMapDevChars=true;
275                 dest.setCharAt(i++, 's');
276                 dest.insert(i++, 's');
277                 ++length;
278                 break;
279             case 0x3c2:  // Map final sigma to nonfinal sigma.
280                 didMapDevChars=true;
281                 dest.setCharAt(i++, '\u03c3');
282                 break;
283             case 0x200c:  // Ignore/remove ZWNJ.
284             case 0x200d:  // Ignore/remove ZWJ.
285                 didMapDevChars=true;
286                 dest.delete(i, i+1);
287                 --length;
288                 break;
289             default:
290                 ++i;
291                 break;
292             }
293         }
294         if(didMapDevChars) {
295             // Mapping deviation characters might have resulted in an un-NFC string.
296             // We could use either the NFC or the UTS #46 normalizer.
297             // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
298             String normalized=uts46Norm2.normalize(dest.subSequence(labelStart, dest.length()));
299             dest.replace(labelStart, 0x7fffffff, normalized);
300             return dest.length();
301         }
302         return length;
303     }
304     // Some non-ASCII characters are equivalent to sequences with
305     // non-LDH ASCII characters. To find them:
306     // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
307     private static boolean
isNonASCIIDisallowedSTD3Valid(int c)308     isNonASCIIDisallowedSTD3Valid(int c) {
309         return c==0x2260 || c==0x226E || c==0x226F;
310     }
311 
312 
313     // Replace the label in dest with the label string, if the label was modified.
314     // If label==dest then the label was modified in-place and labelLength
315     // is the new label length, different from label.length().
316     // If label!=dest then labelLength==label.length().
317     // Returns labelLength (= the new label length).
318     private static int
replaceLabel(StringBuilder dest, int destLabelStart, int destLabelLength, CharSequence label, int labelLength)319     replaceLabel(StringBuilder dest, int destLabelStart, int destLabelLength,
320                  CharSequence label, int labelLength) {
321         if(label!=dest) {
322             dest.delete(destLabelStart, destLabelStart+destLabelLength).insert(destLabelStart, label);
323             // or dest.replace(destLabelStart, destLabelStart+destLabelLength, label.toString());
324             // which would create a String rather than moving characters in the StringBuilder.
325         }
326         return labelLength;
327     }
328 
329     // returns the new label length
330     private int
processLabel(StringBuilder dest, int labelStart, int labelLength, boolean toASCII, Info info)331     processLabel(StringBuilder dest,
332                  int labelStart, int labelLength,
333                  boolean toASCII,
334                  Info info) {
335         StringBuilder fromPunycode;
336         StringBuilder labelString;
337         int destLabelStart=labelStart;
338         int destLabelLength=labelLength;
339         boolean wasPunycode;
340         if( labelLength>=4 &&
341             dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' &&
342             dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-'
343         ) {
344             // Label starts with "xn--", try to un-Punycode it.
345             wasPunycode=true;
346             try {
347                 fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null);
348             } catch (StringPrepParseException e) {
349                 addLabelError(info, Error.PUNYCODE);
350                 return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
351             }
352             // Check for NFC, and for characters that are not
353             // valid or deviation characters according to the normalizer.
354             // If there is something wrong, then the string will change.
355             // Note that the normalizer passes through non-LDH ASCII and deviation characters.
356             // Deviation characters are ok in Punycode even in transitional processing.
357             // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
358             // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
359             boolean isValid=uts46Norm2.isNormalized(fromPunycode);
360             if(!isValid) {
361                 addLabelError(info, Error.INVALID_ACE_LABEL);
362                 return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
363             }
364             labelString=fromPunycode;
365             labelStart=0;
366             labelLength=fromPunycode.length();
367         } else {
368             wasPunycode=false;
369             labelString=dest;
370         }
371         // Validity check
372         if(labelLength==0) {
373             addLabelError(info, Error.EMPTY_LABEL);
374             return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
375         }
376         // labelLength>0
377         if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') {
378             // label starts with "??--"
379             addLabelError(info, Error.HYPHEN_3_4);
380         }
381         if(labelString.charAt(labelStart)=='-') {
382             // label starts with "-"
383             addLabelError(info, Error.LEADING_HYPHEN);
384         }
385         if(labelString.charAt(labelStart+labelLength-1)=='-') {
386             // label ends with "-"
387             addLabelError(info, Error.TRAILING_HYPHEN);
388         }
389         // If the label was not a Punycode label, then it was the result of
390         // mapping, normalization and label segmentation.
391         // If the label was in Punycode, then we mapped it again above
392         // and checked its validity.
393         // Now we handle the STD3 restriction to LDH characters (if set)
394         // and we look for U+FFFD which indicates disallowed characters
395         // in a non-Punycode label or U+FFFD itself in a Punycode label.
396         // We also check for dots which can come from the input to a single-label function.
397         // Ok to cast away const because we own the UnicodeString.
398         int i=labelStart;
399         int limit=labelStart+labelLength;
400         char oredChars=0;
401         // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
402         boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
403         do {
404             char c=labelString.charAt(i);
405             if(c<=0x7f) {
406                 if(c=='.') {
407                     addLabelError(info, Error.LABEL_HAS_DOT);
408                     labelString.setCharAt(i, '\ufffd');
409                 } else if(disallowNonLDHDot && asciiData[c]<0) {
410                     addLabelError(info, Error.DISALLOWED);
411                     labelString.setCharAt(i, '\ufffd');
412                 }
413             } else {
414                 oredChars|=c;
415                 if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
416                     addLabelError(info, Error.DISALLOWED);
417                     labelString.setCharAt(i, '\ufffd');
418                 } else if(c==0xfffd) {
419                     addLabelError(info, Error.DISALLOWED);
420                 }
421             }
422             ++i;
423         } while(i<limit);
424         // Check for a leading combining mark after other validity checks
425         // so that we don't report IDNA.Error.DISALLOWED for the U+FFFD from here.
426         int c;
427         // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
428         c=labelString.codePointAt(labelStart);
429         if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
430             addLabelError(info, Error.LEADING_COMBINING_MARK);
431             labelString.setCharAt(labelStart, '\ufffd');
432             if(c>0xffff) {
433                 // Remove c's trail surrogate.
434                 labelString.deleteCharAt(labelStart+1);
435                 --labelLength;
436                 if(labelString==dest) {
437                     --destLabelLength;
438                 }
439             }
440         }
441         if(!hasCertainLabelErrors(info, severeErrors)) {
442             // Do contextual checks only if we do not have U+FFFD from a severe error
443             // because U+FFFD can make these checks fail.
444             if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) {
445                 checkLabelBiDi(labelString, labelStart, labelLength, info);
446             }
447             if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
448                 !isLabelOkContextJ(labelString, labelStart, labelLength)
449             ) {
450                 addLabelError(info, Error.CONTEXTJ);
451             }
452             if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
453                 checkLabelContextO(labelString, labelStart, labelLength, info);
454             }
455             if(toASCII) {
456                 if(wasPunycode) {
457                     // Leave a Punycode label unchanged if it has no severe errors.
458                     if(destLabelLength>63) {
459                         addLabelError(info, Error.LABEL_TOO_LONG);
460                     }
461                     return destLabelLength;
462                 } else if(oredChars>=0x80) {
463                     // Contains non-ASCII characters.
464                     StringBuilder punycode;
465                     try {
466                         punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null);
467                     } catch (StringPrepParseException e) {
468                         throw new ICUException(e);  // unexpected
469                     }
470                     punycode.insert(0, "xn--");
471                     if(punycode.length()>63) {
472                         addLabelError(info, Error.LABEL_TOO_LONG);
473                     }
474                     return replaceLabel(dest, destLabelStart, destLabelLength,
475                                         punycode, punycode.length());
476                 } else {
477                     // all-ASCII label
478                     if(labelLength>63) {
479                         addLabelError(info, Error.LABEL_TOO_LONG);
480                     }
481                 }
482             }
483         } else {
484             // If a Punycode label has severe errors,
485             // then leave it but make sure it does not look valid.
486             if(wasPunycode) {
487                 addLabelError(info, Error.INVALID_ACE_LABEL);
488                 return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
489             }
490         }
491         return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
492     }
493     private int
markBadACELabel(StringBuilder dest, int labelStart, int labelLength, boolean toASCII, Info info)494     markBadACELabel(StringBuilder dest,
495                     int labelStart, int labelLength,
496                     boolean toASCII, Info info) {
497         boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
498         boolean isASCII=true;
499         boolean onlyLDH=true;
500         int i=labelStart+4;  // After the initial "xn--".
501         int limit=labelStart+labelLength;
502         do {
503             char c=dest.charAt(i);
504             if(c<=0x7f) {
505                 if(c=='.') {
506                     addLabelError(info, Error.LABEL_HAS_DOT);
507                     dest.setCharAt(i, '\ufffd');
508                     isASCII=onlyLDH=false;
509                 } else if(asciiData[c]<0) {
510                     onlyLDH=false;
511                     if(disallowNonLDHDot) {
512                         dest.setCharAt(i, '\ufffd');
513                         isASCII=false;
514                     }
515                 }
516             } else {
517                 isASCII=onlyLDH=false;
518             }
519         } while(++i<limit);
520         if(onlyLDH) {
521             dest.insert(labelStart+labelLength, '\ufffd');
522             ++labelLength;
523         } else {
524             if(toASCII && isASCII && labelLength>63) {
525                 addLabelError(info, Error.LABEL_TOO_LONG);
526             }
527         }
528         return labelLength;
529     }
530 
531     private static final int L_MASK=U_MASK(UCharacterDirection.LEFT_TO_RIGHT);
532     private static final int R_AL_MASK=
533         U_MASK(UCharacterDirection.RIGHT_TO_LEFT)|
534         U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC);
535     private static final int L_R_AL_MASK=L_MASK|R_AL_MASK;
536 
537     private static final int R_AL_AN_MASK=R_AL_MASK|U_MASK(UCharacterDirection.ARABIC_NUMBER);
538 
539     private static final int EN_AN_MASK=
540         U_MASK(UCharacterDirection.EUROPEAN_NUMBER)|
541         U_MASK(UCharacterDirection.ARABIC_NUMBER);
542     private static final int R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
543     private static final int L_EN_MASK=L_MASK|U_MASK(UCharacterDirection.EUROPEAN_NUMBER);
544 
545     private static final int ES_CS_ET_ON_BN_NSM_MASK=
546         U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)|
547         U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)|
548         U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)|
549         U_MASK(UCharacterDirection.OTHER_NEUTRAL)|
550         U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)|
551         U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK);
552     private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
553     private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
554 
555     // We scan the whole label and check both for whether it contains RTL characters
556     // and whether it passes the BiDi Rule.
557     // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
558     // that a domain name is a BiDi domain name (has an RTL label) only after
559     // processing several earlier labels.
560     private void
checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info)561     checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) {
562         // IDNA2008 BiDi rule
563         // Get the directionality of the first character.
564         int c;
565         int i=labelStart;
566         c=Character.codePointAt(label, i);
567         i+=Character.charCount(c);
568         int firstMask=U_MASK(UBiDiProps.INSTANCE.getClass(c));
569         // 1. The first character must be a character with BIDI property L, R
570         // or AL.  If it has the R or AL property, it is an RTL label; if it
571         // has the L property, it is an LTR label.
572         if((firstMask&~L_R_AL_MASK)!=0) {
573             setNotOkBiDi(info);
574         }
575         // Get the directionality of the last non-NSM character.
576         int lastMask;
577         int labelLimit=labelStart+labelLength;
578         for(;;) {
579             if(i>=labelLimit) {
580                 lastMask=firstMask;
581                 break;
582             }
583             c=Character.codePointBefore(label, labelLimit);
584             labelLimit-=Character.charCount(c);
585             int dir=UBiDiProps.INSTANCE.getClass(c);
586             if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) {
587                 lastMask=U_MASK(dir);
588                 break;
589             }
590         }
591         // 3. In an RTL label, the end of the label must be a character with
592         // BIDI property R, AL, EN or AN, followed by zero or more
593         // characters with BIDI property NSM.
594         // 6. In an LTR label, the end of the label must be a character with
595         // BIDI property L or EN, followed by zero or more characters with
596         // BIDI property NSM.
597         if( (firstMask&L_MASK)!=0 ?
598                 (lastMask&~L_EN_MASK)!=0 :
599                 (lastMask&~R_AL_EN_AN_MASK)!=0
600         ) {
601             setNotOkBiDi(info);
602         }
603         // Add the directionalities of the intervening characters.
604         int mask=firstMask|lastMask;
605         while(i<labelLimit) {
606             c=Character.codePointAt(label, i);
607             i+=Character.charCount(c);
608             mask|=U_MASK(UBiDiProps.INSTANCE.getClass(c));
609         }
610         if((firstMask&L_MASK)!=0) {
611             // 5. In an LTR label, only characters with the BIDI properties L, EN,
612             // ES, CS, ET, ON, BN and NSM are allowed.
613             if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
614                 setNotOkBiDi(info);
615             }
616         } else {
617             // 2. In an RTL label, only characters with the BIDI properties R, AL,
618             // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
619             if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
620                 setNotOkBiDi(info);
621             }
622             // 4. In an RTL label, if an EN is present, no AN may be present, and
623             // vice versa.
624             if((mask&EN_AN_MASK)==EN_AN_MASK) {
625                 setNotOkBiDi(info);
626             }
627         }
628         // An RTL label is a label that contains at least one character of type
629         // R, AL or AN. [...]
630         // A "BIDI domain name" is a domain name that contains at least one RTL
631         // label. [...]
632         // The following rule, consisting of six conditions, applies to labels
633         // in BIDI domain names.
634         if((mask&R_AL_AN_MASK)!=0) {
635             setBiDi(info);
636         }
637     }
638 
639     // Special code for the ASCII prefix of a BiDi domain name.
640     // The ASCII prefix is all-LTR.
641 
642     // IDNA2008 BiDi rule, parts relevant to ASCII labels:
643     // 1. The first character must be a character with BIDI property L [...]
644     // 5. In an LTR label, only characters with the BIDI properties L, EN,
645     // ES, CS, ET, ON, BN and NSM are allowed.
646     // 6. In an LTR label, the end of the label must be a character with
647     // BIDI property L or EN [...]
648 
649     // UTF-16 version, called for mapped ASCII prefix.
650     // Cannot contain uppercase A-Z.
651     // s[length-1] must be the trailing dot.
652     private static boolean
isASCIIOkBiDi(CharSequence s, int length)653     isASCIIOkBiDi(CharSequence s, int length) {
654         int labelStart=0;
655         for(int i=0; i<length; ++i) {
656             char c=s.charAt(i);
657             if(c=='.') {  // dot
658                 if(i>labelStart) {
659                     c=s.charAt(i-1);
660                     if(!('a'<=c && c<='z') && !('0'<=c && c<='9')) {
661                         // Last character in the label is not an L or EN.
662                         return false;
663                     }
664                 }
665                 labelStart=i+1;
666             } else if(i==labelStart) {
667                 if(!('a'<=c && c<='z')) {
668                     // First character in the label is not an L.
669                     return false;
670                 }
671             } else {
672                 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
673                     // Intermediate character in the label is a B, S or WS.
674                     return false;
675                 }
676             }
677         }
678         return true;
679     }
680 
681     private boolean
isLabelOkContextJ(CharSequence label, int labelStart, int labelLength)682     isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) {
683         // [IDNA2008-Tables]
684         // 200C..200D  ; CONTEXTJ    # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
685         int labelLimit=labelStart+labelLength;
686         for(int i=labelStart; i<labelLimit; ++i) {
687             if(label.charAt(i)==0x200c) {
688                 // Appendix A.1. ZERO WIDTH NON-JOINER
689                 // Rule Set:
690                 //  False;
691                 //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
692                 //  If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
693                 //     (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
694                 if(i==labelStart) {
695                     return false;
696                 }
697                 int c;
698                 int j=i;
699                 c=Character.codePointBefore(label, j);
700                 j-=Character.charCount(c);
701                 if(uts46Norm2.getCombiningClass(c)==9) {
702                     continue;
703                 }
704                 // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
705                 for(;;) {
706                     /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c);
707                     if(type==UCharacter.JoiningType.TRANSPARENT) {
708                         if(j==0) {
709                             return false;
710                         }
711                         c=Character.codePointBefore(label, j);
712                         j-=Character.charCount(c);
713                     } else if(type==UCharacter.JoiningType.LEFT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) {
714                         break;  // precontext fulfilled
715                     } else {
716                         return false;
717                     }
718                 }
719                 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
720                 for(j=i+1;;) {
721                     if(j==labelLimit) {
722                         return false;
723                     }
724                     c=Character.codePointAt(label, j);
725                     j+=Character.charCount(c);
726                     /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c);
727                     if(type==UCharacter.JoiningType.TRANSPARENT) {
728                         // just skip this character
729                     } else if(type==UCharacter.JoiningType.RIGHT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) {
730                         break;  // postcontext fulfilled
731                     } else {
732                         return false;
733                     }
734                 }
735             } else if(label.charAt(i)==0x200d) {
736                 // Appendix A.2. ZERO WIDTH JOINER (U+200D)
737                 // Rule Set:
738                 //  False;
739                 //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
740                 if(i==labelStart) {
741                     return false;
742                 }
743                 int c=Character.codePointBefore(label, i);
744                 if(uts46Norm2.getCombiningClass(c)!=9) {
745                     return false;
746                 }
747             }
748         }
749         return true;
750     }
751 
752     private void
checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info)753     checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
754         int labelEnd=labelStart+labelLength-1;  // inclusive
755         int arabicDigits=0;  // -1 for 066x, +1 for 06Fx
756         for(int i=labelStart; i<=labelEnd; ++i) {
757             int c=label.charAt(i);
758             if(c<0xb7) {
759                 // ASCII fastpath
760             } else if(c<=0x6f9) {
761                 if(c==0xb7) {
762                     // Appendix A.3. MIDDLE DOT (U+00B7)
763                     // Rule Set:
764                     //  False;
765                     //  If Before(cp) .eq.  U+006C And
766                     //     After(cp) .eq.  U+006C Then True;
767                     if(!(labelStart<i && label.charAt(i-1)=='l' &&
768                          i<labelEnd && label.charAt(i+1)=='l')) {
769                         addLabelError(info, Error.CONTEXTO_PUNCTUATION);
770                     }
771                 } else if(c==0x375) {
772                     // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
773                     // Rule Set:
774                     //  False;
775                     //  If Script(After(cp)) .eq.  Greek Then True;
776                     if(!(i<labelEnd &&
777                          UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) {
778                         addLabelError(info, Error.CONTEXTO_PUNCTUATION);
779                     }
780                 } else if(c==0x5f3 || c==0x5f4) {
781                     // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
782                     // Rule Set:
783                     //  False;
784                     //  If Script(Before(cp)) .eq.  Hebrew Then True;
785                     //
786                     // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
787                     // Rule Set:
788                     //  False;
789                     //  If Script(Before(cp)) .eq.  Hebrew Then True;
790                     if(!(labelStart<i &&
791                          UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) {
792                         addLabelError(info, Error.CONTEXTO_PUNCTUATION);
793                     }
794                 } else if(0x660<=c /* && c<=0x6f9 */) {
795                     // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
796                     // Rule Set:
797                     //  True;
798                     //  For All Characters:
799                     //    If cp .in. 06F0..06F9 Then False;
800                     //  End For;
801                     //
802                     // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
803                     // Rule Set:
804                     //  True;
805                     //  For All Characters:
806                     //    If cp .in. 0660..0669 Then False;
807                     //  End For;
808                     if(c<=0x669) {
809                         if(arabicDigits>0) {
810                             addLabelError(info, Error.CONTEXTO_DIGITS);
811                         }
812                         arabicDigits=-1;
813                     } else if(0x6f0<=c) {
814                         if(arabicDigits<0) {
815                             addLabelError(info, Error.CONTEXTO_DIGITS);
816                         }
817                         arabicDigits=1;
818                     }
819                 }
820             } else if(c==0x30fb) {
821                 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
822                 // Rule Set:
823                 //  False;
824                 //  For All Characters:
825                 //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
826                 //  End For;
827                 for(int j=labelStart;; j+=Character.charCount(c)) {
828                     if(j>labelEnd) {
829                         addLabelError(info, Error.CONTEXTO_PUNCTUATION);
830                         break;
831                     }
832                     c=Character.codePointAt(label, j);
833                     int script=UScript.getScript(c);
834                     if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
835                         break;
836                     }
837                 }
838             }
839         }
840     }
841 
842     // TODO: make public(?) -- in C, these are public in uchar.h
U_MASK(int x)843     private static int U_MASK(int x) {
844         return 1<<x;
845     }
U_GET_GC_MASK(int c)846     private static int U_GET_GC_MASK(int c) {
847         return (1<<UCharacter.getType(c));
848     }
849     private static int U_GC_M_MASK=
850         U_MASK(UCharacterCategory.NON_SPACING_MARK)|
851         U_MASK(UCharacterCategory.ENCLOSING_MARK)|
852         U_MASK(UCharacterCategory.COMBINING_SPACING_MARK);
853 }
854