1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2010-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.impl; 11 12 import java.util.EnumSet; 13 14 import ohos.global.icu.impl.Normalizer2Impl.UTF16Plus; 15 import ohos.global.icu.lang.UCharacter; 16 import ohos.global.icu.lang.UCharacterCategory; 17 import ohos.global.icu.lang.UCharacterDirection; 18 import ohos.global.icu.lang.UScript; 19 import ohos.global.icu.text.IDNA; 20 import ohos.global.icu.text.Normalizer2; 21 import ohos.global.icu.text.StringPrepParseException; 22 import ohos.global.icu.util.ICUException; 23 24 // Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG: 25 // 26 // The domain name length limit is 255 octets in an internal DNS representation 27 // where the last ("root") label is the empty label 28 // represented by length byte 0 alone. 29 // In a conventional string, this translates to 253 characters, or 254 30 // if there is a trailing dot for the root label. 31 32 /** 33 * UTS #46 (IDNA2008) implementation. 34 * @author Markus Scherer 35 * @hide exposed on OHOS 36 */ 37 public final class UTS46 extends IDNA { UTS46(int options)38 public UTS46(int options) { 39 this.options=options; 40 } 41 42 @Override labelToASCII(CharSequence label, StringBuilder dest, Info info)43 public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) { 44 return process(label, true, true, dest, info); 45 } 46 47 @Override labelToUnicode(CharSequence label, StringBuilder dest, Info info)48 public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) { 49 return process(label, true, false, dest, info); 50 } 51 52 @Override nameToASCII(CharSequence name, StringBuilder dest, Info info)53 public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) { 54 process(name, false, true, dest, info); 55 if( dest.length()>=254 && !info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG) && 56 isASCIIString(dest) && 57 (dest.length()>254 || dest.charAt(253)!='.') 58 ) { 59 addError(info, Error.DOMAIN_NAME_TOO_LONG); 60 } 61 return dest; 62 } 63 64 @Override nameToUnicode(CharSequence name, StringBuilder dest, Info info)65 public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) { 66 return process(name, false, false, dest, info); 67 } 68 69 private static final Normalizer2 uts46Norm2= 70 Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE); // uts46.nrm 71 final int options; 72 73 // Severe errors which usually result in a U+FFFD replacement character in the result string. 74 private static final EnumSet<Error> severeErrors=EnumSet.of( 75 Error.LEADING_COMBINING_MARK, 76 Error.DISALLOWED, 77 Error.PUNYCODE, 78 Error.LABEL_HAS_DOT, 79 Error.INVALID_ACE_LABEL); 80 81 private static boolean isASCIIString(CharSequence dest)82 isASCIIString(CharSequence dest) { 83 int length=dest.length(); 84 for(int i=0; i<length; ++i) { 85 if(dest.charAt(i)>0x7f) { 86 return false; 87 } 88 } 89 return true; 90 } 91 92 // UTS #46 data for ASCII characters. 93 // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase 94 // and passes through all other ASCII characters. 95 // If USE_STD3_RULES is set, then non-LDH characters are disallowed 96 // using this data. 97 // The ASCII fastpath also uses this data. 98 // Values: -1=disallowed 0==valid 1==mapped (lowercase) 99 private static final byte asciiData[]={ 100 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 101 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 102 // 002D..002E; valid # HYPHEN-MINUS..FULL STOP 103 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, 104 // 0030..0039; valid # DIGIT ZERO..DIGIT NINE 105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 106 // 0041..005A; mapped # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 107 -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, 109 // 0061..007A; valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z 110 -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 112 }; 113 114 private StringBuilder process(CharSequence src, boolean isLabel, boolean toASCII, StringBuilder dest, Info info)115 process(CharSequence src, 116 boolean isLabel, boolean toASCII, 117 StringBuilder dest, 118 Info info) { 119 // uts46Norm2.normalize() would do all of this error checking and setup, 120 // but with the ASCII fastpath we do not always call it, and do not 121 // call it first. 122 if(dest==src) { 123 throw new IllegalArgumentException(); 124 } 125 // Arguments are fine, reset output values. 126 dest.delete(0, 0x7fffffff); 127 resetInfo(info); 128 int srcLength=src.length(); 129 if(srcLength==0) { 130 addError(info, Error.EMPTY_LABEL); 131 return dest; 132 } 133 // ASCII fastpath 134 boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; 135 int labelStart=0; 136 int i; 137 for(i=0;; ++i) { 138 if(i==srcLength) { 139 if(toASCII) { 140 if((i-labelStart)>63) { 141 addLabelError(info, Error.LABEL_TOO_LONG); 142 } 143 // There is a trailing dot if labelStart==i. 144 if(!isLabel && i>=254 && (i>254 || labelStart<i)) { 145 addError(info, Error.DOMAIN_NAME_TOO_LONG); 146 } 147 } 148 promoteAndResetLabelErrors(info); 149 return dest; 150 } 151 char c=src.charAt(i); 152 if(c>0x7f) { 153 break; 154 } 155 int cData=asciiData[c]; 156 if(cData>0) { 157 dest.append((char)(c+0x20)); // Lowercase an uppercase ASCII letter. 158 } else if(cData<0 && disallowNonLDHDot) { 159 break; // Replacing with U+FFFD can be complicated for toASCII. 160 } else { 161 dest.append(c); 162 if(c=='-') { // hyphen 163 if(i==(labelStart+3) && src.charAt(i-1)=='-') { 164 // "??--..." is Punycode or forbidden. 165 ++i; // '-' was copied to dest already 166 break; 167 } 168 if(i==labelStart) { 169 // label starts with "-" 170 addLabelError(info, Error.LEADING_HYPHEN); 171 } 172 if((i+1)==srcLength || src.charAt(i+1)=='.') { 173 // label ends with "-" 174 addLabelError(info, Error.TRAILING_HYPHEN); 175 } 176 } else if(c=='.') { // dot 177 if(isLabel) { 178 // Replacing with U+FFFD can be complicated for toASCII. 179 ++i; // '.' was copied to dest already 180 break; 181 } 182 if(i==labelStart) { 183 addLabelError(info, Error.EMPTY_LABEL); 184 } 185 if(toASCII && (i-labelStart)>63) { 186 addLabelError(info, Error.LABEL_TOO_LONG); 187 } 188 promoteAndResetLabelErrors(info); 189 labelStart=i+1; 190 } 191 } 192 } 193 promoteAndResetLabelErrors(info); 194 processUnicode(src, labelStart, i, isLabel, toASCII, dest, info); 195 if( isBiDi(info) && !hasCertainErrors(info, severeErrors) && 196 (!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart))) 197 ) { 198 addError(info, Error.BIDI); 199 } 200 return dest; 201 } 202 203 private StringBuilder processUnicode(CharSequence src, int labelStart, int mappingStart, boolean isLabel, boolean toASCII, StringBuilder dest, Info info)204 processUnicode(CharSequence src, 205 int labelStart, int mappingStart, 206 boolean isLabel, boolean toASCII, 207 StringBuilder dest, 208 Info info) { 209 if(mappingStart==0) { 210 uts46Norm2.normalize(src, dest); 211 } else { 212 uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length())); 213 } 214 boolean doMapDevChars= 215 toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 : 216 (options&NONTRANSITIONAL_TO_UNICODE)==0; 217 int destLength=dest.length(); 218 int labelLimit=labelStart; 219 while(labelLimit<destLength) { 220 char c=dest.charAt(labelLimit); 221 if(c=='.' && !isLabel) { 222 int labelLength=labelLimit-labelStart; 223 int newLength=processLabel(dest, labelStart, labelLength, 224 toASCII, info); 225 promoteAndResetLabelErrors(info); 226 destLength+=newLength-labelLength; 227 labelLimit=labelStart+=newLength+1; 228 continue; 229 } else if(c<0xdf) { 230 // pass 231 } else if(c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) { 232 setTransitionalDifferent(info); 233 if(doMapDevChars) { 234 destLength=mapDevChars(dest, labelStart, labelLimit); 235 // All deviation characters have been mapped, no need to check for them again. 236 doMapDevChars=false; 237 // Do not increment labelLimit in case c was removed. 238 continue; 239 } 240 } else if(Character.isSurrogate(c)) { 241 if(UTF16Plus.isSurrogateLead(c) ? 242 (labelLimit+1)==destLength || 243 !Character.isLowSurrogate(dest.charAt(labelLimit+1)) : 244 labelLimit==labelStart || 245 !Character.isHighSurrogate(dest.charAt(labelLimit-1))) { 246 // Map an unpaired surrogate to U+FFFD before normalization so that when 247 // that removes characters we do not turn two unpaired ones into a pair. 248 addLabelError(info, Error.DISALLOWED); 249 dest.setCharAt(labelLimit, '\ufffd'); 250 } 251 } 252 ++labelLimit; 253 } 254 // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok) 255 // but not an empty label elsewhere nor a completely empty domain name. 256 // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0. 257 if(0==labelStart || labelStart<labelLimit) { 258 processLabel(dest, labelStart, labelLimit-labelStart, toASCII, info); 259 promoteAndResetLabelErrors(info); 260 } 261 return dest; 262 } 263 264 // returns the new dest.length() 265 private int mapDevChars(StringBuilder dest, int labelStart, int mappingStart)266 mapDevChars(StringBuilder dest, int labelStart, int mappingStart) { 267 int length=dest.length(); 268 boolean didMapDevChars=false; 269 for(int i=mappingStart; i<length;) { 270 char c=dest.charAt(i); 271 switch(c) { 272 case 0xdf: 273 // Map sharp s to ss. 274 didMapDevChars=true; 275 dest.setCharAt(i++, 's'); 276 dest.insert(i++, 's'); 277 ++length; 278 break; 279 case 0x3c2: // Map final sigma to nonfinal sigma. 280 didMapDevChars=true; 281 dest.setCharAt(i++, '\u03c3'); 282 break; 283 case 0x200c: // Ignore/remove ZWNJ. 284 case 0x200d: // Ignore/remove ZWJ. 285 didMapDevChars=true; 286 dest.delete(i, i+1); 287 --length; 288 break; 289 default: 290 ++i; 291 break; 292 } 293 } 294 if(didMapDevChars) { 295 // Mapping deviation characters might have resulted in an un-NFC string. 296 // We could use either the NFC or the UTS #46 normalizer. 297 // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file. 298 String normalized=uts46Norm2.normalize(dest.subSequence(labelStart, dest.length())); 299 dest.replace(labelStart, 0x7fffffff, normalized); 300 return dest.length(); 301 } 302 return length; 303 } 304 // Some non-ASCII characters are equivalent to sequences with 305 // non-LDH ASCII characters. To find them: 306 // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt) 307 private static boolean isNonASCIIDisallowedSTD3Valid(int c)308 isNonASCIIDisallowedSTD3Valid(int c) { 309 return c==0x2260 || c==0x226E || c==0x226F; 310 } 311 312 313 // Replace the label in dest with the label string, if the label was modified. 314 // If label==dest then the label was modified in-place and labelLength 315 // is the new label length, different from label.length(). 316 // If label!=dest then labelLength==label.length(). 317 // Returns labelLength (= the new label length). 318 private static int replaceLabel(StringBuilder dest, int destLabelStart, int destLabelLength, CharSequence label, int labelLength)319 replaceLabel(StringBuilder dest, int destLabelStart, int destLabelLength, 320 CharSequence label, int labelLength) { 321 if(label!=dest) { 322 dest.delete(destLabelStart, destLabelStart+destLabelLength).insert(destLabelStart, label); 323 // or dest.replace(destLabelStart, destLabelStart+destLabelLength, label.toString()); 324 // which would create a String rather than moving characters in the StringBuilder. 325 } 326 return labelLength; 327 } 328 329 // returns the new label length 330 private int processLabel(StringBuilder dest, int labelStart, int labelLength, boolean toASCII, Info info)331 processLabel(StringBuilder dest, 332 int labelStart, int labelLength, 333 boolean toASCII, 334 Info info) { 335 StringBuilder fromPunycode; 336 StringBuilder labelString; 337 int destLabelStart=labelStart; 338 int destLabelLength=labelLength; 339 boolean wasPunycode; 340 if( labelLength>=4 && 341 dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' && 342 dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-' 343 ) { 344 // Label starts with "xn--", try to un-Punycode it. 345 wasPunycode=true; 346 try { 347 fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null); 348 } catch (StringPrepParseException e) { 349 addLabelError(info, Error.PUNYCODE); 350 return markBadACELabel(dest, labelStart, labelLength, toASCII, info); 351 } 352 // Check for NFC, and for characters that are not 353 // valid or deviation characters according to the normalizer. 354 // If there is something wrong, then the string will change. 355 // Note that the normalizer passes through non-LDH ASCII and deviation characters. 356 // Deviation characters are ok in Punycode even in transitional processing. 357 // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES 358 // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too. 359 boolean isValid=uts46Norm2.isNormalized(fromPunycode); 360 if(!isValid) { 361 addLabelError(info, Error.INVALID_ACE_LABEL); 362 return markBadACELabel(dest, labelStart, labelLength, toASCII, info); 363 } 364 labelString=fromPunycode; 365 labelStart=0; 366 labelLength=fromPunycode.length(); 367 } else { 368 wasPunycode=false; 369 labelString=dest; 370 } 371 // Validity check 372 if(labelLength==0) { 373 addLabelError(info, Error.EMPTY_LABEL); 374 return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength); 375 } 376 // labelLength>0 377 if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') { 378 // label starts with "??--" 379 addLabelError(info, Error.HYPHEN_3_4); 380 } 381 if(labelString.charAt(labelStart)=='-') { 382 // label starts with "-" 383 addLabelError(info, Error.LEADING_HYPHEN); 384 } 385 if(labelString.charAt(labelStart+labelLength-1)=='-') { 386 // label ends with "-" 387 addLabelError(info, Error.TRAILING_HYPHEN); 388 } 389 // If the label was not a Punycode label, then it was the result of 390 // mapping, normalization and label segmentation. 391 // If the label was in Punycode, then we mapped it again above 392 // and checked its validity. 393 // Now we handle the STD3 restriction to LDH characters (if set) 394 // and we look for U+FFFD which indicates disallowed characters 395 // in a non-Punycode label or U+FFFD itself in a Punycode label. 396 // We also check for dots which can come from the input to a single-label function. 397 // Ok to cast away const because we own the UnicodeString. 398 int i=labelStart; 399 int limit=labelStart+labelLength; 400 char oredChars=0; 401 // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed. 402 boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; 403 do { 404 char c=labelString.charAt(i); 405 if(c<=0x7f) { 406 if(c=='.') { 407 addLabelError(info, Error.LABEL_HAS_DOT); 408 labelString.setCharAt(i, '\ufffd'); 409 } else if(disallowNonLDHDot && asciiData[c]<0) { 410 addLabelError(info, Error.DISALLOWED); 411 labelString.setCharAt(i, '\ufffd'); 412 } 413 } else { 414 oredChars|=c; 415 if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) { 416 addLabelError(info, Error.DISALLOWED); 417 labelString.setCharAt(i, '\ufffd'); 418 } else if(c==0xfffd) { 419 addLabelError(info, Error.DISALLOWED); 420 } 421 } 422 ++i; 423 } while(i<limit); 424 // Check for a leading combining mark after other validity checks 425 // so that we don't report IDNA.Error.DISALLOWED for the U+FFFD from here. 426 int c; 427 // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD. 428 c=labelString.codePointAt(labelStart); 429 if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) { 430 addLabelError(info, Error.LEADING_COMBINING_MARK); 431 labelString.setCharAt(labelStart, '\ufffd'); 432 if(c>0xffff) { 433 // Remove c's trail surrogate. 434 labelString.deleteCharAt(labelStart+1); 435 --labelLength; 436 if(labelString==dest) { 437 --destLabelLength; 438 } 439 } 440 } 441 if(!hasCertainLabelErrors(info, severeErrors)) { 442 // Do contextual checks only if we do not have U+FFFD from a severe error 443 // because U+FFFD can make these checks fail. 444 if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) { 445 checkLabelBiDi(labelString, labelStart, labelLength, info); 446 } 447 if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c && 448 !isLabelOkContextJ(labelString, labelStart, labelLength) 449 ) { 450 addLabelError(info, Error.CONTEXTJ); 451 } 452 if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) { 453 checkLabelContextO(labelString, labelStart, labelLength, info); 454 } 455 if(toASCII) { 456 if(wasPunycode) { 457 // Leave a Punycode label unchanged if it has no severe errors. 458 if(destLabelLength>63) { 459 addLabelError(info, Error.LABEL_TOO_LONG); 460 } 461 return destLabelLength; 462 } else if(oredChars>=0x80) { 463 // Contains non-ASCII characters. 464 StringBuilder punycode; 465 try { 466 punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null); 467 } catch (StringPrepParseException e) { 468 throw new ICUException(e); // unexpected 469 } 470 punycode.insert(0, "xn--"); 471 if(punycode.length()>63) { 472 addLabelError(info, Error.LABEL_TOO_LONG); 473 } 474 return replaceLabel(dest, destLabelStart, destLabelLength, 475 punycode, punycode.length()); 476 } else { 477 // all-ASCII label 478 if(labelLength>63) { 479 addLabelError(info, Error.LABEL_TOO_LONG); 480 } 481 } 482 } 483 } else { 484 // If a Punycode label has severe errors, 485 // then leave it but make sure it does not look valid. 486 if(wasPunycode) { 487 addLabelError(info, Error.INVALID_ACE_LABEL); 488 return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info); 489 } 490 } 491 return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength); 492 } 493 private int markBadACELabel(StringBuilder dest, int labelStart, int labelLength, boolean toASCII, Info info)494 markBadACELabel(StringBuilder dest, 495 int labelStart, int labelLength, 496 boolean toASCII, Info info) { 497 boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; 498 boolean isASCII=true; 499 boolean onlyLDH=true; 500 int i=labelStart+4; // After the initial "xn--". 501 int limit=labelStart+labelLength; 502 do { 503 char c=dest.charAt(i); 504 if(c<=0x7f) { 505 if(c=='.') { 506 addLabelError(info, Error.LABEL_HAS_DOT); 507 dest.setCharAt(i, '\ufffd'); 508 isASCII=onlyLDH=false; 509 } else if(asciiData[c]<0) { 510 onlyLDH=false; 511 if(disallowNonLDHDot) { 512 dest.setCharAt(i, '\ufffd'); 513 isASCII=false; 514 } 515 } 516 } else { 517 isASCII=onlyLDH=false; 518 } 519 } while(++i<limit); 520 if(onlyLDH) { 521 dest.insert(labelStart+labelLength, '\ufffd'); 522 ++labelLength; 523 } else { 524 if(toASCII && isASCII && labelLength>63) { 525 addLabelError(info, Error.LABEL_TOO_LONG); 526 } 527 } 528 return labelLength; 529 } 530 531 private static final int L_MASK=U_MASK(UCharacterDirection.LEFT_TO_RIGHT); 532 private static final int R_AL_MASK= 533 U_MASK(UCharacterDirection.RIGHT_TO_LEFT)| 534 U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC); 535 private static final int L_R_AL_MASK=L_MASK|R_AL_MASK; 536 537 private static final int R_AL_AN_MASK=R_AL_MASK|U_MASK(UCharacterDirection.ARABIC_NUMBER); 538 539 private static final int EN_AN_MASK= 540 U_MASK(UCharacterDirection.EUROPEAN_NUMBER)| 541 U_MASK(UCharacterDirection.ARABIC_NUMBER); 542 private static final int R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; 543 private static final int L_EN_MASK=L_MASK|U_MASK(UCharacterDirection.EUROPEAN_NUMBER); 544 545 private static final int ES_CS_ET_ON_BN_NSM_MASK= 546 U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)| 547 U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)| 548 U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)| 549 U_MASK(UCharacterDirection.OTHER_NEUTRAL)| 550 U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)| 551 U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK); 552 private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 553 private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 554 555 // We scan the whole label and check both for whether it contains RTL characters 556 // and whether it passes the BiDi Rule. 557 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find 558 // that a domain name is a BiDi domain name (has an RTL label) only after 559 // processing several earlier labels. 560 private void checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info)561 checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) { 562 // IDNA2008 BiDi rule 563 // Get the directionality of the first character. 564 int c; 565 int i=labelStart; 566 c=Character.codePointAt(label, i); 567 i+=Character.charCount(c); 568 int firstMask=U_MASK(UBiDiProps.INSTANCE.getClass(c)); 569 // 1. The first character must be a character with BIDI property L, R 570 // or AL. If it has the R or AL property, it is an RTL label; if it 571 // has the L property, it is an LTR label. 572 if((firstMask&~L_R_AL_MASK)!=0) { 573 setNotOkBiDi(info); 574 } 575 // Get the directionality of the last non-NSM character. 576 int lastMask; 577 int labelLimit=labelStart+labelLength; 578 for(;;) { 579 if(i>=labelLimit) { 580 lastMask=firstMask; 581 break; 582 } 583 c=Character.codePointBefore(label, labelLimit); 584 labelLimit-=Character.charCount(c); 585 int dir=UBiDiProps.INSTANCE.getClass(c); 586 if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) { 587 lastMask=U_MASK(dir); 588 break; 589 } 590 } 591 // 3. In an RTL label, the end of the label must be a character with 592 // BIDI property R, AL, EN or AN, followed by zero or more 593 // characters with BIDI property NSM. 594 // 6. In an LTR label, the end of the label must be a character with 595 // BIDI property L or EN, followed by zero or more characters with 596 // BIDI property NSM. 597 if( (firstMask&L_MASK)!=0 ? 598 (lastMask&~L_EN_MASK)!=0 : 599 (lastMask&~R_AL_EN_AN_MASK)!=0 600 ) { 601 setNotOkBiDi(info); 602 } 603 // Add the directionalities of the intervening characters. 604 int mask=firstMask|lastMask; 605 while(i<labelLimit) { 606 c=Character.codePointAt(label, i); 607 i+=Character.charCount(c); 608 mask|=U_MASK(UBiDiProps.INSTANCE.getClass(c)); 609 } 610 if((firstMask&L_MASK)!=0) { 611 // 5. In an LTR label, only characters with the BIDI properties L, EN, 612 // ES, CS, ET, ON, BN and NSM are allowed. 613 if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 614 setNotOkBiDi(info); 615 } 616 } else { 617 // 2. In an RTL label, only characters with the BIDI properties R, AL, 618 // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. 619 if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 620 setNotOkBiDi(info); 621 } 622 // 4. In an RTL label, if an EN is present, no AN may be present, and 623 // vice versa. 624 if((mask&EN_AN_MASK)==EN_AN_MASK) { 625 setNotOkBiDi(info); 626 } 627 } 628 // An RTL label is a label that contains at least one character of type 629 // R, AL or AN. [...] 630 // A "BIDI domain name" is a domain name that contains at least one RTL 631 // label. [...] 632 // The following rule, consisting of six conditions, applies to labels 633 // in BIDI domain names. 634 if((mask&R_AL_AN_MASK)!=0) { 635 setBiDi(info); 636 } 637 } 638 639 // Special code for the ASCII prefix of a BiDi domain name. 640 // The ASCII prefix is all-LTR. 641 642 // IDNA2008 BiDi rule, parts relevant to ASCII labels: 643 // 1. The first character must be a character with BIDI property L [...] 644 // 5. In an LTR label, only characters with the BIDI properties L, EN, 645 // ES, CS, ET, ON, BN and NSM are allowed. 646 // 6. In an LTR label, the end of the label must be a character with 647 // BIDI property L or EN [...] 648 649 // UTF-16 version, called for mapped ASCII prefix. 650 // Cannot contain uppercase A-Z. 651 // s[length-1] must be the trailing dot. 652 private static boolean isASCIIOkBiDi(CharSequence s, int length)653 isASCIIOkBiDi(CharSequence s, int length) { 654 int labelStart=0; 655 for(int i=0; i<length; ++i) { 656 char c=s.charAt(i); 657 if(c=='.') { // dot 658 if(i>labelStart) { 659 c=s.charAt(i-1); 660 if(!('a'<=c && c<='z') && !('0'<=c && c<='9')) { 661 // Last character in the label is not an L or EN. 662 return false; 663 } 664 } 665 labelStart=i+1; 666 } else if(i==labelStart) { 667 if(!('a'<=c && c<='z')) { 668 // First character in the label is not an L. 669 return false; 670 } 671 } else { 672 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 673 // Intermediate character in the label is a B, S or WS. 674 return false; 675 } 676 } 677 } 678 return true; 679 } 680 681 private boolean isLabelOkContextJ(CharSequence label, int labelStart, int labelLength)682 isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) { 683 // [IDNA2008-Tables] 684 // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 685 int labelLimit=labelStart+labelLength; 686 for(int i=labelStart; i<labelLimit; ++i) { 687 if(label.charAt(i)==0x200c) { 688 // Appendix A.1. ZERO WIDTH NON-JOINER 689 // Rule Set: 690 // False; 691 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 692 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C 693 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; 694 if(i==labelStart) { 695 return false; 696 } 697 int c; 698 int j=i; 699 c=Character.codePointBefore(label, j); 700 j-=Character.charCount(c); 701 if(uts46Norm2.getCombiningClass(c)==9) { 702 continue; 703 } 704 // check precontext (Joining_Type:{L,D})(Joining_Type:T)* 705 for(;;) { 706 /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c); 707 if(type==UCharacter.JoiningType.TRANSPARENT) { 708 if(j==0) { 709 return false; 710 } 711 c=Character.codePointBefore(label, j); 712 j-=Character.charCount(c); 713 } else if(type==UCharacter.JoiningType.LEFT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) { 714 break; // precontext fulfilled 715 } else { 716 return false; 717 } 718 } 719 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D}) 720 for(j=i+1;;) { 721 if(j==labelLimit) { 722 return false; 723 } 724 c=Character.codePointAt(label, j); 725 j+=Character.charCount(c); 726 /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c); 727 if(type==UCharacter.JoiningType.TRANSPARENT) { 728 // just skip this character 729 } else if(type==UCharacter.JoiningType.RIGHT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) { 730 break; // postcontext fulfilled 731 } else { 732 return false; 733 } 734 } 735 } else if(label.charAt(i)==0x200d) { 736 // Appendix A.2. ZERO WIDTH JOINER (U+200D) 737 // Rule Set: 738 // False; 739 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 740 if(i==labelStart) { 741 return false; 742 } 743 int c=Character.codePointBefore(label, i); 744 if(uts46Norm2.getCombiningClass(c)!=9) { 745 return false; 746 } 747 } 748 } 749 return true; 750 } 751 752 private void checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info)753 checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) { 754 int labelEnd=labelStart+labelLength-1; // inclusive 755 int arabicDigits=0; // -1 for 066x, +1 for 06Fx 756 for(int i=labelStart; i<=labelEnd; ++i) { 757 int c=label.charAt(i); 758 if(c<0xb7) { 759 // ASCII fastpath 760 } else if(c<=0x6f9) { 761 if(c==0xb7) { 762 // Appendix A.3. MIDDLE DOT (U+00B7) 763 // Rule Set: 764 // False; 765 // If Before(cp) .eq. U+006C And 766 // After(cp) .eq. U+006C Then True; 767 if(!(labelStart<i && label.charAt(i-1)=='l' && 768 i<labelEnd && label.charAt(i+1)=='l')) { 769 addLabelError(info, Error.CONTEXTO_PUNCTUATION); 770 } 771 } else if(c==0x375) { 772 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) 773 // Rule Set: 774 // False; 775 // If Script(After(cp)) .eq. Greek Then True; 776 if(!(i<labelEnd && 777 UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) { 778 addLabelError(info, Error.CONTEXTO_PUNCTUATION); 779 } 780 } else if(c==0x5f3 || c==0x5f4) { 781 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) 782 // Rule Set: 783 // False; 784 // If Script(Before(cp)) .eq. Hebrew Then True; 785 // 786 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) 787 // Rule Set: 788 // False; 789 // If Script(Before(cp)) .eq. Hebrew Then True; 790 if(!(labelStart<i && 791 UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) { 792 addLabelError(info, Error.CONTEXTO_PUNCTUATION); 793 } 794 } else if(0x660<=c /* && c<=0x6f9 */) { 795 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) 796 // Rule Set: 797 // True; 798 // For All Characters: 799 // If cp .in. 06F0..06F9 Then False; 800 // End For; 801 // 802 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) 803 // Rule Set: 804 // True; 805 // For All Characters: 806 // If cp .in. 0660..0669 Then False; 807 // End For; 808 if(c<=0x669) { 809 if(arabicDigits>0) { 810 addLabelError(info, Error.CONTEXTO_DIGITS); 811 } 812 arabicDigits=-1; 813 } else if(0x6f0<=c) { 814 if(arabicDigits<0) { 815 addLabelError(info, Error.CONTEXTO_DIGITS); 816 } 817 arabicDigits=1; 818 } 819 } 820 } else if(c==0x30fb) { 821 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) 822 // Rule Set: 823 // False; 824 // For All Characters: 825 // If Script(cp) .in. {Hiragana, Katakana, Han} Then True; 826 // End For; 827 for(int j=labelStart;; j+=Character.charCount(c)) { 828 if(j>labelEnd) { 829 addLabelError(info, Error.CONTEXTO_PUNCTUATION); 830 break; 831 } 832 c=Character.codePointAt(label, j); 833 int script=UScript.getScript(c); 834 if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) { 835 break; 836 } 837 } 838 } 839 } 840 } 841 842 // TODO: make public(?) -- in C, these are public in uchar.h U_MASK(int x)843 private static int U_MASK(int x) { 844 return 1<<x; 845 } U_GET_GC_MASK(int c)846 private static int U_GET_GC_MASK(int c) { 847 return (1<<UCharacter.getType(c)); 848 } 849 private static int U_GC_M_MASK= 850 U_MASK(UCharacterCategory.NON_SPACING_MARK)| 851 U_MASK(UCharacterCategory.ENCLOSING_MARK)| 852 U_MASK(UCharacterCategory.COMBINING_SPACING_MARK); 853 } 854