1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2003-2010, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import com.ibm.icu.text.IDNA; 12 import com.ibm.icu.text.StringPrep; 13 import com.ibm.icu.text.StringPrepParseException; 14 import com.ibm.icu.text.UCharacterIterator; 15 16 /** 17 * IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java 18 * while extending that class to support IDNA2008/UTS #46 as well. 19 * @author Ram Viswanadha 20 */ 21 public final class IDNA2003 { 22 /* IDNA ACE Prefix is "xn--" */ 23 private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ; 24 //private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length; 25 26 private static final int MAX_LABEL_LENGTH = 63; 27 private static final int HYPHEN = 0x002D; 28 private static final int CAPITAL_A = 0x0041; 29 private static final int CAPITAL_Z = 0x005A; 30 private static final int LOWER_CASE_DELTA = 0x0020; 31 private static final int FULL_STOP = 0x002E; 32 private static final int MAX_DOMAIN_NAME_LENGTH = 255; 33 34 // The NamePrep profile object 35 private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP); 36 startsWithPrefix(StringBuffer src)37 private static boolean startsWithPrefix(StringBuffer src){ 38 boolean startsWithPrefix = true; 39 40 if(src.length() < ACE_PREFIX.length){ 41 return false; 42 } 43 for(int i=0; i<ACE_PREFIX.length;i++){ 44 if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){ 45 startsWithPrefix = false; 46 } 47 } 48 return startsWithPrefix; 49 } 50 toASCIILower(char ch)51 private static char toASCIILower(char ch){ 52 if(CAPITAL_A <= ch && ch <= CAPITAL_Z){ 53 return (char)(ch + LOWER_CASE_DELTA); 54 } 55 return ch; 56 } 57 toASCIILower(CharSequence src)58 private static StringBuffer toASCIILower(CharSequence src){ 59 StringBuffer dest = new StringBuffer(); 60 for(int i=0; i<src.length();i++){ 61 dest.append(toASCIILower(src.charAt(i))); 62 } 63 return dest; 64 } 65 compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2)66 private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){ 67 char c1,c2; 68 int rc; 69 for(int i =0;/* no condition */;i++) { 70 /* If we reach the ends of both strings then they match */ 71 if(i == s1.length()) { 72 return 0; 73 } 74 75 c1 = s1.charAt(i); 76 c2 = s2.charAt(i); 77 78 /* Case-insensitive comparison */ 79 if(c1!=c2) { 80 rc=toASCIILower(c1)-toASCIILower(c2); 81 if(rc!=0) { 82 return rc; 83 } 84 } 85 } 86 } 87 getSeparatorIndex(char[] src,int start, int limit)88 private static int getSeparatorIndex(char[] src,int start, int limit){ 89 for(; start<limit;start++){ 90 if(isLabelSeparator(src[start])){ 91 return start; 92 } 93 } 94 // we have not found the separator just return length 95 return start; 96 } 97 98 /* 99 private static int getSeparatorIndex(UCharacterIterator iter){ 100 int currentIndex = iter.getIndex(); 101 int separatorIndex = 0; 102 int ch; 103 while((ch=iter.next())!= UCharacterIterator.DONE){ 104 if(isLabelSeparator(ch)){ 105 separatorIndex = iter.getIndex(); 106 iter.setIndex(currentIndex); 107 return separatorIndex; 108 } 109 } 110 // reset index 111 iter.setIndex(currentIndex); 112 // we have not found the separator just return the length 113 114 } 115 */ 116 117 isLDHChar(int ch)118 private static boolean isLDHChar(int ch){ 119 // high runner case 120 if(ch>0x007A){ 121 return false; 122 } 123 //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A] 124 if( (ch==0x002D) || 125 (0x0030 <= ch && ch <= 0x0039) || 126 (0x0041 <= ch && ch <= 0x005A) || 127 (0x0061 <= ch && ch <= 0x007A) 128 ){ 129 return true; 130 } 131 return false; 132 } 133 134 /** 135 * Ascertain if the given code point is a label separator as 136 * defined by the IDNA RFC 137 * 138 * @param ch The code point to be ascertained 139 * @return true if the char is a label separator 140 * @stable ICU 2.8 141 */ isLabelSeparator(int ch)142 private static boolean isLabelSeparator(int ch){ 143 switch(ch){ 144 case 0x002e: 145 case 0x3002: 146 case 0xFF0E: 147 case 0xFF61: 148 return true; 149 default: 150 return false; 151 } 152 } 153 convertToASCII(UCharacterIterator src, int options)154 public static StringBuffer convertToASCII(UCharacterIterator src, int options) 155 throws StringPrepParseException{ 156 157 boolean[] caseFlags = null; 158 159 // the source contains all ascii codepoints 160 boolean srcIsASCII = true; 161 // assume the source contains all LDH codepoints 162 boolean srcIsLDH = true; 163 164 //get the options 165 boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0); 166 int ch; 167 // step 1 168 while((ch = src.next())!= UCharacterIterator.DONE){ 169 if(ch> 0x7f){ 170 srcIsASCII = false; 171 } 172 } 173 int failPos = -1; 174 src.setToStart(); 175 StringBuffer processOut = null; 176 // step 2 is performed only if the source contains non ASCII 177 if(!srcIsASCII){ 178 // step 2 179 processOut = namePrep.prepare(src, options); 180 }else{ 181 processOut = new StringBuffer(src.getText()); 182 } 183 int poLen = processOut.length(); 184 185 if(poLen==0){ 186 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL); 187 } 188 StringBuffer dest = new StringBuffer(); 189 190 // reset the variable to verify if output of prepare is ASCII or not 191 srcIsASCII = true; 192 193 // step 3 & 4 194 for(int j=0;j<poLen;j++ ){ 195 ch=processOut.charAt(j); 196 if(ch > 0x7F){ 197 srcIsASCII = false; 198 }else if(isLDHChar(ch)==false){ 199 // here we do not assemble surrogates 200 // since we know that LDH code points 201 // are in the ASCII range only 202 srcIsLDH = false; 203 failPos = j; 204 } 205 } 206 207 if(useSTD3ASCIIRules == true){ 208 // verify 3a and 3b 209 if( srcIsLDH == false /* source contains some non-LDH characters */ 210 || processOut.charAt(0) == HYPHEN 211 || processOut.charAt(processOut.length()-1) == HYPHEN){ 212 213 /* populate the parseError struct */ 214 if(srcIsLDH==false){ 215 throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules", 216 StringPrepParseException.STD3_ASCII_RULES_ERROR, 217 processOut.toString(), 218 (failPos>0) ? (failPos-1) : failPos); 219 }else if(processOut.charAt(0) == HYPHEN){ 220 throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", 221 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0); 222 223 }else{ 224 throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", 225 StringPrepParseException.STD3_ASCII_RULES_ERROR, 226 processOut.toString(), 227 (poLen>0) ? poLen-1 : poLen); 228 229 } 230 } 231 } 232 if(srcIsASCII){ 233 dest = processOut; 234 }else{ 235 // step 5 : verify the sequence does not begin with ACE prefix 236 if(!startsWithPrefix(processOut)){ 237 238 //step 6: encode the sequence with punycode 239 caseFlags = new boolean[poLen]; 240 241 StringBuilder punyout = Punycode.encode(processOut,caseFlags); 242 243 // convert all codepoints to lower case ASCII 244 StringBuffer lowerOut = toASCIILower(punyout); 245 246 //Step 7: prepend the ACE prefix 247 dest.append(ACE_PREFIX,0,ACE_PREFIX.length); 248 //Step 6: copy the contents in b2 into dest 249 dest.append(lowerOut); 250 }else{ 251 252 throw new StringPrepParseException("The input does not start with the ACE Prefix.", 253 StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0); 254 } 255 } 256 if(dest.length() > MAX_LABEL_LENGTH){ 257 throw new StringPrepParseException("The labels in the input are too long. Length > 63.", 258 StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0); 259 } 260 return dest; 261 } 262 convertIDNToASCII(String src,int options)263 public static StringBuffer convertIDNToASCII(String src,int options) 264 throws StringPrepParseException{ 265 266 char[] srcArr = src.toCharArray(); 267 StringBuffer result = new StringBuffer(); 268 int sepIndex=0; 269 int oldSepIndex=0; 270 for(;;){ 271 sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); 272 String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex); 273 //make sure this is not a root label separator. 274 if(!(label.length()==0 && sepIndex==srcArr.length)){ 275 UCharacterIterator iter = UCharacterIterator.getInstance(label); 276 result.append(convertToASCII(iter,options)); 277 } 278 if(sepIndex==srcArr.length){ 279 break; 280 } 281 282 // increment the sepIndex to skip past the separator 283 sepIndex++; 284 oldSepIndex = sepIndex; 285 result.append((char)FULL_STOP); 286 } 287 if(result.length() > MAX_DOMAIN_NAME_LENGTH){ 288 throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR); 289 } 290 return result; 291 } 292 convertToUnicode(UCharacterIterator src, int options)293 public static StringBuffer convertToUnicode(UCharacterIterator src, int options) 294 throws StringPrepParseException{ 295 296 boolean[] caseFlags = null; 297 298 // the source contains all ascii codepoints 299 boolean srcIsASCII = true; 300 // assume the source contains all LDH codepoints 301 //boolean srcIsLDH = true; 302 303 //get the options 304 //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0); 305 306 //int failPos = -1; 307 int ch; 308 int saveIndex = src.getIndex(); 309 // step 1: find out if all the codepoints in src are ASCII 310 while((ch=src.next())!= UCharacterIterator.DONE){ 311 if(ch>0x7F){ 312 srcIsASCII = false; 313 }/*else if((srcIsLDH = isLDHChar(ch))==false){ 314 failPos = src.getIndex(); 315 }*/ 316 } 317 StringBuffer processOut; 318 319 if(srcIsASCII == false){ 320 try { 321 // step 2: process the string 322 src.setIndex(saveIndex); 323 processOut = namePrep.prepare(src,options); 324 } catch (StringPrepParseException ex) { 325 return new StringBuffer(src.getText()); 326 } 327 328 }else{ 329 //just point to source 330 processOut = new StringBuffer(src.getText()); 331 } 332 // TODO: 333 // The RFC states that 334 // <quote> 335 // ToUnicode never fails. If any step fails, then the original input 336 // is returned immediately in that step. 337 // </quote> 338 339 //step 3: verify ACE Prefix 340 if(startsWithPrefix(processOut)){ 341 StringBuffer decodeOut = null; 342 343 //step 4: Remove the ACE Prefix 344 String temp = processOut.substring(ACE_PREFIX.length,processOut.length()); 345 346 //step 5: Decode using punycode 347 try { 348 decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags)); 349 } catch (StringPrepParseException e) { 350 decodeOut = null; 351 } 352 353 //step 6:Apply toASCII 354 if (decodeOut != null) { 355 StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options); 356 357 //step 7: verify 358 if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){ 359 // throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed", 360 // StringPrepParseException.VERIFICATION_ERROR); 361 decodeOut = null; 362 } 363 } 364 365 //step 8: return output of step 5 366 if (decodeOut != null) { 367 return decodeOut; 368 } 369 } 370 371 // }else{ 372 // // verify that STD3 ASCII rules are satisfied 373 // if(useSTD3ASCIIRules == true){ 374 // if( srcIsLDH == false /* source contains some non-LDH characters */ 375 // || processOut.charAt(0) == HYPHEN 376 // || processOut.charAt(processOut.length()-1) == HYPHEN){ 377 // 378 // if(srcIsLDH==false){ 379 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", 380 // StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(), 381 // (failPos>0) ? (failPos-1) : failPos); 382 // }else if(processOut.charAt(0) == HYPHEN){ 383 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", 384 // StringPrepParseException.STD3_ASCII_RULES_ERROR, 385 // processOut.toString(),0); 386 // 387 // }else{ 388 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", 389 // StringPrepParseException.STD3_ASCII_RULES_ERROR, 390 // processOut.toString(), 391 // processOut.length()); 392 // 393 // } 394 // } 395 // } 396 // // just return the source 397 // return new StringBuffer(src.getText()); 398 // } 399 400 return new StringBuffer(src.getText()); 401 } 402 convertIDNToUnicode(String src, int options)403 public static StringBuffer convertIDNToUnicode(String src, int options) 404 throws StringPrepParseException{ 405 406 char[] srcArr = src.toCharArray(); 407 StringBuffer result = new StringBuffer(); 408 int sepIndex=0; 409 int oldSepIndex=0; 410 for(;;){ 411 sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); 412 String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex); 413 if(label.length()==0 && sepIndex!=srcArr.length ){ 414 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL); 415 } 416 UCharacterIterator iter = UCharacterIterator.getInstance(label); 417 result.append(convertToUnicode(iter,options)); 418 if(sepIndex==srcArr.length){ 419 break; 420 } 421 // Unlike the ToASCII operation we don't normalize the label separators 422 result.append(srcArr[sepIndex]); 423 // increment the sepIndex to skip past the separator 424 sepIndex++; 425 oldSepIndex =sepIndex; 426 } 427 if(result.length() > MAX_DOMAIN_NAME_LENGTH){ 428 throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR); 429 } 430 return result; 431 } 432 compare(String s1, String s2, int options)433 public static int compare(String s1, String s2, int options) throws StringPrepParseException{ 434 StringBuffer s1Out = convertIDNToASCII(s1, options); 435 StringBuffer s2Out = convertIDNToASCII(s2, options); 436 return compareCaseInsensitiveASCII(s1Out,s2Out); 437 } 438 } 439