1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2003-2010, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import com.ibm.icu.text.IDNA; 12 import com.ibm.icu.text.StringPrep; 13 import com.ibm.icu.text.StringPrepParseException; 14 import com.ibm.icu.text.UCharacterIterator; 15 16 /** 17 * IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java 18 * while extending that class to support IDNA2008/UTS #46 as well. 19 * @author Ram Viswanadha 20 */ 21 public final class IDNA2003 { 22 /* IDNA ACE Prefix is "xn--" */ 23 private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ; 24 //private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length; 25 26 private static final int MAX_LABEL_LENGTH = 63; 27 private static final int HYPHEN = 0x002D; 28 private static final int CAPITAL_A = 0x0041; 29 private static final int CAPITAL_Z = 0x005A; 30 private static final int LOWER_CASE_DELTA = 0x0020; 31 private static final int FULL_STOP = 0x002E; 32 private static final int MAX_DOMAIN_NAME_LENGTH = 255; 33 34 // The NamePrep profile object 35 private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP); 36 startsWithPrefix(StringBuffer src)37 private static boolean startsWithPrefix(StringBuffer src){ 38 if(src.length() < ACE_PREFIX.length){ 39 return false; 40 } 41 for(int i=0; i<ACE_PREFIX.length;i++){ 42 if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){ 43 return false; 44 } 45 } 46 return true; 47 } 48 toASCIILower(char ch)49 private static char toASCIILower(char ch){ 50 if(CAPITAL_A <= ch && ch <= CAPITAL_Z){ 51 return (char)(ch + LOWER_CASE_DELTA); 52 } 53 return ch; 54 } 55 toASCIILower(CharSequence src)56 private static StringBuffer toASCIILower(CharSequence src){ 57 StringBuffer dest = new StringBuffer(); 58 for(int i=0; i<src.length();i++){ 59 dest.append(toASCIILower(src.charAt(i))); 60 } 61 return dest; 62 } 63 compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2)64 private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){ 65 char c1,c2; 66 int rc; 67 for(int i =0;/* no condition */;i++) { 68 /* If we reach the ends of both strings then they match */ 69 if(i == s1.length()) { 70 return 0; 71 } 72 73 c1 = s1.charAt(i); 74 c2 = s2.charAt(i); 75 76 /* Case-insensitive comparison */ 77 if(c1!=c2) { 78 rc=toASCIILower(c1)-toASCIILower(c2); 79 if(rc!=0) { 80 return rc; 81 } 82 } 83 } 84 } 85 getSeparatorIndex(char[] src,int start, int limit)86 private static int getSeparatorIndex(char[] src,int start, int limit){ 87 for(; start<limit;start++){ 88 if(isLabelSeparator(src[start])){ 89 return start; 90 } 91 } 92 // we have not found the separator just return length 93 return start; 94 } 95 96 /* 97 private static int getSeparatorIndex(UCharacterIterator iter){ 98 int currentIndex = iter.getIndex(); 99 int separatorIndex = 0; 100 int ch; 101 while((ch=iter.next())!= UCharacterIterator.DONE){ 102 if(isLabelSeparator(ch)){ 103 separatorIndex = iter.getIndex(); 104 iter.setIndex(currentIndex); 105 return separatorIndex; 106 } 107 } 108 // reset index 109 iter.setIndex(currentIndex); 110 // we have not found the separator just return the length 111 112 } 113 */ 114 115 isLDHChar(int ch)116 private static boolean isLDHChar(int ch){ 117 // high runner case 118 if(ch>0x007A){ 119 return false; 120 } 121 //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A] 122 if( (ch==0x002D) || 123 (0x0030 <= ch && ch <= 0x0039) || 124 (0x0041 <= ch && ch <= 0x005A) || 125 (0x0061 <= ch && ch <= 0x007A) 126 ){ 127 return true; 128 } 129 return false; 130 } 131 132 /** 133 * Ascertain if the given code point is a label separator as 134 * defined by the IDNA RFC 135 * 136 * @param ch The code point to be ascertained 137 * @return true if the char is a label separator 138 * @stable ICU 2.8 139 */ isLabelSeparator(int ch)140 private static boolean isLabelSeparator(int ch){ 141 switch(ch){ 142 case 0x002e: 143 case 0x3002: 144 case 0xFF0E: 145 case 0xFF61: 146 return true; 147 default: 148 return false; 149 } 150 } 151 convertToASCII(UCharacterIterator src, int options)152 public static StringBuffer convertToASCII(UCharacterIterator src, int options) 153 throws StringPrepParseException{ 154 155 boolean[] caseFlags = null; 156 157 // the source contains all ascii codepoints 158 boolean srcIsASCII = true; 159 // assume the source contains all LDH codepoints 160 boolean srcIsLDH = true; 161 162 //get the options 163 boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0); 164 int ch; 165 // step 1 166 while((ch = src.next())!= UCharacterIterator.DONE){ 167 if(ch> 0x7f){ 168 srcIsASCII = false; 169 break; 170 } 171 } 172 int failPos = -1; 173 src.setToStart(); 174 StringBuffer processOut = null; 175 // step 2 is performed only if the source contains non ASCII 176 if(!srcIsASCII){ 177 // step 2 178 processOut = namePrep.prepare(src, options); 179 }else{ 180 processOut = new StringBuffer(src.getText()); 181 } 182 int poLen = processOut.length(); 183 184 if(poLen==0){ 185 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL); 186 } 187 StringBuffer dest = new StringBuffer(); 188 189 // reset the variable to verify if output of prepare is ASCII or not 190 srcIsASCII = true; 191 192 // step 3 & 4 193 for(int j=0;j<poLen;j++ ){ 194 ch=processOut.charAt(j); 195 if(ch > 0x7F){ 196 srcIsASCII = false; 197 }else if(isLDHChar(ch)==false){ 198 // here we do not assemble surrogates 199 // since we know that LDH code points 200 // are in the ASCII range only 201 srcIsLDH = false; 202 failPos = j; 203 } 204 } 205 206 if(useSTD3ASCIIRules == true){ 207 // verify 3a and 3b 208 if( srcIsLDH == false /* source contains some non-LDH characters */ 209 || processOut.charAt(0) == HYPHEN 210 || processOut.charAt(processOut.length()-1) == HYPHEN){ 211 212 /* populate the parseError struct */ 213 if(srcIsLDH==false){ 214 throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules", 215 StringPrepParseException.STD3_ASCII_RULES_ERROR, 216 processOut.toString(), 217 (failPos>0) ? (failPos-1) : failPos); 218 }else if(processOut.charAt(0) == HYPHEN){ 219 throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", 220 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0); 221 222 }else{ 223 throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", 224 StringPrepParseException.STD3_ASCII_RULES_ERROR, 225 processOut.toString(), 226 (poLen>0) ? poLen-1 : poLen); 227 228 } 229 } 230 } 231 if(srcIsASCII){ 232 dest = processOut; 233 }else{ 234 // step 5 : verify the sequence does not begin with ACE prefix 235 if(!startsWithPrefix(processOut)){ 236 237 //step 6: encode the sequence with punycode 238 caseFlags = new boolean[poLen]; 239 240 StringBuilder punyout = Punycode.encode(processOut,caseFlags); 241 242 // convert all codepoints to lower case ASCII 243 StringBuffer lowerOut = toASCIILower(punyout); 244 245 //Step 7: prepend the ACE prefix 246 dest.append(ACE_PREFIX,0,ACE_PREFIX.length); 247 //Step 6: copy the contents in b2 into dest 248 dest.append(lowerOut); 249 }else{ 250 251 throw new StringPrepParseException("The input does not start with the ACE Prefix.", 252 StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0); 253 } 254 } 255 if(dest.length() > MAX_LABEL_LENGTH){ 256 throw new StringPrepParseException("The labels in the input are too long. Length > 63.", 257 StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0); 258 } 259 return dest; 260 } 261 convertIDNToASCII(String src,int options)262 public static StringBuffer convertIDNToASCII(String src,int options) 263 throws StringPrepParseException{ 264 265 char[] srcArr = src.toCharArray(); 266 StringBuffer result = new StringBuffer(); 267 int sepIndex=0; 268 int oldSepIndex=0; 269 for(;;){ 270 sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); 271 String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex); 272 //make sure this is not a root label separator. 273 if(!(label.length()==0 && sepIndex==srcArr.length)){ 274 UCharacterIterator iter = UCharacterIterator.getInstance(label); 275 result.append(convertToASCII(iter,options)); 276 } 277 if(sepIndex==srcArr.length){ 278 break; 279 } 280 281 // increment the sepIndex to skip past the separator 282 sepIndex++; 283 oldSepIndex = sepIndex; 284 result.append((char)FULL_STOP); 285 } 286 if(result.length() > MAX_DOMAIN_NAME_LENGTH){ 287 throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR); 288 } 289 return result; 290 } 291 convertToUnicode(UCharacterIterator src, int options)292 public static StringBuffer convertToUnicode(UCharacterIterator src, int options) 293 throws StringPrepParseException{ 294 295 boolean[] caseFlags = null; 296 297 // the source contains all ascii codepoints 298 boolean srcIsASCII = true; 299 // assume the source contains all LDH codepoints 300 //boolean srcIsLDH = true; 301 302 //get the options 303 //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0); 304 305 //int failPos = -1; 306 int ch; 307 int saveIndex = src.getIndex(); 308 // step 1: find out if all the codepoints in src are ASCII 309 while((ch=src.next())!= UCharacterIterator.DONE){ 310 if(ch>0x7F){ 311 srcIsASCII = false; 312 }/*else if((srcIsLDH = isLDHChar(ch))==false){ 313 failPos = src.getIndex(); 314 }*/ 315 } 316 StringBuffer processOut; 317 318 if(srcIsASCII == false){ 319 try { 320 // step 2: process the string 321 src.setIndex(saveIndex); 322 processOut = namePrep.prepare(src,options); 323 } catch (StringPrepParseException ex) { 324 return new StringBuffer(src.getText()); 325 } 326 327 }else{ 328 //just point to source 329 processOut = new StringBuffer(src.getText()); 330 } 331 // TODO: 332 // The RFC states that 333 // <quote> 334 // ToUnicode never fails. If any step fails, then the original input 335 // is returned immediately in that step. 336 // </quote> 337 338 //step 3: verify ACE Prefix 339 if(startsWithPrefix(processOut)){ 340 StringBuffer decodeOut = null; 341 342 //step 4: Remove the ACE Prefix 343 String temp = processOut.substring(ACE_PREFIX.length,processOut.length()); 344 345 //step 5: Decode using punycode 346 try { 347 decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags)); 348 } catch (StringPrepParseException e) { 349 decodeOut = null; 350 } 351 352 //step 6:Apply toASCII 353 if (decodeOut != null) { 354 StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options); 355 356 //step 7: verify 357 if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){ 358 // throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed", 359 // StringPrepParseException.VERIFICATION_ERROR); 360 decodeOut = null; 361 } 362 } 363 364 //step 8: return output of step 5 365 if (decodeOut != null) { 366 return decodeOut; 367 } 368 } 369 370 // }else{ 371 // // verify that STD3 ASCII rules are satisfied 372 // if(useSTD3ASCIIRules == true){ 373 // if( srcIsLDH == false /* source contains some non-LDH characters */ 374 // || processOut.charAt(0) == HYPHEN 375 // || processOut.charAt(processOut.length()-1) == HYPHEN){ 376 // 377 // if(srcIsLDH==false){ 378 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", 379 // StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(), 380 // (failPos>0) ? (failPos-1) : failPos); 381 // }else if(processOut.charAt(0) == HYPHEN){ 382 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", 383 // StringPrepParseException.STD3_ASCII_RULES_ERROR, 384 // processOut.toString(),0); 385 // 386 // }else{ 387 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", 388 // StringPrepParseException.STD3_ASCII_RULES_ERROR, 389 // processOut.toString(), 390 // processOut.length()); 391 // 392 // } 393 // } 394 // } 395 // // just return the source 396 // return new StringBuffer(src.getText()); 397 // } 398 399 return new StringBuffer(src.getText()); 400 } 401 convertIDNToUnicode(String src, int options)402 public static StringBuffer convertIDNToUnicode(String src, int options) 403 throws StringPrepParseException{ 404 405 char[] srcArr = src.toCharArray(); 406 StringBuffer result = new StringBuffer(); 407 int sepIndex=0; 408 int oldSepIndex=0; 409 for(;;){ 410 sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); 411 String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex); 412 if(label.length()==0 && sepIndex!=srcArr.length ){ 413 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL); 414 } 415 UCharacterIterator iter = UCharacterIterator.getInstance(label); 416 result.append(convertToUnicode(iter,options)); 417 if(sepIndex==srcArr.length){ 418 break; 419 } 420 // Unlike the ToASCII operation we don't normalize the label separators 421 result.append(srcArr[sepIndex]); 422 // increment the sepIndex to skip past the separator 423 sepIndex++; 424 oldSepIndex =sepIndex; 425 } 426 if(result.length() > MAX_DOMAIN_NAME_LENGTH){ 427 throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR); 428 } 429 return result; 430 } 431 compare(String s1, String s2, int options)432 public static int compare(String s1, String s2, int options) throws StringPrepParseException{ 433 StringBuffer s1Out = convertIDNToASCII(s1, options); 434 StringBuffer s2Out = convertIDNToASCII(s2, options); 435 return compareCaseInsensitiveASCII(s1Out,s2Out); 436 } 437 } 438