1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2003-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import com.ibm.icu.lang.UCharacter; 12 import com.ibm.icu.text.StringPrepParseException; 13 import com.ibm.icu.text.UTF16; 14 import com.ibm.icu.util.ICUInputTooLongException; 15 16 /** 17 * Ported code from ICU punycode.c 18 * @author ram 19 */ 20 public final class Punycode { 21 22 /* Punycode parameters for Bootstring */ 23 private static final int BASE = 36; 24 private static final int TMIN = 1; 25 private static final int TMAX = 26; 26 private static final int SKEW = 38; 27 private static final int DAMP = 700; 28 private static final int INITIAL_BIAS = 72; 29 private static final int INITIAL_N = 0x80; 30 31 /* "Basic" Unicode/ASCII code points */ 32 private static final char HYPHEN = 0x2d; 33 private static final char DELIMITER = HYPHEN; 34 35 private static final int ZERO = 0x30; 36 //private static final int NINE = 0x39; 37 38 private static final int SMALL_A = 0x61; 39 private static final int SMALL_Z = 0x7a; 40 41 private static final int CAPITAL_A = 0x41; 42 private static final int CAPITAL_Z = 0x5a; 43 adaptBias(int delta, int length, boolean firstTime)44 private static int adaptBias(int delta, int length, boolean firstTime){ 45 if(firstTime){ 46 delta /=DAMP; 47 }else{ 48 delta /= 2; 49 } 50 delta += delta/length; 51 52 int count=0; 53 for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { 54 delta/=(BASE-TMIN); 55 } 56 57 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); 58 } 59 60 /** 61 * @return the numeric value of a basic code point (for use in representing integers) 62 * in the range 0 to BASE-1, or a negative value if cp is invalid. 63 */ decodeDigit(int cp)64 private static final int decodeDigit(int cp) { 65 if(cp<='Z') { 66 if(cp<='9') { 67 if(cp<'0') { 68 return -1; 69 } else { 70 return cp-'0'+26; // 0..9 -> 26..35 71 } 72 } else { 73 return cp-'A'; // A-Z -> 0..25 74 } 75 } else if(cp<='z') { 76 return cp-'a'; // a..z -> 0..25 77 } else { 78 return -1; 79 } 80 } 81 82 ///CLOVER:OFF asciiCaseMap(char b, boolean uppercase)83 private static char asciiCaseMap(char b, boolean uppercase) { 84 if(uppercase) { 85 if(SMALL_A<=b && b<=SMALL_Z) { 86 b-=(SMALL_A-CAPITAL_A); 87 } 88 } else { 89 if(CAPITAL_A<=b && b<=CAPITAL_Z) { 90 b+=(SMALL_A-CAPITAL_A); 91 } 92 } 93 return b; 94 } 95 ///CLOVER:ON 96 /** 97 * digitToBasic() returns the basic code point whose value 98 * (when used for representing integers) is d, which must be in the 99 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is 100 * nonzero, in which case the uppercase form is used. 101 */ digitToBasic(int digit, boolean uppercase)102 private static char digitToBasic(int digit, boolean uppercase) { 103 /* 0..25 map to ASCII a..z or A..Z */ 104 /* 26..35 map to ASCII 0..9 */ 105 if(digit<26) { 106 if(uppercase) { 107 return (char)(CAPITAL_A+digit); 108 } else { 109 return (char)(SMALL_A+digit); 110 } 111 } else { 112 return (char)((ZERO-26)+digit); 113 } 114 } 115 116 // ICU-13727: Limit input length for n^2 algorithm 117 // where well-formed strings are at most 59 characters long. 118 private static final int ENCODE_MAX_CODE_UNITS = 1000; 119 private static final int DECODE_MAX_CHARS = 2000; 120 121 /** 122 * Converts Unicode to Punycode. 123 * The input string must not contain single, unpaired surrogates. 124 * The output will be represented as an array of ASCII code points. 125 * 126 * @param src The source of the String Buffer passed. 127 * @param caseFlags The boolean array of case flags. 128 * @return An array of ASCII code points. 129 */ encode(CharSequence src, boolean[] caseFlags)130 public static StringBuilder encode(CharSequence src, boolean[] caseFlags) throws StringPrepParseException{ 131 int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount; 132 char c, c2; 133 int srcLength = src.length(); 134 if (srcLength > ENCODE_MAX_CODE_UNITS) { 135 throw new ICUInputTooLongException( 136 "input too long: " + srcLength + " UTF-16 code units"); 137 } 138 int[] cpBuffer = new int[srcLength]; 139 StringBuilder dest = new StringBuilder(srcLength); 140 /* 141 * Handle the basic code points and 142 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): 143 */ 144 srcCPCount=0; 145 146 for(j=0; j<srcLength; ++j) { 147 c=src.charAt(j); 148 if(isBasic(c)) { 149 cpBuffer[srcCPCount++]=0; 150 dest.append(caseFlags!=null ? asciiCaseMap(c, caseFlags[j]) : c); 151 } else { 152 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L; 153 if(!UTF16.isSurrogate(c)) { 154 n|=c; 155 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) { 156 ++j; 157 158 n|=UCharacter.getCodePoint(c, c2); 159 } else { 160 /* error: unmatched surrogate */ 161 throw new StringPrepParseException("Illegal char found",StringPrepParseException.ILLEGAL_CHAR_FOUND); 162 } 163 cpBuffer[srcCPCount++]=n; 164 } 165 } 166 167 /* Finish the basic string - if it is not empty - with a delimiter. */ 168 basicLength=dest.length(); 169 if(basicLength>0) { 170 dest.append(DELIMITER); 171 } 172 173 /* 174 * handledCPCount is the number of code points that have been handled 175 * basicLength is the number of basic code points 176 * destLength is the number of chars that have been output 177 */ 178 179 /* Initialize the state: */ 180 n=INITIAL_N; 181 delta=0; 182 bias=INITIAL_BIAS; 183 184 /* Main encoding loop: */ 185 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) { 186 /* 187 * All non-basic code points < n have been handled already. 188 * Find the next larger one: 189 */ 190 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) { 191 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 192 if(n<=q && q<m) { 193 m=q; 194 } 195 } 196 197 /* 198 * Increase delta enough to advance the decoder's 199 * <n,i> state to <m,0>, but guard against overflow: 200 */ 201 if(m-n>(0x7fffffff-handledCPCount-delta)/(handledCPCount+1)) { 202 throw new IllegalStateException("Internal program error"); 203 } 204 delta+=(m-n)*(handledCPCount+1); 205 n=m; 206 207 /* Encode a sequence of same code points n */ 208 for(j=0; j<srcCPCount; ++j) { 209 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 210 if(q<n) { 211 ++delta; 212 } else if(q==n) { 213 /* Represent delta as a generalized variable-length integer: */ 214 for(q=delta, k=BASE; /* no condition */; k+=BASE) { 215 216 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt 217 218 t=k-bias; 219 if(t<TMIN) { 220 t=TMIN; 221 } else if(t>TMAX) { 222 t=TMAX; 223 } 224 */ 225 226 t=k-bias; 227 if(t<TMIN) { 228 t=TMIN; 229 } else if(k>=(bias+TMAX)) { 230 t=TMAX; 231 } 232 233 if(q<t) { 234 break; 235 } 236 237 dest.append(digitToBasic(t+(q-t)%(BASE-t), false)); 238 q=(q-t)/(BASE-t); 239 } 240 241 dest.append(digitToBasic(q, (cpBuffer[j]<0))); 242 bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength)); 243 delta=0; 244 ++handledCPCount; 245 } 246 } 247 248 ++delta; 249 ++n; 250 } 251 252 return dest; 253 } 254 isBasic(int ch)255 private static boolean isBasic(int ch){ 256 return (ch < INITIAL_N); 257 } 258 ///CLOVER:OFF isBasicUpperCase(int ch)259 private static boolean isBasicUpperCase(int ch){ 260 return( CAPITAL_A<=ch && ch >= CAPITAL_Z); 261 } 262 ///CLOVER:ON isSurrogate(int ch)263 private static boolean isSurrogate(int ch){ 264 return (((ch)&0xfffff800)==0xd800); 265 } 266 /** 267 * Converts Punycode to Unicode. 268 * The Unicode string will be at most as long as the Punycode string. 269 * 270 * @param src The source of the string buffer being passed. 271 * @param caseFlags The array of boolean case flags. 272 * @return StringBuilder string. 273 */ decode(CharSequence src, boolean[] caseFlags)274 public static StringBuilder decode(CharSequence src, boolean[] caseFlags) 275 throws StringPrepParseException{ 276 int srcLength = src.length(); 277 if (srcLength > DECODE_MAX_CHARS) { 278 throw new ICUInputTooLongException("input too long: " + srcLength + " characters"); 279 } 280 StringBuilder dest = new StringBuilder(src.length()); 281 int n, i, bias, basicLength, j, in, oldi, w, k, digit, t, 282 destCPCount, firstSupplementaryIndex, cpLength; 283 char b; 284 285 /* 286 * Handle the basic code points: 287 * Let basicLength be the number of input code points 288 * before the last delimiter, or 0 if there is none, 289 * then copy the first basicLength code points to the output. 290 * 291 * The following loop iterates backward. 292 */ 293 for(j=srcLength; j>0;) { 294 if(src.charAt(--j)==DELIMITER) { 295 break; 296 } 297 } 298 basicLength=destCPCount=j; 299 300 for(j=0; j<basicLength; ++j) { 301 b=src.charAt(j); 302 if(!isBasic(b)) { 303 throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND); 304 } 305 dest.append(b); 306 307 if(caseFlags!=null && j<caseFlags.length) { 308 caseFlags[j]=isBasicUpperCase(b); 309 } 310 } 311 312 /* Initialize the state: */ 313 n=INITIAL_N; 314 i=0; 315 bias=INITIAL_BIAS; 316 firstSupplementaryIndex=1000000000; 317 318 /* 319 * Main decoding loop: 320 * Start just after the last delimiter if any 321 * basic code points were copied; start at the beginning otherwise. 322 */ 323 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) { 324 /* 325 * in is the index of the next character to be consumed, and 326 * destCPCount is the number of code points in the output array. 327 * 328 * Decode a generalized variable-length integer into delta, 329 * which gets added to i. The overflow checking is easier 330 * if we increase i as we go, then subtract off its starting 331 * value at the end to obtain delta. 332 */ 333 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) { 334 if(in>=srcLength) { 335 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 336 } 337 338 digit=decodeDigit(src.charAt(in++)); 339 if(digit<0) { 340 throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND); 341 } 342 if(digit>(0x7fffffff-i)/w) { 343 /* integer overflow */ 344 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 345 } 346 347 i+=digit*w; 348 t=k-bias; 349 if(t<TMIN) { 350 t=TMIN; 351 } else if(k>=(bias+TMAX)) { 352 t=TMAX; 353 } 354 if(digit<t) { 355 break; 356 } 357 358 if(w>0x7fffffff/(BASE-t)) { 359 /* integer overflow */ 360 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 361 } 362 w*=BASE-t; 363 } 364 365 /* 366 * Modification from sample code: 367 * Increments destCPCount here, 368 * where needed instead of in for() loop tail. 369 */ 370 ++destCPCount; 371 bias=adaptBias(i-oldi, destCPCount, (oldi==0)); 372 373 /* 374 * i was supposed to wrap around from (incremented) destCPCount to 0, 375 * incrementing n each time, so we'll fix that now: 376 */ 377 if(i/destCPCount>(0x7fffffff-n)) { 378 /* integer overflow */ 379 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 380 } 381 382 n+=i/destCPCount; 383 i%=destCPCount; 384 /* not needed for Punycode: */ 385 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ 386 387 if(n>0x10ffff || isSurrogate(n)) { 388 /* Unicode code point overflow */ 389 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 390 } 391 392 /* Insert n at position i of the output: */ 393 cpLength=Character.charCount(n); 394 int codeUnitIndex; 395 396 /* 397 * Handle indexes when supplementary code points are present. 398 * 399 * In almost all cases, there will be only BMP code points before i 400 * and even in the entire string. 401 * This is handled with the same efficiency as with UTF-32. 402 * 403 * Only the rare cases with supplementary code points are handled 404 * more slowly - but not too bad since this is an insertion anyway. 405 */ 406 if(i<=firstSupplementaryIndex) { 407 codeUnitIndex=i; 408 if(cpLength>1) { 409 firstSupplementaryIndex=codeUnitIndex; 410 } else { 411 ++firstSupplementaryIndex; 412 } 413 } else { 414 codeUnitIndex=dest.offsetByCodePoints(firstSupplementaryIndex, i-firstSupplementaryIndex); 415 } 416 417 /* use the UChar index codeUnitIndex instead of the code point index i */ 418 if(caseFlags!=null && (dest.length()+cpLength)<=caseFlags.length) { 419 if(codeUnitIndex<dest.length()) { 420 System.arraycopy(caseFlags, codeUnitIndex, 421 caseFlags, codeUnitIndex+cpLength, 422 dest.length()-codeUnitIndex); 423 } 424 /* Case of last character determines uppercase flag: */ 425 caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1)); 426 if(cpLength==2) { 427 caseFlags[codeUnitIndex+1]=false; 428 } 429 } 430 if(cpLength==1) { 431 /* BMP, insert one code unit */ 432 dest.insert(codeUnitIndex, (char)n); 433 } else { 434 /* supplementary character, insert two code units */ 435 dest.insert(codeUnitIndex, UTF16.getLeadSurrogate(n)); 436 dest.insert(codeUnitIndex+1, UTF16.getTrailSurrogate(n)); 437 } 438 ++i; 439 } 440 return dest; 441 } 442 } 443