1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2003-2014, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.impl; 11 12 import ohos.global.icu.lang.UCharacter; 13 import ohos.global.icu.text.StringPrepParseException; 14 import ohos.global.icu.text.UTF16; 15 16 /** 17 * Ported code from ICU punycode.c 18 * @author ram 19 * @hide exposed on OHOS 20 */ 21 public final class Punycode { 22 23 /* Punycode parameters for Bootstring */ 24 private static final int BASE = 36; 25 private static final int TMIN = 1; 26 private static final int TMAX = 26; 27 private static final int SKEW = 38; 28 private static final int DAMP = 700; 29 private static final int INITIAL_BIAS = 72; 30 private static final int INITIAL_N = 0x80; 31 32 /* "Basic" Unicode/ASCII code points */ 33 private static final char HYPHEN = 0x2d; 34 private static final char DELIMITER = HYPHEN; 35 36 private static final int ZERO = 0x30; 37 //private static final int NINE = 0x39; 38 39 private static final int SMALL_A = 0x61; 40 private static final int SMALL_Z = 0x7a; 41 42 private static final int CAPITAL_A = 0x41; 43 private static final int CAPITAL_Z = 0x5a; 44 adaptBias(int delta, int length, boolean firstTime)45 private static int adaptBias(int delta, int length, boolean firstTime){ 46 if(firstTime){ 47 delta /=DAMP; 48 }else{ 49 delta /= 2; 50 } 51 delta += delta/length; 52 53 int count=0; 54 for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { 55 delta/=(BASE-TMIN); 56 } 57 58 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); 59 } 60 61 /** 62 * basicToDigit[] contains the numeric value of a basic code 63 * point (for use in representing integers) in the range 0 to 64 * BASE-1, or -1 if b is does not represent a value. 65 */ 66 static final int[] basicToDigit= new int[]{ 67 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 68 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 69 70 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 71 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, 72 73 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 74 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 75 76 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 77 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 78 79 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 80 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 81 82 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 83 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 84 85 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 86 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 87 88 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 89 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 90 }; 91 92 ///CLOVER:OFF asciiCaseMap(char b, boolean uppercase)93 private static char asciiCaseMap(char b, boolean uppercase) { 94 if(uppercase) { 95 if(SMALL_A<=b && b<=SMALL_Z) { 96 b-=(SMALL_A-CAPITAL_A); 97 } 98 } else { 99 if(CAPITAL_A<=b && b<=CAPITAL_Z) { 100 b+=(SMALL_A-CAPITAL_A); 101 } 102 } 103 return b; 104 } 105 ///CLOVER:ON 106 /** 107 * digitToBasic() returns the basic code point whose value 108 * (when used for representing integers) is d, which must be in the 109 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is 110 * nonzero, in which case the uppercase form is used. 111 */ digitToBasic(int digit, boolean uppercase)112 private static char digitToBasic(int digit, boolean uppercase) { 113 /* 0..25 map to ASCII a..z or A..Z */ 114 /* 26..35 map to ASCII 0..9 */ 115 if(digit<26) { 116 if(uppercase) { 117 return (char)(CAPITAL_A+digit); 118 } else { 119 return (char)(SMALL_A+digit); 120 } 121 } else { 122 return (char)((ZERO-26)+digit); 123 } 124 } 125 /** 126 * Converts Unicode to Punycode. 127 * The input string must not contain single, unpaired surrogates. 128 * The output will be represented as an array of ASCII code points. 129 * 130 * @param src The source of the String Buffer passed. 131 * @param caseFlags The boolean array of case flags. 132 * @return An array of ASCII code points. 133 */ encode(CharSequence src, boolean[] caseFlags)134 public static StringBuilder encode(CharSequence src, boolean[] caseFlags) throws StringPrepParseException{ 135 int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount; 136 char c, c2; 137 int srcLength = src.length(); 138 int[] cpBuffer = new int[srcLength]; 139 StringBuilder dest = new StringBuilder(srcLength); 140 /* 141 * Handle the basic code points and 142 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): 143 */ 144 srcCPCount=0; 145 146 for(j=0; j<srcLength; ++j) { 147 c=src.charAt(j); 148 if(isBasic(c)) { 149 cpBuffer[srcCPCount++]=0; 150 dest.append(caseFlags!=null ? asciiCaseMap(c, caseFlags[j]) : c); 151 } else { 152 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L; 153 if(!UTF16.isSurrogate(c)) { 154 n|=c; 155 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) { 156 ++j; 157 158 n|=UCharacter.getCodePoint(c, c2); 159 } else { 160 /* error: unmatched surrogate */ 161 throw new StringPrepParseException("Illegal char found",StringPrepParseException.ILLEGAL_CHAR_FOUND); 162 } 163 cpBuffer[srcCPCount++]=n; 164 } 165 } 166 167 /* Finish the basic string - if it is not empty - with a delimiter. */ 168 basicLength=dest.length(); 169 if(basicLength>0) { 170 dest.append(DELIMITER); 171 } 172 173 /* 174 * handledCPCount is the number of code points that have been handled 175 * basicLength is the number of basic code points 176 * destLength is the number of chars that have been output 177 */ 178 179 /* Initialize the state: */ 180 n=INITIAL_N; 181 delta=0; 182 bias=INITIAL_BIAS; 183 184 /* Main encoding loop: */ 185 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) { 186 /* 187 * All non-basic code points < n have been handled already. 188 * Find the next larger one: 189 */ 190 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) { 191 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 192 if(n<=q && q<m) { 193 m=q; 194 } 195 } 196 197 /* 198 * Increase delta enough to advance the decoder's 199 * <n,i> state to <m,0>, but guard against overflow: 200 */ 201 if(m-n>(0x7fffffff-delta)/(handledCPCount+1)) { 202 throw new IllegalStateException("Internal program error"); 203 } 204 delta+=(m-n)*(handledCPCount+1); 205 n=m; 206 207 /* Encode a sequence of same code points n */ 208 for(j=0; j<srcCPCount; ++j) { 209 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 210 if(q<n) { 211 ++delta; 212 } else if(q==n) { 213 /* Represent delta as a generalized variable-length integer: */ 214 for(q=delta, k=BASE; /* no condition */; k+=BASE) { 215 216 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt 217 218 t=k-bias; 219 if(t<TMIN) { 220 t=TMIN; 221 } else if(t>TMAX) { 222 t=TMAX; 223 } 224 */ 225 226 t=k-bias; 227 if(t<TMIN) { 228 t=TMIN; 229 } else if(k>=(bias+TMAX)) { 230 t=TMAX; 231 } 232 233 if(q<t) { 234 break; 235 } 236 237 dest.append(digitToBasic(t+(q-t)%(BASE-t), false)); 238 q=(q-t)/(BASE-t); 239 } 240 241 dest.append(digitToBasic(q, (cpBuffer[j]<0))); 242 bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength)); 243 delta=0; 244 ++handledCPCount; 245 } 246 } 247 248 ++delta; 249 ++n; 250 } 251 252 return dest; 253 } 254 isBasic(int ch)255 private static boolean isBasic(int ch){ 256 return (ch < INITIAL_N); 257 } 258 ///CLOVER:OFF isBasicUpperCase(int ch)259 private static boolean isBasicUpperCase(int ch){ 260 return( CAPITAL_A<=ch && ch >= CAPITAL_Z); 261 } 262 ///CLOVER:ON isSurrogate(int ch)263 private static boolean isSurrogate(int ch){ 264 return (((ch)&0xfffff800)==0xd800); 265 } 266 /** 267 * Converts Punycode to Unicode. 268 * The Unicode string will be at most as long as the Punycode string. 269 * 270 * @param src The source of the string buffer being passed. 271 * @param caseFlags The array of boolean case flags. 272 * @return StringBuilder string. 273 */ decode(CharSequence src, boolean[] caseFlags)274 public static StringBuilder decode(CharSequence src, boolean[] caseFlags) 275 throws StringPrepParseException{ 276 int srcLength = src.length(); 277 StringBuilder dest = new StringBuilder(src.length()); 278 int n, i, bias, basicLength, j, in, oldi, w, k, digit, t, 279 destCPCount, firstSupplementaryIndex, cpLength; 280 char b; 281 282 /* 283 * Handle the basic code points: 284 * Let basicLength be the number of input code points 285 * before the last delimiter, or 0 if there is none, 286 * then copy the first basicLength code points to the output. 287 * 288 * The following loop iterates backward. 289 */ 290 for(j=srcLength; j>0;) { 291 if(src.charAt(--j)==DELIMITER) { 292 break; 293 } 294 } 295 basicLength=destCPCount=j; 296 297 for(j=0; j<basicLength; ++j) { 298 b=src.charAt(j); 299 if(!isBasic(b)) { 300 throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND); 301 } 302 dest.append(b); 303 304 if(caseFlags!=null && j<caseFlags.length) { 305 caseFlags[j]=isBasicUpperCase(b); 306 } 307 } 308 309 /* Initialize the state: */ 310 n=INITIAL_N; 311 i=0; 312 bias=INITIAL_BIAS; 313 firstSupplementaryIndex=1000000000; 314 315 /* 316 * Main decoding loop: 317 * Start just after the last delimiter if any 318 * basic code points were copied; start at the beginning otherwise. 319 */ 320 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) { 321 /* 322 * in is the index of the next character to be consumed, and 323 * destCPCount is the number of code points in the output array. 324 * 325 * Decode a generalized variable-length integer into delta, 326 * which gets added to i. The overflow checking is easier 327 * if we increase i as we go, then subtract off its starting 328 * value at the end to obtain delta. 329 */ 330 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) { 331 if(in>=srcLength) { 332 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 333 } 334 335 digit=basicToDigit[src.charAt(in++) & 0xFF]; 336 if(digit<0) { 337 throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND); 338 } 339 if(digit>(0x7fffffff-i)/w) { 340 /* integer overflow */ 341 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 342 } 343 344 i+=digit*w; 345 t=k-bias; 346 if(t<TMIN) { 347 t=TMIN; 348 } else if(k>=(bias+TMAX)) { 349 t=TMAX; 350 } 351 if(digit<t) { 352 break; 353 } 354 355 if(w>0x7fffffff/(BASE-t)) { 356 /* integer overflow */ 357 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 358 } 359 w*=BASE-t; 360 } 361 362 /* 363 * Modification from sample code: 364 * Increments destCPCount here, 365 * where needed instead of in for() loop tail. 366 */ 367 ++destCPCount; 368 bias=adaptBias(i-oldi, destCPCount, (oldi==0)); 369 370 /* 371 * i was supposed to wrap around from (incremented) destCPCount to 0, 372 * incrementing n each time, so we'll fix that now: 373 */ 374 if(i/destCPCount>(0x7fffffff-n)) { 375 /* integer overflow */ 376 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 377 } 378 379 n+=i/destCPCount; 380 i%=destCPCount; 381 /* not needed for Punycode: */ 382 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ 383 384 if(n>0x10ffff || isSurrogate(n)) { 385 /* Unicode code point overflow */ 386 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); 387 } 388 389 /* Insert n at position i of the output: */ 390 cpLength=Character.charCount(n); 391 int codeUnitIndex; 392 393 /* 394 * Handle indexes when supplementary code points are present. 395 * 396 * In almost all cases, there will be only BMP code points before i 397 * and even in the entire string. 398 * This is handled with the same efficiency as with UTF-32. 399 * 400 * Only the rare cases with supplementary code points are handled 401 * more slowly - but not too bad since this is an insertion anyway. 402 */ 403 if(i<=firstSupplementaryIndex) { 404 codeUnitIndex=i; 405 if(cpLength>1) { 406 firstSupplementaryIndex=codeUnitIndex; 407 } else { 408 ++firstSupplementaryIndex; 409 } 410 } else { 411 codeUnitIndex=dest.offsetByCodePoints(firstSupplementaryIndex, i-firstSupplementaryIndex); 412 } 413 414 /* use the UChar index codeUnitIndex instead of the code point index i */ 415 if(caseFlags!=null && (dest.length()+cpLength)<=caseFlags.length) { 416 if(codeUnitIndex<dest.length()) { 417 System.arraycopy(caseFlags, codeUnitIndex, 418 caseFlags, codeUnitIndex+cpLength, 419 dest.length()-codeUnitIndex); 420 } 421 /* Case of last character determines uppercase flag: */ 422 caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1)); 423 if(cpLength==2) { 424 caseFlags[codeUnitIndex+1]=false; 425 } 426 } 427 if(cpLength==1) { 428 /* BMP, insert one code unit */ 429 dest.insert(codeUnitIndex, (char)n); 430 } else { 431 /* supplementary character, insert two code units */ 432 dest.insert(codeUnitIndex, UTF16.getLeadSurrogate(n)); 433 dest.insert(codeUnitIndex+1, UTF16.getTrailSurrogate(n)); 434 } 435 ++i; 436 } 437 return dest; 438 } 439 } 440