• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 2003-2014, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 package ohos.global.icu.impl;
11 
12 import ohos.global.icu.lang.UCharacter;
13 import ohos.global.icu.text.StringPrepParseException;
14 import ohos.global.icu.text.UTF16;
15 
16 /**
17  * Ported code from ICU punycode.c
18  * @author ram
19  * @hide exposed on OHOS
20  */
21 public final class Punycode {
22 
23     /* Punycode parameters for Bootstring */
24     private static final int BASE           = 36;
25     private static final int TMIN           = 1;
26     private static final int TMAX           = 26;
27     private static final int SKEW           = 38;
28     private static final int DAMP           = 700;
29     private static final int INITIAL_BIAS   = 72;
30     private static final int INITIAL_N      = 0x80;
31 
32     /* "Basic" Unicode/ASCII code points */
33     private static final char HYPHEN        = 0x2d;
34     private static final char DELIMITER     = HYPHEN;
35 
36     private static final int ZERO           = 0x30;
37     //private static final int NINE           = 0x39;
38 
39     private static final int SMALL_A        = 0x61;
40     private static final int SMALL_Z        = 0x7a;
41 
42     private static final int CAPITAL_A      = 0x41;
43     private static final int CAPITAL_Z      = 0x5a;
44 
adaptBias(int delta, int length, boolean firstTime)45     private static int adaptBias(int delta, int length, boolean firstTime){
46         if(firstTime){
47             delta /=DAMP;
48         }else{
49             delta /=  2;
50         }
51         delta += delta/length;
52 
53         int count=0;
54         for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
55             delta/=(BASE-TMIN);
56         }
57 
58         return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
59     }
60 
61     /**
62      * basicToDigit[] contains the numeric value of a basic code
63      * point (for use in representing integers) in the range 0 to
64      * BASE-1, or -1 if b is does not represent a value.
65      */
66     static final int[]    basicToDigit= new int[]{
67         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
68         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
69 
70         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
71         26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
72 
73         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
74         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
75 
76         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
77         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
78 
79         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
80         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
81 
82         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
83         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
84 
85         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
86         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
87 
88         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
89         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
90     };
91 
92     ///CLOVER:OFF
asciiCaseMap(char b, boolean uppercase)93     private static char asciiCaseMap(char b, boolean uppercase) {
94         if(uppercase) {
95             if(SMALL_A<=b && b<=SMALL_Z) {
96                 b-=(SMALL_A-CAPITAL_A);
97             }
98         } else {
99             if(CAPITAL_A<=b && b<=CAPITAL_Z) {
100                 b+=(SMALL_A-CAPITAL_A);
101             }
102         }
103         return b;
104     }
105     ///CLOVER:ON
106     /**
107      * digitToBasic() returns the basic code point whose value
108      * (when used for representing integers) is d, which must be in the
109      * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
110      * nonzero, in which case the uppercase form is used.
111      */
digitToBasic(int digit, boolean uppercase)112     private static char digitToBasic(int digit, boolean uppercase) {
113         /*  0..25 map to ASCII a..z or A..Z */
114         /* 26..35 map to ASCII 0..9         */
115         if(digit<26) {
116             if(uppercase) {
117                 return (char)(CAPITAL_A+digit);
118             } else {
119                 return (char)(SMALL_A+digit);
120             }
121         } else {
122             return (char)((ZERO-26)+digit);
123         }
124     }
125     /**
126      * Converts Unicode to Punycode.
127      * The input string must not contain single, unpaired surrogates.
128      * The output will be represented as an array of ASCII code points.
129      *
130      * @param src The source of the String Buffer passed.
131      * @param caseFlags The boolean array of case flags.
132      * @return An array of ASCII code points.
133      */
encode(CharSequence src, boolean[] caseFlags)134     public static StringBuilder encode(CharSequence src, boolean[] caseFlags) throws StringPrepParseException{
135         int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount;
136         char c, c2;
137         int srcLength = src.length();
138         int[] cpBuffer = new int[srcLength];
139         StringBuilder dest = new StringBuilder(srcLength);
140         /*
141          * Handle the basic code points and
142          * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
143          */
144         srcCPCount=0;
145 
146         for(j=0; j<srcLength; ++j) {
147             c=src.charAt(j);
148             if(isBasic(c)) {
149                 cpBuffer[srcCPCount++]=0;
150                 dest.append(caseFlags!=null ? asciiCaseMap(c, caseFlags[j]) : c);
151             } else {
152                 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
153                 if(!UTF16.isSurrogate(c)) {
154                     n|=c;
155                 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
156                     ++j;
157 
158                     n|=UCharacter.getCodePoint(c, c2);
159                 } else {
160                     /* error: unmatched surrogate */
161                     throw new StringPrepParseException("Illegal char found",StringPrepParseException.ILLEGAL_CHAR_FOUND);
162                 }
163                 cpBuffer[srcCPCount++]=n;
164             }
165         }
166 
167         /* Finish the basic string - if it is not empty - with a delimiter. */
168         basicLength=dest.length();
169         if(basicLength>0) {
170             dest.append(DELIMITER);
171         }
172 
173         /*
174          * handledCPCount is the number of code points that have been handled
175          * basicLength is the number of basic code points
176          * destLength is the number of chars that have been output
177          */
178 
179         /* Initialize the state: */
180         n=INITIAL_N;
181         delta=0;
182         bias=INITIAL_BIAS;
183 
184         /* Main encoding loop: */
185         for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
186             /*
187              * All non-basic code points < n have been handled already.
188              * Find the next larger one:
189              */
190             for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
191                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
192                 if(n<=q && q<m) {
193                     m=q;
194                 }
195             }
196 
197             /*
198              * Increase delta enough to advance the decoder's
199              * <n,i> state to <m,0>, but guard against overflow:
200              */
201             if(m-n>(0x7fffffff-delta)/(handledCPCount+1)) {
202                 throw new IllegalStateException("Internal program error");
203             }
204             delta+=(m-n)*(handledCPCount+1);
205             n=m;
206 
207             /* Encode a sequence of same code points n */
208             for(j=0; j<srcCPCount; ++j) {
209                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
210                 if(q<n) {
211                     ++delta;
212                 } else if(q==n) {
213                     /* Represent delta as a generalized variable-length integer: */
214                     for(q=delta, k=BASE; /* no condition */; k+=BASE) {
215 
216                         /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
217 
218                         t=k-bias;
219                         if(t<TMIN) {
220                             t=TMIN;
221                         } else if(t>TMAX) {
222                             t=TMAX;
223                         }
224                         */
225 
226                         t=k-bias;
227                         if(t<TMIN) {
228                             t=TMIN;
229                         } else if(k>=(bias+TMAX)) {
230                             t=TMAX;
231                         }
232 
233                         if(q<t) {
234                             break;
235                         }
236 
237                         dest.append(digitToBasic(t+(q-t)%(BASE-t), false));
238                         q=(q-t)/(BASE-t);
239                     }
240 
241                     dest.append(digitToBasic(q, (cpBuffer[j]<0)));
242                     bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
243                     delta=0;
244                     ++handledCPCount;
245                 }
246             }
247 
248             ++delta;
249             ++n;
250         }
251 
252         return dest;
253     }
254 
isBasic(int ch)255     private static boolean isBasic(int ch){
256         return (ch < INITIAL_N);
257     }
258     ///CLOVER:OFF
isBasicUpperCase(int ch)259     private static boolean isBasicUpperCase(int ch){
260         return( CAPITAL_A<=ch && ch >= CAPITAL_Z);
261     }
262     ///CLOVER:ON
isSurrogate(int ch)263     private static boolean isSurrogate(int ch){
264         return (((ch)&0xfffff800)==0xd800);
265     }
266     /**
267      * Converts Punycode to Unicode.
268      * The Unicode string will be at most as long as the Punycode string.
269      *
270      * @param src The source of the string buffer being passed.
271      * @param caseFlags The array of boolean case flags.
272      * @return StringBuilder string.
273      */
decode(CharSequence src, boolean[] caseFlags)274     public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
275                                throws StringPrepParseException{
276         int srcLength = src.length();
277         StringBuilder dest = new StringBuilder(src.length());
278         int n, i, bias, basicLength, j, in, oldi, w, k, digit, t,
279                 destCPCount, firstSupplementaryIndex, cpLength;
280         char b;
281 
282         /*
283          * Handle the basic code points:
284          * Let basicLength be the number of input code points
285          * before the last delimiter, or 0 if there is none,
286          * then copy the first basicLength code points to the output.
287          *
288          * The following loop iterates backward.
289          */
290         for(j=srcLength; j>0;) {
291             if(src.charAt(--j)==DELIMITER) {
292                 break;
293             }
294         }
295         basicLength=destCPCount=j;
296 
297         for(j=0; j<basicLength; ++j) {
298             b=src.charAt(j);
299             if(!isBasic(b)) {
300                 throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND);
301             }
302             dest.append(b);
303 
304             if(caseFlags!=null && j<caseFlags.length) {
305                 caseFlags[j]=isBasicUpperCase(b);
306             }
307         }
308 
309         /* Initialize the state: */
310         n=INITIAL_N;
311         i=0;
312         bias=INITIAL_BIAS;
313         firstSupplementaryIndex=1000000000;
314 
315         /*
316          * Main decoding loop:
317          * Start just after the last delimiter if any
318          * basic code points were copied; start at the beginning otherwise.
319          */
320         for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
321             /*
322              * in is the index of the next character to be consumed, and
323              * destCPCount is the number of code points in the output array.
324              *
325              * Decode a generalized variable-length integer into delta,
326              * which gets added to i.  The overflow checking is easier
327              * if we increase i as we go, then subtract off its starting
328              * value at the end to obtain delta.
329              */
330             for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
331                 if(in>=srcLength) {
332                     throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
333                 }
334 
335                 digit=basicToDigit[src.charAt(in++) & 0xFF];
336                 if(digit<0) {
337                     throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
338                 }
339                 if(digit>(0x7fffffff-i)/w) {
340                     /* integer overflow */
341                     throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
342                 }
343 
344                 i+=digit*w;
345                 t=k-bias;
346                 if(t<TMIN) {
347                     t=TMIN;
348                 } else if(k>=(bias+TMAX)) {
349                     t=TMAX;
350                 }
351                 if(digit<t) {
352                     break;
353                 }
354 
355                 if(w>0x7fffffff/(BASE-t)) {
356                     /* integer overflow */
357                     throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
358                 }
359                 w*=BASE-t;
360             }
361 
362             /*
363              * Modification from sample code:
364              * Increments destCPCount here,
365              * where needed instead of in for() loop tail.
366              */
367             ++destCPCount;
368             bias=adaptBias(i-oldi, destCPCount, (oldi==0));
369 
370             /*
371              * i was supposed to wrap around from (incremented) destCPCount to 0,
372              * incrementing n each time, so we'll fix that now:
373              */
374             if(i/destCPCount>(0x7fffffff-n)) {
375                 /* integer overflow */
376                 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
377             }
378 
379             n+=i/destCPCount;
380             i%=destCPCount;
381             /* not needed for Punycode: */
382             /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
383 
384             if(n>0x10ffff || isSurrogate(n)) {
385                 /* Unicode code point overflow */
386                 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
387             }
388 
389             /* Insert n at position i of the output: */
390             cpLength=Character.charCount(n);
391             int codeUnitIndex;
392 
393             /*
394              * Handle indexes when supplementary code points are present.
395              *
396              * In almost all cases, there will be only BMP code points before i
397              * and even in the entire string.
398              * This is handled with the same efficiency as with UTF-32.
399              *
400              * Only the rare cases with supplementary code points are handled
401              * more slowly - but not too bad since this is an insertion anyway.
402              */
403             if(i<=firstSupplementaryIndex) {
404                 codeUnitIndex=i;
405                 if(cpLength>1) {
406                     firstSupplementaryIndex=codeUnitIndex;
407                 } else {
408                     ++firstSupplementaryIndex;
409                 }
410             } else {
411                 codeUnitIndex=dest.offsetByCodePoints(firstSupplementaryIndex, i-firstSupplementaryIndex);
412             }
413 
414             /* use the UChar index codeUnitIndex instead of the code point index i */
415             if(caseFlags!=null && (dest.length()+cpLength)<=caseFlags.length) {
416                 if(codeUnitIndex<dest.length()) {
417                     System.arraycopy(caseFlags, codeUnitIndex,
418                                      caseFlags, codeUnitIndex+cpLength,
419                                      dest.length()-codeUnitIndex);
420                 }
421                 /* Case of last character determines uppercase flag: */
422                 caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
423                 if(cpLength==2) {
424                     caseFlags[codeUnitIndex+1]=false;
425                 }
426             }
427             if(cpLength==1) {
428                 /* BMP, insert one code unit */
429                 dest.insert(codeUnitIndex, (char)n);
430             } else {
431                 /* supplementary character, insert two code units */
432                 dest.insert(codeUnitIndex, UTF16.getLeadSurrogate(n));
433                 dest.insert(codeUnitIndex+1, UTF16.getTrailSurrogate(n));
434             }
435             ++i;
436         }
437         return dest;
438     }
439 }
440