1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: punycode.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002jan31
14 * created by: Markus W. Scherer
15 */
16
17
18 /* This ICU code derived from: */
19 /*
20 punycode.c 0.4.0 (2001-Nov-17-Sat)
21 http://www.cs.berkeley.edu/~amc/idn/
22 Adam M. Costello
23 http://www.nicemice.net/amc/
24
25 Disclaimer and license
26
27 Regarding this entire document or any portion of it (including
28 the pseudocode and C code), the author makes no guarantees and
29 is not responsible for any damage resulting from its use. The
30 author grants irrevocable permission to anyone to use, modify,
31 and distribute it in any way that does not diminish the rights
32 of anyone else to use, modify, and distribute it, provided that
33 redistributed derivative works do not contain misleading author or
34 version information. Derivative works need not be licensed under
35 similar terms.
36 */
37 /*
38 * ICU modifications:
39 * - ICU data types and coding conventions
40 * - ICU string buffer handling with implicit source lengths
41 * and destination preflighting
42 * - UTF-16 handling
43 */
44
45 #include "unicode/utypes.h"
46
47 #if !UCONFIG_NO_IDNA
48
49 #include "ustr_imp.h"
50 #include "cstring.h"
51 #include "cmemory.h"
52 #include "punycode.h"
53 #include "unicode/ustring.h"
54
55
56 /* Punycode ----------------------------------------------------------------- */
57
58 /* Punycode parameters for Bootstring */
59 #define BASE 36
60 #define TMIN 1
61 #define TMAX 26
62 #define SKEW 38
63 #define DAMP 700
64 #define INITIAL_BIAS 72
65 #define INITIAL_N 0x80
66
67 /* "Basic" Unicode/ASCII code points */
68 #define _HYPHEN 0X2d
69 #define DELIMITER _HYPHEN
70
71 #define _ZERO_ 0X30
72 #define _NINE 0x39
73
74 #define _SMALL_A 0X61
75 #define _SMALL_Z 0X7a
76
77 #define _CAPITAL_A 0X41
78 #define _CAPITAL_Z 0X5a
79
80 #define IS_BASIC(c) ((c)<0x80)
81 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
82
83 /**
84 * digitToBasic() returns the basic code point whose value
85 * (when used for representing integers) is d, which must be in the
86 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
87 * nonzero, in which case the uppercase form is used.
88 */
89 static U_INLINE char
digitToBasic(int32_t digit,UBool uppercase)90 digitToBasic(int32_t digit, UBool uppercase) {
91 /* 0..25 map to ASCII a..z or A..Z */
92 /* 26..35 map to ASCII 0..9 */
93 if(digit<26) {
94 if(uppercase) {
95 return (char)(_CAPITAL_A+digit);
96 } else {
97 return (char)(_SMALL_A+digit);
98 }
99 } else {
100 return (char)((_ZERO_-26)+digit);
101 }
102 }
103
104 /**
105 * basicToDigit[] contains the numeric value of a basic code
106 * point (for use in representing integers) in the range 0 to
107 * BASE-1, or -1 if b is does not represent a value.
108 */
109 static const int8_t
110 basicToDigit[256]={
111 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
112 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
113
114 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
115 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
116
117 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
118 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
119
120 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
121 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
122
123 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
124 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
125
126 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
127 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
128
129 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
130 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
131
132 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
133 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
134 };
135
136 static U_INLINE char
asciiCaseMap(char b,UBool uppercase)137 asciiCaseMap(char b, UBool uppercase) {
138 if(uppercase) {
139 if(_SMALL_A<=b && b<=_SMALL_Z) {
140 b-=(_SMALL_A-_CAPITAL_A);
141 }
142 } else {
143 if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
144 b+=(_SMALL_A-_CAPITAL_A);
145 }
146 }
147 return b;
148 }
149
150 /* Punycode-specific Bootstring code ---------------------------------------- */
151
152 /*
153 * The following code omits the {parts} of the pseudo-algorithm in the spec
154 * that are not used with the Punycode parameter set.
155 */
156
157 /* Bias adaptation function. */
158 static int32_t
adaptBias(int32_t delta,int32_t length,UBool firstTime)159 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
160 int32_t count;
161
162 if(firstTime) {
163 delta/=DAMP;
164 } else {
165 delta/=2;
166 }
167
168 delta+=delta/length;
169 for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
170 delta/=(BASE-TMIN);
171 }
172
173 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
174 }
175
176 #define MAX_CP_COUNT 200
177
178 U_CFUNC int32_t
u_strToPunycode(const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,const UBool * caseFlags,UErrorCode * pErrorCode)179 u_strToPunycode(const UChar *src, int32_t srcLength,
180 UChar *dest, int32_t destCapacity,
181 const UBool *caseFlags,
182 UErrorCode *pErrorCode) {
183
184 int32_t cpBuffer[MAX_CP_COUNT];
185 int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
186 UChar c, c2;
187
188 /* argument checking */
189 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
190 return 0;
191 }
192
193 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
194 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
195 return 0;
196 }
197
198 /*
199 * Handle the basic code points and
200 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
201 */
202 srcCPCount=destLength=0;
203 if(srcLength==-1) {
204 /* NUL-terminated input */
205 for(j=0; /* no condition */; ++j) {
206 if((c=src[j])==0) {
207 break;
208 }
209 if(srcCPCount==MAX_CP_COUNT) {
210 /* too many input code points */
211 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
212 return 0;
213 }
214 if(IS_BASIC(c)) {
215 cpBuffer[srcCPCount++]=0;
216 if(destLength<destCapacity) {
217 dest[destLength]=
218 caseFlags!=NULL ?
219 asciiCaseMap((char)c, caseFlags[j]) :
220 (char)c;
221 }
222 ++destLength;
223 } else {
224 n=(caseFlags!=NULL && caseFlags[j])<<31L;
225 if(UTF_IS_SINGLE(c)) {
226 n|=c;
227 } else if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(c2=src[j+1])) {
228 ++j;
229 n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
230 } else {
231 /* error: unmatched surrogate */
232 *pErrorCode=U_INVALID_CHAR_FOUND;
233 return 0;
234 }
235 cpBuffer[srcCPCount++]=n;
236 }
237 }
238 } else {
239 /* length-specified input */
240 for(j=0; j<srcLength; ++j) {
241 if(srcCPCount==MAX_CP_COUNT) {
242 /* too many input code points */
243 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
244 return 0;
245 }
246 c=src[j];
247 if(IS_BASIC(c)) {
248 cpBuffer[srcCPCount++]=0;
249 if(destLength<destCapacity) {
250 dest[destLength]=
251 caseFlags!=NULL ?
252 asciiCaseMap((char)c, caseFlags[j]) :
253 (char)c;
254 }
255 ++destLength;
256 } else {
257 n=(caseFlags!=NULL && caseFlags[j])<<31L;
258 if(UTF_IS_SINGLE(c)) {
259 n|=c;
260 } else if(UTF_IS_LEAD(c) && (j+1)<srcLength && UTF_IS_TRAIL(c2=src[j+1])) {
261 ++j;
262 n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
263 } else {
264 /* error: unmatched surrogate */
265 *pErrorCode=U_INVALID_CHAR_FOUND;
266 return 0;
267 }
268 cpBuffer[srcCPCount++]=n;
269 }
270 }
271 }
272
273 /* Finish the basic string - if it is not empty - with a delimiter. */
274 basicLength=destLength;
275 if(basicLength>0) {
276 if(destLength<destCapacity) {
277 dest[destLength]=DELIMITER;
278 }
279 ++destLength;
280 }
281
282 /*
283 * handledCPCount is the number of code points that have been handled
284 * basicLength is the number of basic code points
285 * destLength is the number of chars that have been output
286 */
287
288 /* Initialize the state: */
289 n=INITIAL_N;
290 delta=0;
291 bias=INITIAL_BIAS;
292
293 /* Main encoding loop: */
294 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
295 /*
296 * All non-basic code points < n have been handled already.
297 * Find the next larger one:
298 */
299 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
300 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
301 if(n<=q && q<m) {
302 m=q;
303 }
304 }
305
306 /*
307 * Increase delta enough to advance the decoder's
308 * <n,i> state to <m,0>, but guard against overflow:
309 */
310 if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
311 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
312 return 0;
313 }
314 delta+=(m-n)*(handledCPCount+1);
315 n=m;
316
317 /* Encode a sequence of same code points n */
318 for(j=0; j<srcCPCount; ++j) {
319 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
320 if(q<n) {
321 ++delta;
322 } else if(q==n) {
323 /* Represent delta as a generalized variable-length integer: */
324 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
325
326 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
327
328 t=k-bias;
329 if(t<TMIN) {
330 t=TMIN;
331 } else if(t>TMAX) {
332 t=TMAX;
333 }
334 */
335
336 t=k-bias;
337 if(t<TMIN) {
338 t=TMIN;
339 } else if(k>=(bias+TMAX)) {
340 t=TMAX;
341 }
342
343 if(q<t) {
344 break;
345 }
346
347 if(destLength<destCapacity) {
348 dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
349 }
350 ++destLength;
351 q=(q-t)/(BASE-t);
352 }
353
354 if(destLength<destCapacity) {
355 dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
356 }
357 ++destLength;
358 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
359 delta=0;
360 ++handledCPCount;
361 }
362 }
363
364 ++delta;
365 ++n;
366 }
367
368 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
369 }
370
371 U_CFUNC int32_t
u_strFromPunycode(const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,UBool * caseFlags,UErrorCode * pErrorCode)372 u_strFromPunycode(const UChar *src, int32_t srcLength,
373 UChar *dest, int32_t destCapacity,
374 UBool *caseFlags,
375 UErrorCode *pErrorCode) {
376 int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
377 destCPCount, firstSupplementaryIndex, cpLength;
378 UChar b;
379
380 /* argument checking */
381 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
382 return 0;
383 }
384
385 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
386 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
387 return 0;
388 }
389
390 if(srcLength==-1) {
391 srcLength=u_strlen(src);
392 }
393
394 /*
395 * Handle the basic code points:
396 * Let basicLength be the number of input code points
397 * before the last delimiter, or 0 if there is none,
398 * then copy the first basicLength code points to the output.
399 *
400 * The two following loops iterate backward.
401 */
402 for(j=srcLength; j>0;) {
403 if(src[--j]==DELIMITER) {
404 break;
405 }
406 }
407 destLength=basicLength=destCPCount=j;
408
409 while(j>0) {
410 b=src[--j];
411 if(!IS_BASIC(b)) {
412 *pErrorCode=U_INVALID_CHAR_FOUND;
413 return 0;
414 }
415
416 if(j<destCapacity) {
417 dest[j]=(UChar)b;
418
419 if(caseFlags!=NULL) {
420 caseFlags[j]=IS_BASIC_UPPERCASE(b);
421 }
422 }
423 }
424
425 /* Initialize the state: */
426 n=INITIAL_N;
427 i=0;
428 bias=INITIAL_BIAS;
429 firstSupplementaryIndex=1000000000;
430
431 /*
432 * Main decoding loop:
433 * Start just after the last delimiter if any
434 * basic code points were copied; start at the beginning otherwise.
435 */
436 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
437 /*
438 * in is the index of the next character to be consumed, and
439 * destCPCount is the number of code points in the output array.
440 *
441 * Decode a generalized variable-length integer into delta,
442 * which gets added to i. The overflow checking is easier
443 * if we increase i as we go, then subtract off its starting
444 * value at the end to obtain delta.
445 */
446 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
447 if(in>=srcLength) {
448 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
449 return 0;
450 }
451
452 digit=basicToDigit[(uint8_t)src[in++]];
453 if(digit<0) {
454 *pErrorCode=U_INVALID_CHAR_FOUND;
455 return 0;
456 }
457 if(digit>(0x7fffffff-i)/w) {
458 /* integer overflow */
459 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
460 return 0;
461 }
462
463 i+=digit*w;
464 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
465 t=k-bias;
466 if(t<TMIN) {
467 t=TMIN;
468 } else if(t>TMAX) {
469 t=TMAX;
470 }
471 */
472 t=k-bias;
473 if(t<TMIN) {
474 t=TMIN;
475 } else if(k>=(bias+TMAX)) {
476 t=TMAX;
477 }
478 if(digit<t) {
479 break;
480 }
481
482 if(w>0x7fffffff/(BASE-t)) {
483 /* integer overflow */
484 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
485 return 0;
486 }
487 w*=BASE-t;
488 }
489
490 /*
491 * Modification from sample code:
492 * Increments destCPCount here,
493 * where needed instead of in for() loop tail.
494 */
495 ++destCPCount;
496 bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
497
498 /*
499 * i was supposed to wrap around from (incremented) destCPCount to 0,
500 * incrementing n each time, so we'll fix that now:
501 */
502 if(i/destCPCount>(0x7fffffff-n)) {
503 /* integer overflow */
504 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
505 return 0;
506 }
507
508 n+=i/destCPCount;
509 i%=destCPCount;
510 /* not needed for Punycode: */
511 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
512
513 if(n>0x10ffff || UTF_IS_SURROGATE(n)) {
514 /* Unicode code point overflow */
515 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
516 return 0;
517 }
518
519 /* Insert n at position i of the output: */
520 cpLength=UTF_CHAR_LENGTH(n);
521 if((destLength+cpLength)<=destCapacity) {
522 int32_t codeUnitIndex;
523
524 /*
525 * Handle indexes when supplementary code points are present.
526 *
527 * In almost all cases, there will be only BMP code points before i
528 * and even in the entire string.
529 * This is handled with the same efficiency as with UTF-32.
530 *
531 * Only the rare cases with supplementary code points are handled
532 * more slowly - but not too bad since this is an insertion anyway.
533 */
534 if(i<=firstSupplementaryIndex) {
535 codeUnitIndex=i;
536 if(cpLength>1) {
537 firstSupplementaryIndex=codeUnitIndex;
538 } else {
539 ++firstSupplementaryIndex;
540 }
541 } else {
542 codeUnitIndex=firstSupplementaryIndex;
543 UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
544 }
545
546 /* use the UChar index codeUnitIndex instead of the code point index i */
547 if(codeUnitIndex<destLength) {
548 uprv_memmove(dest+codeUnitIndex+cpLength,
549 dest+codeUnitIndex,
550 (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
551 if(caseFlags!=NULL) {
552 uprv_memmove(caseFlags+codeUnitIndex+cpLength,
553 caseFlags+codeUnitIndex,
554 destLength-codeUnitIndex);
555 }
556 }
557 if(cpLength==1) {
558 /* BMP, insert one code unit */
559 dest[codeUnitIndex]=(UChar)n;
560 } else {
561 /* supplementary character, insert two code units */
562 dest[codeUnitIndex]=UTF16_LEAD(n);
563 dest[codeUnitIndex+1]=UTF16_TRAIL(n);
564 }
565 if(caseFlags!=NULL) {
566 /* Case of last character determines uppercase flag: */
567 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
568 if(cpLength==2) {
569 caseFlags[codeUnitIndex+1]=FALSE;
570 }
571 }
572 }
573 destLength+=cpLength;
574 ++i;
575 }
576
577 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
578 }
579
580 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
581
582 #endif /* #if !UCONFIG_NO_IDNA */
583