• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2  *******************************************************************************
3  *
4  *   Copyright (C) 2002-2011, International Business Machines
5  *   Corporation and others.  All Rights Reserved.
6  *
7  *******************************************************************************
8  *   file name:  punycode.cpp
9  *   encoding:   US-ASCII
10  *   tab size:   8 (not used)
11  *   indentation:4
12  *
13  *   created on: 2002jan31
14  *   created by: Markus W. Scherer
15  */
16  
17  
18  /* This ICU code derived from: */
19  /*
20  punycode.c 0.4.0 (2001-Nov-17-Sat)
21  http://www.cs.berkeley.edu/~amc/idn/
22  Adam M. Costello
23  http://www.nicemice.net/amc/
24  
25  Disclaimer and license
26  
27      Regarding this entire document or any portion of it (including
28      the pseudocode and C code), the author makes no guarantees and
29      is not responsible for any damage resulting from its use.  The
30      author grants irrevocable permission to anyone to use, modify,
31      and distribute it in any way that does not diminish the rights
32      of anyone else to use, modify, and distribute it, provided that
33      redistributed derivative works do not contain misleading author or
34      version information.  Derivative works need not be licensed under
35      similar terms.
36  */
37  /*
38   * ICU modifications:
39   * - ICU data types and coding conventions
40   * - ICU string buffer handling with implicit source lengths
41   *   and destination preflighting
42   * - UTF-16 handling
43   */
44  
45  #include "unicode/utypes.h"
46  
47  #if !UCONFIG_NO_IDNA
48  
49  #include "unicode/ustring.h"
50  #include "unicode/utf.h"
51  #include "unicode/utf16.h"
52  #include "ustr_imp.h"
53  #include "cstring.h"
54  #include "cmemory.h"
55  #include "punycode.h"
56  #include "uassert.h"
57  
58  
59  /* Punycode ----------------------------------------------------------------- */
60  
61  /* Punycode parameters for Bootstring */
62  #define BASE            36
63  #define TMIN            1
64  #define TMAX            26
65  #define SKEW            38
66  #define DAMP            700
67  #define INITIAL_BIAS    72
68  #define INITIAL_N       0x80
69  
70  /* "Basic" Unicode/ASCII code points */
71  #define _HYPHEN         0X2d
72  #define DELIMITER       _HYPHEN
73  
74  #define _ZERO_          0X30
75  #define _NINE           0x39
76  
77  #define _SMALL_A        0X61
78  #define _SMALL_Z        0X7a
79  
80  #define _CAPITAL_A      0X41
81  #define _CAPITAL_Z      0X5a
82  
83  #define IS_BASIC(c) ((c)<0x80)
84  #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
85  
86  /**
87   * digitToBasic() returns the basic code point whose value
88   * (when used for representing integers) is d, which must be in the
89   * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
90   * nonzero, in which case the uppercase form is used.
91   */
92  static inline char
digitToBasic(int32_t digit,UBool uppercase)93  digitToBasic(int32_t digit, UBool uppercase) {
94      /*  0..25 map to ASCII a..z or A..Z */
95      /* 26..35 map to ASCII 0..9         */
96      if(digit<26) {
97          if(uppercase) {
98              return (char)(_CAPITAL_A+digit);
99          } else {
100              return (char)(_SMALL_A+digit);
101          }
102      } else {
103          return (char)((_ZERO_-26)+digit);
104      }
105  }
106  
107  /**
108   * basicToDigit[] contains the numeric value of a basic code
109   * point (for use in representing integers) in the range 0 to
110   * BASE-1, or -1 if b is does not represent a value.
111   */
112  static const int8_t
113  basicToDigit[256]={
114      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
115      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
116  
117      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
118      26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
119  
120      -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
121      15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
122  
123      -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
124      15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
125  
126      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
127      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
128  
129      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
130      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
131  
132      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
133      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
134  
135      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
136      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
137  };
138  
139  static inline char
asciiCaseMap(char b,UBool uppercase)140  asciiCaseMap(char b, UBool uppercase) {
141      if(uppercase) {
142          if(_SMALL_A<=b && b<=_SMALL_Z) {
143              b-=(_SMALL_A-_CAPITAL_A);
144          }
145      } else {
146          if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
147              b+=(_SMALL_A-_CAPITAL_A);
148          }
149      }
150      return b;
151  }
152  
153  /* Punycode-specific Bootstring code ---------------------------------------- */
154  
155  /*
156   * The following code omits the {parts} of the pseudo-algorithm in the spec
157   * that are not used with the Punycode parameter set.
158   */
159  
160  /* Bias adaptation function. */
161  static int32_t
adaptBias(int32_t delta,int32_t length,UBool firstTime)162  adaptBias(int32_t delta, int32_t length, UBool firstTime) {
163      int32_t count;
164  
165      if(firstTime) {
166          delta/=DAMP;
167      } else {
168          delta/=2;
169      }
170  
171      delta+=delta/length;
172      for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
173          delta/=(BASE-TMIN);
174      }
175  
176      return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
177  }
178  
179  #define MAX_CP_COUNT    200
180  
181  U_CFUNC int32_t
u_strToPunycode(const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,const UBool * caseFlags,UErrorCode * pErrorCode)182  u_strToPunycode(const UChar *src, int32_t srcLength,
183                  UChar *dest, int32_t destCapacity,
184                  const UBool *caseFlags,
185                  UErrorCode *pErrorCode) {
186  
187      int32_t cpBuffer[MAX_CP_COUNT];
188      int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
189      UChar c, c2;
190  
191      /* argument checking */
192      if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
193          return 0;
194      }
195  
196      if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
197          *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
198          return 0;
199      }
200  
201      /*
202       * Handle the basic code points and
203       * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
204       */
205      srcCPCount=destLength=0;
206      if(srcLength==-1) {
207          /* NUL-terminated input */
208          for(j=0; /* no condition */; ++j) {
209              if((c=src[j])==0) {
210                  break;
211              }
212              if(srcCPCount==MAX_CP_COUNT) {
213                  /* too many input code points */
214                  *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
215                  return 0;
216              }
217              if(IS_BASIC(c)) {
218                  cpBuffer[srcCPCount++]=0;
219                  if(destLength<destCapacity) {
220                      dest[destLength]=
221                          caseFlags!=NULL ?
222                              asciiCaseMap((char)c, caseFlags[j]) :
223                              (char)c;
224                  }
225                  ++destLength;
226              } else {
227                  n=(caseFlags!=NULL && caseFlags[j])<<31L;
228                  if(U16_IS_SINGLE(c)) {
229                      n|=c;
230                  } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
231                      ++j;
232                      n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
233                  } else {
234                      /* error: unmatched surrogate */
235                      *pErrorCode=U_INVALID_CHAR_FOUND;
236                      return 0;
237                  }
238                  cpBuffer[srcCPCount++]=n;
239              }
240          }
241      } else {
242          /* length-specified input */
243          for(j=0; j<srcLength; ++j) {
244              if(srcCPCount==MAX_CP_COUNT) {
245                  /* too many input code points */
246                  *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
247                  return 0;
248              }
249              c=src[j];
250              if(IS_BASIC(c)) {
251                  cpBuffer[srcCPCount++]=0;
252                  if(destLength<destCapacity) {
253                      dest[destLength]=
254                          caseFlags!=NULL ?
255                              asciiCaseMap((char)c, caseFlags[j]) :
256                              (char)c;
257                  }
258                  ++destLength;
259              } else {
260                  n=(caseFlags!=NULL && caseFlags[j])<<31L;
261                  if(U16_IS_SINGLE(c)) {
262                      n|=c;
263                  } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
264                      ++j;
265                      n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
266                  } else {
267                      /* error: unmatched surrogate */
268                      *pErrorCode=U_INVALID_CHAR_FOUND;
269                      return 0;
270                  }
271                  cpBuffer[srcCPCount++]=n;
272              }
273          }
274      }
275  
276      /* Finish the basic string - if it is not empty - with a delimiter. */
277      basicLength=destLength;
278      if(basicLength>0) {
279          if(destLength<destCapacity) {
280              dest[destLength]=DELIMITER;
281          }
282          ++destLength;
283      }
284  
285      /*
286       * handledCPCount is the number of code points that have been handled
287       * basicLength is the number of basic code points
288       * destLength is the number of chars that have been output
289       */
290  
291      /* Initialize the state: */
292      n=INITIAL_N;
293      delta=0;
294      bias=INITIAL_BIAS;
295  
296      /* Main encoding loop: */
297      for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
298          /*
299           * All non-basic code points < n have been handled already.
300           * Find the next larger one:
301           */
302          for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
303              q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
304              if(n<=q && q<m) {
305                  m=q;
306              }
307          }
308  
309          /*
310           * Increase delta enough to advance the decoder's
311           * <n,i> state to <m,0>, but guard against overflow:
312           */
313          if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
314              *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
315              return 0;
316          }
317          delta+=(m-n)*(handledCPCount+1);
318          n=m;
319  
320          /* Encode a sequence of same code points n */
321          for(j=0; j<srcCPCount; ++j) {
322              q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
323              if(q<n) {
324                  ++delta;
325              } else if(q==n) {
326                  /* Represent delta as a generalized variable-length integer: */
327                  for(q=delta, k=BASE; /* no condition */; k+=BASE) {
328  
329                      /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
330  
331                      t=k-bias;
332                      if(t<TMIN) {
333                          t=TMIN;
334                      } else if(t>TMAX) {
335                          t=TMAX;
336                      }
337                      */
338  
339                      t=k-bias;
340                      if(t<TMIN) {
341                          t=TMIN;
342                      } else if(k>=(bias+TMAX)) {
343                          t=TMAX;
344                      }
345  
346                      if(q<t) {
347                          break;
348                      }
349  
350                      if(destLength<destCapacity) {
351                          dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
352                      }
353                      ++destLength;
354                      q=(q-t)/(BASE-t);
355                  }
356  
357                  if(destLength<destCapacity) {
358                      dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
359                  }
360                  ++destLength;
361                  bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
362                  delta=0;
363                  ++handledCPCount;
364              }
365          }
366  
367          ++delta;
368          ++n;
369      }
370  
371      return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
372  }
373  
374  U_CFUNC int32_t
u_strFromPunycode(const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,UBool * caseFlags,UErrorCode * pErrorCode)375  u_strFromPunycode(const UChar *src, int32_t srcLength,
376                    UChar *dest, int32_t destCapacity,
377                    UBool *caseFlags,
378                    UErrorCode *pErrorCode) {
379      int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
380              destCPCount, firstSupplementaryIndex, cpLength;
381      UChar b;
382  
383      /* argument checking */
384      if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
385          return 0;
386      }
387  
388      if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
389          *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
390          return 0;
391      }
392  
393      if(srcLength==-1) {
394          srcLength=u_strlen(src);
395      }
396  
397      /*
398       * Handle the basic code points:
399       * Let basicLength be the number of input code points
400       * before the last delimiter, or 0 if there is none,
401       * then copy the first basicLength code points to the output.
402       *
403       * The two following loops iterate backward.
404       */
405      for(j=srcLength; j>0;) {
406          if(src[--j]==DELIMITER) {
407              break;
408          }
409      }
410      destLength=basicLength=destCPCount=j;
411      U_ASSERT(destLength>=0);
412  
413      while(j>0) {
414          b=src[--j];
415          if(!IS_BASIC(b)) {
416              *pErrorCode=U_INVALID_CHAR_FOUND;
417              return 0;
418          }
419  
420          if(j<destCapacity) {
421              dest[j]=(UChar)b;
422  
423              if(caseFlags!=NULL) {
424                  caseFlags[j]=IS_BASIC_UPPERCASE(b);
425              }
426          }
427      }
428  
429      /* Initialize the state: */
430      n=INITIAL_N;
431      i=0;
432      bias=INITIAL_BIAS;
433      firstSupplementaryIndex=1000000000;
434  
435      /*
436       * Main decoding loop:
437       * Start just after the last delimiter if any
438       * basic code points were copied; start at the beginning otherwise.
439       */
440      for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
441          /*
442           * in is the index of the next character to be consumed, and
443           * destCPCount is the number of code points in the output array.
444           *
445           * Decode a generalized variable-length integer into delta,
446           * which gets added to i.  The overflow checking is easier
447           * if we increase i as we go, then subtract off its starting
448           * value at the end to obtain delta.
449           */
450          for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
451              if(in>=srcLength) {
452                  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
453                  return 0;
454              }
455  
456              digit=basicToDigit[(uint8_t)src[in++]];
457              if(digit<0) {
458                  *pErrorCode=U_INVALID_CHAR_FOUND;
459                  return 0;
460              }
461              if(digit>(0x7fffffff-i)/w) {
462                  /* integer overflow */
463                  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
464                  return 0;
465              }
466  
467              i+=digit*w;
468              /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
469              t=k-bias;
470              if(t<TMIN) {
471                  t=TMIN;
472              } else if(t>TMAX) {
473                  t=TMAX;
474              }
475              */
476              t=k-bias;
477              if(t<TMIN) {
478                  t=TMIN;
479              } else if(k>=(bias+TMAX)) {
480                  t=TMAX;
481              }
482              if(digit<t) {
483                  break;
484              }
485  
486              if(w>0x7fffffff/(BASE-t)) {
487                  /* integer overflow */
488                  *pErrorCode=U_ILLEGAL_CHAR_FOUND;
489                  return 0;
490              }
491              w*=BASE-t;
492          }
493  
494          /*
495           * Modification from sample code:
496           * Increments destCPCount here,
497           * where needed instead of in for() loop tail.
498           */
499          ++destCPCount;
500          bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
501  
502          /*
503           * i was supposed to wrap around from (incremented) destCPCount to 0,
504           * incrementing n each time, so we'll fix that now:
505           */
506          if(i/destCPCount>(0x7fffffff-n)) {
507              /* integer overflow */
508              *pErrorCode=U_ILLEGAL_CHAR_FOUND;
509              return 0;
510          }
511  
512          n+=i/destCPCount;
513          i%=destCPCount;
514          /* not needed for Punycode: */
515          /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
516  
517          if(n>0x10ffff || U_IS_SURROGATE(n)) {
518              /* Unicode code point overflow */
519              *pErrorCode=U_ILLEGAL_CHAR_FOUND;
520              return 0;
521          }
522  
523          /* Insert n at position i of the output: */
524          cpLength=U16_LENGTH(n);
525          if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
526              int32_t codeUnitIndex;
527  
528              /*
529               * Handle indexes when supplementary code points are present.
530               *
531               * In almost all cases, there will be only BMP code points before i
532               * and even in the entire string.
533               * This is handled with the same efficiency as with UTF-32.
534               *
535               * Only the rare cases with supplementary code points are handled
536               * more slowly - but not too bad since this is an insertion anyway.
537               */
538              if(i<=firstSupplementaryIndex) {
539                  codeUnitIndex=i;
540                  if(cpLength>1) {
541                      firstSupplementaryIndex=codeUnitIndex;
542                  } else {
543                      ++firstSupplementaryIndex;
544                  }
545              } else {
546                  codeUnitIndex=firstSupplementaryIndex;
547                  U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
548              }
549  
550              /* use the UChar index codeUnitIndex instead of the code point index i */
551              if(codeUnitIndex<destLength) {
552                  uprv_memmove(dest+codeUnitIndex+cpLength,
553                               dest+codeUnitIndex,
554                               (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
555                  if(caseFlags!=NULL) {
556                      uprv_memmove(caseFlags+codeUnitIndex+cpLength,
557                                   caseFlags+codeUnitIndex,
558                                   destLength-codeUnitIndex);
559                  }
560              }
561              if(cpLength==1) {
562                  /* BMP, insert one code unit */
563                  dest[codeUnitIndex]=(UChar)n;
564              } else {
565                  /* supplementary character, insert two code units */
566                  dest[codeUnitIndex]=U16_LEAD(n);
567                  dest[codeUnitIndex+1]=U16_TRAIL(n);
568              }
569              if(caseFlags!=NULL) {
570                  /* Case of last character determines uppercase flag: */
571                  caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
572                  if(cpLength==2) {
573                      caseFlags[codeUnitIndex+1]=FALSE;
574                  }
575              }
576          }
577          destLength+=cpLength;
578          U_ASSERT(destLength>=0);
579          ++i;
580      }
581  
582      return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
583  }
584  
585  /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
586  
587  #endif /* #if !UCONFIG_NO_IDNA */
588