• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14  */
15 
16 /* Derived from https://github.com/bnoordhuis/punycode
17  * but updated to support IDNA 2008.
18  */
19 
20 #include "uv.h"
21 #include "idna.h"
22 #include <assert.h>
23 #include <string.h>
24 #include <limits.h> /* UINT_MAX */
25 
/* Decode the tail of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte (>= 0x80), already consumed by the caller; |*p| points
 * at the first byte after it and |pe| is one past the end of the input.
 *
 * Returns the decoded code point, or (unsigned) -1 when the sequence is
 * malformed: an invalid lead byte, a stray continuation byte used as lead,
 * fewer continuation bytes available than the lead byte demands, a
 * continuation byte that is not of the form 10xxxxxx, an overlong encoding,
 * a value above U+10FFFF or a UTF-16 surrogate.
 *
 * |*p| is left untouched when the lead byte is rejected or the input is
 * truncated; otherwise it is advanced past every continuation byte of the
 * sequence, including when one of them turns out to be invalid.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned b;
  unsigned min;
  unsigned bad;
  int n;
  int i;

  if (a > 0xF7)
    return -1;  /* 0xF8..0xFF never start a sequence. */

  /* The lead byte fixes the sequence length, the payload bits it carries
   * and the smallest code point that legitimately needs that length.
   */
  if (a > 0xEF) {
    n = 3;
    min = 0x10000;
    a &= 7;
  } else if (a > 0xDF) {
    n = 2;
    min = 0x800;
    a &= 15;
  } else if (a > 0xBF) {
    n = 1;
    min = 0x80;
    a &= 31;
  } else {
    return -1;  /* Continuation byte in lead position. */
  }

  /* Never reinterpret a truncated long sequence as a shorter one. */
  if (pe - *p < n)
    return -1;  /* Truncated sequence. */

  bad = 0;
  for (i = 0; i < n; i++) {
    b = (unsigned char) *(*p)++;
    bad |= 0x80 ^ (b & 0xC0);  /* Non-zero unless top bits are 10. */
    a = (a << 6) | (b & 63);
  }

  if (bad != 0)
    return -1;  /* Invalid continuation byte. */

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
91 
/* Decode one UTF-8 code point starting at |*p| and advance |*p| past the
 * bytes that were consumed. |pe| is one past the end of the input and the
 * caller must guarantee that at least one byte is available.
 *
 * Single-byte (ASCII) input is returned directly; anything else is handed to
 * uv__utf8_decode1_slow() together with the lead byte that was just read.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  unsigned lead;

  assert(*p < pe);

  lead = (unsigned char) **p;
  *p += 1;

  if (lead >= 128)
    return uv__utf8_decode1_slow(p, pe, lead);  /* Multi-byte sequence. */

  return lead;  /* ASCII, common case. */
}
104 
uv__idna_toascii_label(const char * s,const char * se,char ** d,char * de)105 static int uv__idna_toascii_label(const char* s, const char* se,
106                                   char** d, char* de) {
107   static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
108   const char* ss;
109   unsigned c;
110   unsigned h;
111   unsigned k;
112   unsigned n;
113   unsigned m;
114   unsigned q;
115   unsigned t;
116   unsigned x;
117   unsigned y;
118   unsigned bias;
119   unsigned delta;
120   unsigned todo;
121   int first;
122 
123   h = 0;
124   ss = s;
125   todo = 0;
126 
127   /* Note: after this loop we've visited all UTF-8 characters and know
128    * they're legal so we no longer need to check for decode errors.
129    */
130   while (s < se) {
131     c = uv__utf8_decode1(&s, se);
132 
133     if (c == UINT_MAX)
134       return UV_EINVAL;
135 
136     if (c < 128)
137       h++;
138     else
139       todo++;
140   }
141 
142   /* Only write "xn--" when there are non-ASCII characters. */
143   if (todo > 0) {
144     if (*d < de) *(*d)++ = 'x';
145     if (*d < de) *(*d)++ = 'n';
146     if (*d < de) *(*d)++ = '-';
147     if (*d < de) *(*d)++ = '-';
148   }
149 
150   /* Write ASCII characters. */
151   x = 0;
152   s = ss;
153   while (s < se) {
154     c = uv__utf8_decode1(&s, se);
155     assert(c != UINT_MAX);
156 
157     if (c > 127)
158       continue;
159 
160     if (*d < de)
161       *(*d)++ = c;
162 
163     if (++x == h)
164       break;  /* Visited all ASCII characters. */
165   }
166 
167   if (todo == 0)
168     return h;
169 
170   /* Only write separator when we've written ASCII characters first. */
171   if (h > 0)
172     if (*d < de)
173       *(*d)++ = '-';
174 
175   n = 128;
176   bias = 72;
177   delta = 0;
178   first = 1;
179 
180   while (todo > 0) {
181     m = -1;
182     s = ss;
183 
184     while (s < se) {
185       c = uv__utf8_decode1(&s, se);
186       assert(c != UINT_MAX);
187 
188       if (c >= n)
189         if (c < m)
190           m = c;
191     }
192 
193     x = m - n;
194     y = h + 1;
195 
196     if (x > ~delta / y)
197       return UV_E2BIG;  /* Overflow. */
198 
199     delta += x * y;
200     n = m;
201 
202     s = ss;
203     while (s < se) {
204       c = uv__utf8_decode1(&s, se);
205       assert(c != UINT_MAX);
206 
207       if (c < n)
208         if (++delta == 0)
209           return UV_E2BIG;  /* Overflow. */
210 
211       if (c != n)
212         continue;
213 
214       for (k = 36, q = delta; /* empty */; k += 36) {
215         t = 1;
216 
217         if (k > bias)
218           t = k - bias;
219 
220         if (t > 26)
221           t = 26;
222 
223         if (q < t)
224           break;
225 
226         /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
227          * 10 <= y <= 35, we can optimize the long division
228          * into a table-based reciprocal multiplication.
229          */
230         x = q - t;
231         y = 36 - t;  /* 10 <= y <= 35 since 1 <= t <= 26. */
232         q = x / y;
233         t = t + x % y;  /* 1 <= t <= 35 because of y. */
234 
235         if (*d < de)
236           *(*d)++ = alphabet[t];
237       }
238 
239       if (*d < de)
240         *(*d)++ = alphabet[q];
241 
242       delta /= 2;
243 
244       if (first) {
245         delta /= 350;
246         first = 0;
247       }
248 
249       /* No overflow check is needed because |delta| was just
250        * divided by 2 and |delta+delta >= delta + delta/h|.
251        */
252       h++;
253       delta += delta / h;
254 
255       for (bias = 0; delta > 35 * 26 / 2; bias += 36)
256         delta /= 35;
257 
258       bias += 36 * delta / (delta + 38);
259       delta = 0;
260       todo--;
261     }
262 
263     delta++;
264     n++;
265   }
266 
267   return 0;
268 }
269 
uv__idna_toascii(const char * s,const char * se,char * d,char * de)270 long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
271   const char* si;
272   const char* st;
273   unsigned c;
274   char* ds;
275   int rc;
276 
277   if (s == se)
278     return UV_EINVAL;
279 
280   ds = d;
281 
282   si = s;
283   while (si < se) {
284     st = si;
285     c = uv__utf8_decode1(&si, se);
286 
287     if (c == UINT_MAX)
288       return UV_EINVAL;
289 
290     if (c != '.')
291       if (c != 0x3002)  /* 。 */
292         if (c != 0xFF0E)  /* . */
293           if (c != 0xFF61)  /* 。 */
294             continue;
295 
296     rc = uv__idna_toascii_label(s, st, &d, de);
297 
298     if (rc < 0)
299       return rc;
300 
301     if (d < de)
302       *d++ = '.';
303 
304     s = si;
305   }
306 
307   if (s < se) {
308     rc = uv__idna_toascii_label(s, se, &d, de);
309 
310     if (rc < 0)
311       return rc;
312   }
313 
314   if (d >= de)
315     return UV_EINVAL;
316 
317   *d++ = '\0';
318   return d - ds;  /* Number of bytes written. */
319 }
320