1 /* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
2 *
3 * Permission to use, copy, modify, and/or distribute this software for any
4 * purpose with or without fee is hereby granted, provided that the above
5 * copyright notice and this permission notice appear in all copies.
6 *
7 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14 */
15
16 /* Derived from https://github.com/bnoordhuis/punycode
17 * but updated to support IDNA 2008.
18 */
19
20 #include "uv.h"
21 #include "idna.h"
22 #include <assert.h>
23 #include <string.h>
24 #include <limits.h> /* UINT_MAX */
25
/* Decode the tail of a multi-byte UTF-8 sequence whose lead byte |a| has
 * already been consumed by the caller.
 *
 * |*p| points at the byte following the lead byte and |pe| one past the end
 * of the input. Returns the decoded code point, or (unsigned) -1 when the
 * sequence is malformed: a bad lead byte, truncated input, a trailing byte
 * that is not a continuation byte, an overlong encoding, a surrogate, or a
 * value above U+10FFFF.
 *
 * A bad lead byte or a truncated sequence leaves |*p| untouched. Otherwise
 * |*p| is advanced past all trailing bytes the lead byte calls for, even
 * when decoding subsequently fails.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned b;
  unsigned bad;
  unsigned min;
  int need;
  int i;

  if (a > 0xF7)
    return -1;  /* 0xF8..0xFF never start a sequence. */

  /* The lead byte fixes the number of continuation bytes and the smallest
   * code point that may legitimately be encoded with that many bytes.
   */
  if (a > 0xEF) {
    need = 3;
    min = 0x10000;
  } else if (a > 0xDF) {
    need = 2;
    min = 0x800;
  } else if (a > 0xBF) {
    need = 1;
    min = 0x80;
  } else {
    return -1;  /* Invalid continuation byte. */
  }

  /* Reject a truncated sequence outright instead of reinterpreting the
   * lead byte as the start of a shorter sequence.
   */
  if (pe - *p < need)
    return -1;

  a &= 0x3F >> need;  /* Payload bits of the lead byte: 5, 4 or 3 of them. */
  bad = 0;

  for (i = 0; i < need; i++) {
    b = (unsigned char) *(*p)++;
    /* Check every byte on its own; combining the bytes before testing lets
     * two malformed bytes cancel each other out.
     */
    bad |= 0x80 ^ (0xC0 & b);  /* Non-zero unless b is 10xxxxxx. */
    a = (a << 6) | (b & 63);
  }

  if (bad)
    return -1;  /* Invalid sequence. */

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
91
/* Decode one UTF-8 code point starting at |*p| and advance |*p| past the
 * bytes that were consumed. |pe| points one past the end of the input; the
 * caller must make sure at least one byte is available (asserted).
 *
 * ASCII is handled inline. Anything else is delegated to
 * uv__utf8_decode1_slow() and its result is returned unchanged.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  const char* cur;
  unsigned lead;

  cur = *p;
  assert(cur < pe);

  lead = (unsigned char) *cur;
  *p = cur + 1;

  if (lead >= 0x80)
    return uv__utf8_decode1_slow(p, pe, lead);

  return lead;  /* ASCII, common case. */
}
104
uv__idna_toascii_label(const char * s,const char * se,char ** d,char * de)105 static int uv__idna_toascii_label(const char* s, const char* se,
106 char** d, char* de) {
107 static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
108 const char* ss;
109 unsigned c;
110 unsigned h;
111 unsigned k;
112 unsigned n;
113 unsigned m;
114 unsigned q;
115 unsigned t;
116 unsigned x;
117 unsigned y;
118 unsigned bias;
119 unsigned delta;
120 unsigned todo;
121 int first;
122
123 h = 0;
124 ss = s;
125 todo = 0;
126
127 /* Note: after this loop we've visited all UTF-8 characters and know
128 * they're legal so we no longer need to check for decode errors.
129 */
130 while (s < se) {
131 c = uv__utf8_decode1(&s, se);
132
133 if (c == UINT_MAX)
134 return UV_EINVAL;
135
136 if (c < 128)
137 h++;
138 else
139 todo++;
140 }
141
142 /* Only write "xn--" when there are non-ASCII characters. */
143 if (todo > 0) {
144 if (*d < de) *(*d)++ = 'x';
145 if (*d < de) *(*d)++ = 'n';
146 if (*d < de) *(*d)++ = '-';
147 if (*d < de) *(*d)++ = '-';
148 }
149
150 /* Write ASCII characters. */
151 x = 0;
152 s = ss;
153 while (s < se) {
154 c = uv__utf8_decode1(&s, se);
155 assert(c != UINT_MAX);
156
157 if (c > 127)
158 continue;
159
160 if (*d < de)
161 *(*d)++ = c;
162
163 if (++x == h)
164 break; /* Visited all ASCII characters. */
165 }
166
167 if (todo == 0)
168 return h;
169
170 /* Only write separator when we've written ASCII characters first. */
171 if (h > 0)
172 if (*d < de)
173 *(*d)++ = '-';
174
175 n = 128;
176 bias = 72;
177 delta = 0;
178 first = 1;
179
180 while (todo > 0) {
181 m = -1;
182 s = ss;
183
184 while (s < se) {
185 c = uv__utf8_decode1(&s, se);
186 assert(c != UINT_MAX);
187
188 if (c >= n)
189 if (c < m)
190 m = c;
191 }
192
193 x = m - n;
194 y = h + 1;
195
196 if (x > ~delta / y)
197 return UV_E2BIG; /* Overflow. */
198
199 delta += x * y;
200 n = m;
201
202 s = ss;
203 while (s < se) {
204 c = uv__utf8_decode1(&s, se);
205 assert(c != UINT_MAX);
206
207 if (c < n)
208 if (++delta == 0)
209 return UV_E2BIG; /* Overflow. */
210
211 if (c != n)
212 continue;
213
214 for (k = 36, q = delta; /* empty */; k += 36) {
215 t = 1;
216
217 if (k > bias)
218 t = k - bias;
219
220 if (t > 26)
221 t = 26;
222
223 if (q < t)
224 break;
225
226 /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
227 * 10 <= y <= 35, we can optimize the long division
228 * into a table-based reciprocal multiplication.
229 */
230 x = q - t;
231 y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */
232 q = x / y;
233 t = t + x % y; /* 1 <= t <= 35 because of y. */
234
235 if (*d < de)
236 *(*d)++ = alphabet[t];
237 }
238
239 if (*d < de)
240 *(*d)++ = alphabet[q];
241
242 delta /= 2;
243
244 if (first) {
245 delta /= 350;
246 first = 0;
247 }
248
249 /* No overflow check is needed because |delta| was just
250 * divided by 2 and |delta+delta >= delta + delta/h|.
251 */
252 h++;
253 delta += delta / h;
254
255 for (bias = 0; delta > 35 * 26 / 2; bias += 36)
256 delta /= 35;
257
258 bias += 36 * delta / (delta + 38);
259 delta = 0;
260 todo--;
261 }
262
263 delta++;
264 n++;
265 }
266
267 return 0;
268 }
269
uv__idna_toascii(const char * s,const char * se,char * d,char * de)270 long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
271 const char* si;
272 const char* st;
273 unsigned c;
274 char* ds;
275 int rc;
276
277 if (s == se)
278 return UV_EINVAL;
279
280 ds = d;
281
282 si = s;
283 while (si < se) {
284 st = si;
285 c = uv__utf8_decode1(&si, se);
286
287 if (c == UINT_MAX)
288 return UV_EINVAL;
289
290 if (c != '.')
291 if (c != 0x3002) /* 。 */
292 if (c != 0xFF0E) /* . */
293 if (c != 0xFF61) /* 。 */
294 continue;
295
296 rc = uv__idna_toascii_label(s, st, &d, de);
297
298 if (rc < 0)
299 return rc;
300
301 if (d < de)
302 *d++ = '.';
303
304 s = si;
305 }
306
307 if (s < se) {
308 rc = uv__idna_toascii_label(s, se, &d, de);
309
310 if (rc < 0)
311 return rc;
312 }
313
314 if (d >= de)
315 return UV_EINVAL;
316
317 *d++ = '\0';
318 return d - ds; /* Number of bytes written. */
319 }
320