1 /* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl> 2 * 3 * Permission to use, copy, modify, and/or distribute this software for any 4 * purpose with or without fee is hereby granted, provided that the above 5 * copyright notice and this permission notice appear in all copies. 6 * 7 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 */ 15 16 /* Derived from https://github.com/bnoordhuis/punycode 17 * but updated to support IDNA 2008. 18 */ 19 20 #include "uv.h" 21 #include "idna.h" 22 #include <assert.h> 23 #include <string.h> 24 #include <limits.h> /* UINT_MAX */ 25 uv__utf8_decode1_slow(const char ** p,const char * pe,unsigned a)26 static unsigned uv__utf8_decode1_slow(const char** p, 27 const char* pe, 28 unsigned a) { 29 unsigned b; 30 unsigned c; 31 unsigned d; 32 unsigned min; 33 34 if (a > 0xF7) 35 return -1; 36 37 switch (pe - *p) { 38 default: 39 if (a > 0xEF) { 40 min = 0x10000; 41 a = a & 7; 42 b = (unsigned char) *(*p)++; 43 c = (unsigned char) *(*p)++; 44 d = (unsigned char) *(*p)++; 45 break; 46 } 47 /* Fall through. */ 48 case 2: 49 if (a > 0xDF) { 50 min = 0x800; 51 b = 0x80 | (a & 15); 52 c = (unsigned char) *(*p)++; 53 d = (unsigned char) *(*p)++; 54 a = 0; 55 break; 56 } 57 /* Fall through. */ 58 case 1: 59 if (a > 0xBF) { 60 min = 0x80; 61 b = 0x80; 62 c = 0x80 | (a & 31); 63 d = (unsigned char) *(*p)++; 64 a = 0; 65 break; 66 } 67 /* Fall through. */ 68 case 0: 69 return -1; /* Invalid continuation byte. */ 70 } 71 72 if (0x80 != (0xC0 & (b ^ c ^ d))) 73 return -1; /* Invalid sequence. */ 74 75 b &= 63; 76 c &= 63; 77 d &= 63; 78 a = (a << 18) | (b << 12) | (c << 6) | d; 79 80 if (a < min) 81 return -1; /* Overlong sequence. */ 82 83 if (a > 0x10FFFF) 84 return -1; /* Four-byte sequence > U+10FFFF. */ 85 86 if (a >= 0xD800 && a <= 0xDFFF) 87 return -1; /* Surrogate pair. */ 88 89 return a; 90 } 91 uv__utf8_decode1(const char ** p,const char * pe)92 unsigned uv__utf8_decode1(const char** p, const char* pe) { 93 unsigned a; 94 95 assert(*p < pe); 96 97 a = (unsigned char) *(*p)++; 98 99 if (a < 128) 100 return a; /* ASCII, common case. */ 101 102 return uv__utf8_decode1_slow(p, pe, a); 103 } 104 uv__idna_toascii_label(const char * s,const char * se,char ** d,char * de)105 static int uv__idna_toascii_label(const char* s, const char* se, 106 char** d, char* de) { 107 static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789"; 108 const char* ss; 109 unsigned c; 110 unsigned h; 111 unsigned k; 112 unsigned n; 113 unsigned m; 114 unsigned q; 115 unsigned t; 116 unsigned x; 117 unsigned y; 118 unsigned bias; 119 unsigned delta; 120 unsigned todo; 121 int first; 122 123 h = 0; 124 ss = s; 125 todo = 0; 126 127 /* Note: after this loop we've visited all UTF-8 characters and know 128 * they're legal so we no longer need to check for decode errors. 129 */ 130 while (s < se) { 131 c = uv__utf8_decode1(&s, se); 132 133 if (c == UINT_MAX) 134 return UV_EINVAL; 135 136 if (c < 128) 137 h++; 138 else 139 todo++; 140 } 141 142 /* Only write "xn--" when there are non-ASCII characters. */ 143 if (todo > 0) { 144 if (*d < de) *(*d)++ = 'x'; 145 if (*d < de) *(*d)++ = 'n'; 146 if (*d < de) *(*d)++ = '-'; 147 if (*d < de) *(*d)++ = '-'; 148 } 149 150 /* Write ASCII characters. */ 151 x = 0; 152 s = ss; 153 while (s < se) { 154 c = uv__utf8_decode1(&s, se); 155 assert(c != UINT_MAX); 156 157 if (c > 127) 158 continue; 159 160 if (*d < de) 161 *(*d)++ = c; 162 163 if (++x == h) 164 break; /* Visited all ASCII characters. */ 165 } 166 167 if (todo == 0) 168 return h; 169 170 /* Only write separator when we've written ASCII characters first. */ 171 if (h > 0) 172 if (*d < de) 173 *(*d)++ = '-'; 174 175 n = 128; 176 bias = 72; 177 delta = 0; 178 first = 1; 179 180 while (todo > 0) { 181 m = -1; 182 s = ss; 183 184 while (s < se) { 185 c = uv__utf8_decode1(&s, se); 186 assert(c != UINT_MAX); 187 188 if (c >= n) 189 if (c < m) 190 m = c; 191 } 192 193 x = m - n; 194 y = h + 1; 195 196 if (x > ~delta / y) 197 return UV_E2BIG; /* Overflow. */ 198 199 delta += x * y; 200 n = m; 201 202 s = ss; 203 while (s < se) { 204 c = uv__utf8_decode1(&s, se); 205 assert(c != UINT_MAX); 206 207 if (c < n) 208 if (++delta == 0) 209 return UV_E2BIG; /* Overflow. */ 210 211 if (c != n) 212 continue; 213 214 for (k = 36, q = delta; /* empty */; k += 36) { 215 t = 1; 216 217 if (k > bias) 218 t = k - bias; 219 220 if (t > 26) 221 t = 26; 222 223 if (q < t) 224 break; 225 226 /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore 227 * 10 <= y <= 35, we can optimize the long division 228 * into a table-based reciprocal multiplication. 229 */ 230 x = q - t; 231 y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */ 232 q = x / y; 233 t = t + x % y; /* 1 <= t <= 35 because of y. */ 234 235 if (*d < de) 236 *(*d)++ = alphabet[t]; 237 } 238 239 if (*d < de) 240 *(*d)++ = alphabet[q]; 241 242 delta /= 2; 243 244 if (first) { 245 delta /= 350; 246 first = 0; 247 } 248 249 /* No overflow check is needed because |delta| was just 250 * divided by 2 and |delta+delta >= delta + delta/h|. 251 */ 252 h++; 253 delta += delta / h; 254 255 for (bias = 0; delta > 35 * 26 / 2; bias += 36) 256 delta /= 35; 257 258 bias += 36 * delta / (delta + 38); 259 delta = 0; 260 todo--; 261 } 262 263 delta++; 264 n++; 265 } 266 267 return 0; 268 } 269 uv__idna_toascii(const char * s,const char * se,char * d,char * de)270 long uv__idna_toascii(const char* s, const char* se, char* d, char* de) { 271 const char* si; 272 const char* st; 273 unsigned c; 274 char* ds; 275 int rc; 276 277 ds = d; 278 279 si = s; 280 while (si < se) { 281 st = si; 282 c = uv__utf8_decode1(&si, se); 283 284 if (c == UINT_MAX) 285 return UV_EINVAL; 286 287 if (c != '.') 288 if (c != 0x3002) /* 。 */ 289 if (c != 0xFF0E) /* . */ 290 if (c != 0xFF61) /* 。 */ 291 continue; 292 293 rc = uv__idna_toascii_label(s, st, &d, de); 294 295 if (rc < 0) 296 return rc; 297 298 if (d < de) 299 *d++ = '.'; 300 301 s = si; 302 } 303 304 if (s < se) { 305 rc = uv__idna_toascii_label(s, se, &d, de); 306 307 if (rc < 0) 308 return rc; 309 } 310 311 if (d < de) 312 *d++ = '\0'; 313 314 return d - ds; /* Number of bytes written. */ 315 } 316