• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
2   *
3   * Permission to use, copy, modify, and/or distribute this software for any
4   * purpose with or without fee is hereby granted, provided that the above
5   * copyright notice and this permission notice appear in all copies.
6   *
7   * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8   * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9   * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10   * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14   */
15  
16  /* Derived from https://github.com/bnoordhuis/punycode
17   * but updated to support IDNA 2008.
18   */
19  
20  #include "uv.h"
21  #include "idna.h"
22  #include <assert.h>
23  #include <string.h>
24  #include <limits.h> /* UINT_MAX */
25  
/* Decode the remainder of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte, which the caller has already consumed; |*p| points
 * at the byte following it and |pe| is one past the end of the input.
 *
 * Returns the decoded code point, or (unsigned) -1 when the sequence is
 * malformed: an out-of-range lead byte, a stray continuation byte, input
 * that ends before the sequence is complete, a trailing byte that is not
 * of the form 10xxxxxx, an overlong encoding, a surrogate, or a value
 * above U+10FFFF.
 *
 * On success |*p| is advanced past the sequence.  On failure |*p| stops
 * no later than just past the first offending byte and never moves
 * beyond |pe|.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned min;  /* Smallest code point that needs this sequence length. */
  unsigned c;
  int n;         /* Number of continuation bytes still expected. */

  if (a > 0xF7)
    return -1;  /* Not a valid lead byte. */

  /* The lead byte alone decides the sequence length.  It must never be
   * reinterpreted as a shorter sequence just because the input is short.
   */
  if (a > 0xEF) {
    n = 3;
    min = 0x10000;
    a &= 7;
  } else if (a > 0xDF) {
    n = 2;
    min = 0x800;
    a &= 15;
  } else if (a > 0xBF) {
    n = 1;
    min = 0x80;
    a &= 31;
  } else {
    return -1;  /* Stray continuation byte. */
  }

  if (pe - *p < n)
    return -1;  /* Truncated sequence. */

  for (; n > 0; n--) {
    c = (unsigned char) *(*p)++;

    /* Check every trailing byte on its own; combining them first would
     * let pairs of invalid bytes cancel each other out.
     */
    if ((c & 0xC0) != 0x80)
      return -1;  /* Invalid continuation byte. */

    a = (a << 6) | (c & 63);
  }

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
91  
/* Decode a single UTF-8 code point starting at |*p| and advance |*p|.
 *
 * The caller must guarantee that at least one byte is available, i.e.
 * |*p < pe|.  ASCII bytes are returned directly; anything else is handed
 * to uv__utf8_decode1_slow(), whose result (the code point, or
 * (unsigned) -1 for malformed input) is returned unchanged.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  unsigned lead;

  assert(*p < pe);

  lead = (unsigned char) **p;
  *p += 1;

  if (lead >= 128)
    return uv__utf8_decode1_slow(p, pe, lead);  /* Multi-byte sequence. */

  return lead;  /* Plain ASCII, the common case. */
}
104  
uv__idna_toascii_label(const char * s,const char * se,char ** d,char * de)105  static int uv__idna_toascii_label(const char* s, const char* se,
106                                    char** d, char* de) {
107    static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
108    const char* ss;
109    unsigned c;
110    unsigned h;
111    unsigned k;
112    unsigned n;
113    unsigned m;
114    unsigned q;
115    unsigned t;
116    unsigned x;
117    unsigned y;
118    unsigned bias;
119    unsigned delta;
120    unsigned todo;
121    int first;
122  
123    h = 0;
124    ss = s;
125    todo = 0;
126  
127    /* Note: after this loop we've visited all UTF-8 characters and know
128     * they're legal so we no longer need to check for decode errors.
129     */
130    while (s < se) {
131      c = uv__utf8_decode1(&s, se);
132  
133      if (c == UINT_MAX)
134        return UV_EINVAL;
135  
136      if (c < 128)
137        h++;
138      else
139        todo++;
140    }
141  
142    /* Only write "xn--" when there are non-ASCII characters. */
143    if (todo > 0) {
144      if (*d < de) *(*d)++ = 'x';
145      if (*d < de) *(*d)++ = 'n';
146      if (*d < de) *(*d)++ = '-';
147      if (*d < de) *(*d)++ = '-';
148    }
149  
150    /* Write ASCII characters. */
151    x = 0;
152    s = ss;
153    while (s < se) {
154      c = uv__utf8_decode1(&s, se);
155      assert(c != UINT_MAX);
156  
157      if (c > 127)
158        continue;
159  
160      if (*d < de)
161        *(*d)++ = c;
162  
163      if (++x == h)
164        break;  /* Visited all ASCII characters. */
165    }
166  
167    if (todo == 0)
168      return h;
169  
170    /* Only write separator when we've written ASCII characters first. */
171    if (h > 0)
172      if (*d < de)
173        *(*d)++ = '-';
174  
175    n = 128;
176    bias = 72;
177    delta = 0;
178    first = 1;
179  
180    while (todo > 0) {
181      m = -1;
182      s = ss;
183  
184      while (s < se) {
185        c = uv__utf8_decode1(&s, se);
186        assert(c != UINT_MAX);
187  
188        if (c >= n)
189          if (c < m)
190            m = c;
191      }
192  
193      x = m - n;
194      y = h + 1;
195  
196      if (x > ~delta / y)
197        return UV_E2BIG;  /* Overflow. */
198  
199      delta += x * y;
200      n = m;
201  
202      s = ss;
203      while (s < se) {
204        c = uv__utf8_decode1(&s, se);
205        assert(c != UINT_MAX);
206  
207        if (c < n)
208          if (++delta == 0)
209            return UV_E2BIG;  /* Overflow. */
210  
211        if (c != n)
212          continue;
213  
214        for (k = 36, q = delta; /* empty */; k += 36) {
215          t = 1;
216  
217          if (k > bias)
218            t = k - bias;
219  
220          if (t > 26)
221            t = 26;
222  
223          if (q < t)
224            break;
225  
226          /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
227           * 10 <= y <= 35, we can optimize the long division
228           * into a table-based reciprocal multiplication.
229           */
230          x = q - t;
231          y = 36 - t;  /* 10 <= y <= 35 since 1 <= t <= 26. */
232          q = x / y;
233          t = t + x % y;  /* 1 <= t <= 35 because of y. */
234  
235          if (*d < de)
236            *(*d)++ = alphabet[t];
237        }
238  
239        if (*d < de)
240          *(*d)++ = alphabet[q];
241  
242        delta /= 2;
243  
244        if (first) {
245          delta /= 350;
246          first = 0;
247        }
248  
249        /* No overflow check is needed because |delta| was just
250         * divided by 2 and |delta+delta >= delta + delta/h|.
251         */
252        h++;
253        delta += delta / h;
254  
255        for (bias = 0; delta > 35 * 26 / 2; bias += 36)
256          delta /= 35;
257  
258        bias += 36 * delta / (delta + 38);
259        delta = 0;
260        todo--;
261      }
262  
263      delta++;
264      n++;
265    }
266  
267    return 0;
268  }
269  
uv__idna_toascii(const char * s,const char * se,char * d,char * de)270  long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
271    const char* si;
272    const char* st;
273    unsigned c;
274    char* ds;
275    int rc;
276  
277    ds = d;
278  
279    si = s;
280    while (si < se) {
281      st = si;
282      c = uv__utf8_decode1(&si, se);
283  
284      if (c == UINT_MAX)
285        return UV_EINVAL;
286  
287      if (c != '.')
288        if (c != 0x3002)  /* 。 */
289          if (c != 0xFF0E)  /* . */
290            if (c != 0xFF61)  /* 。 */
291              continue;
292  
293      rc = uv__idna_toascii_label(s, st, &d, de);
294  
295      if (rc < 0)
296        return rc;
297  
298      if (d < de)
299        *d++ = '.';
300  
301      s = si;
302    }
303  
304    if (s < se) {
305      rc = uv__idna_toascii_label(s, se, &d, de);
306  
307      if (rc < 0)
308        return rc;
309    }
310  
311    if (d < de)
312      *d++ = '\0';
313  
314    return d - ds;  /* Number of bytes written. */
315  }
316