• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
2
3import stringprep, re, codecs
4from unicodedata import ucd_3_2_0 as unicodedata
5
# IDNA section 3.1: any of these four code points separates labels
# (U+002E, U+3002, U+FF0E and U+FF61).
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, once as bytes (for encoded labels) and
# once as text (for labels that are still str).
ace_prefix = b"xn--"
sace_prefix = ace_prefix.decode("ascii")
12
13# This assumes query strings, so AllowUnassigned is true
def nameprep(label):  # type: (str) -> str
    """Run the Nameprep steps (map, normalize, prohibit, bidi) on *label*.

    Returns the prepared label.  Raises UnicodeEncodeError (codec name
    "idna") pointing at the offending character when the label contains a
    prohibited character or violates one of the bidi requirements.
    """
    # Map: characters from table B.1 are dropped ("mapped to nothing"),
    # everything else goes through the table B.2 mapping.
    mapped = "".join(
        stringprep.map_table_b2(ch)
        for ch in label
        if not stringprep.in_table_b1(ch)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", mapped)

    # Prohibit: the tables are consulted in the listed order and the first
    # offending character aborts the whole operation.
    prohibited_tables = (
        stringprep.in_table_c12,
        stringprep.in_table_c22,
        stringprep.in_table_c3,
        stringprep.in_table_c4,
        stringprep.in_table_c5,
        stringprep.in_table_c6,
        stringprep.in_table_c7,
        stringprep.in_table_c8,
        stringprep.in_table_c9,
    )
    for pos, ch in enumerate(label):
        if any(in_table(ch) for in_table in prohibited_tables):
            raise UnicodeEncodeError("idna", label, pos, pos+1,
                                     f"Invalid character {ch!r}")

    # Check bidi: nothing more to do unless a RandALCat character is present.
    is_randal = [stringprep.in_table_d1(ch) for ch in label]
    if not any(is_randal):
        return label

    # Requirement 1 (prohibit the characters of section 5.8) is table C.8,
    # which the prohibit step above has already enforced.

    # Requirement 2: a string containing a RandALCat character MUST NOT
    # contain any LCat character.
    for pos, ch in enumerate(label):
        if stringprep.in_table_d2(ch):
            raise UnicodeEncodeError("idna", label, pos, pos+1,
                                     "Violation of BIDI requirement 2")

    # Requirement 3: a RandALCat character MUST be both the first and the
    # last character of the string.
    if not is_randal[0]:
        raise UnicodeEncodeError("idna", label, 0, 1,
                                 "Violation of BIDI requirement 3")
    if not is_randal[-1]:
        raise UnicodeEncodeError("idna", label, len(label)-1, len(label),
                                 "Violation of BIDI requirement 3")

    return label
65
def ToASCII(label):  # type: (str) -> bytes
    """Convert a single label to its ASCII (ACE) form as bytes.

    Pure-ASCII labels are returned unchanged (UseSTD3ASCIIRules is false, so
    no further character checks apply).  Other labels are nameprepped and,
    if still not ASCII, punycode-encoded behind the ACE prefix.

    Raises UnicodeEncodeError (codec name "idna") when the label is empty,
    when its encoded form is 64 bytes or longer, or when a non-ASCII label
    already starts with the ACE prefix.
    """
    def size_checked(text, encoded):
        # Step 8 for labels that were ASCII without punycode: the encoded
        # form must be 1..63 bytes long.
        if 0 < len(encoded) < 64:
            return encoded
        if len(text) == 0:
            raise UnicodeEncodeError("idna", text, 0, 1, "label empty")
        raise UnicodeEncodeError("idna", text, 0, len(text), "label too long")

    # Step 1: try ASCII.  On success skip to step 3 (a no-op because
    # UseSTD3ASCIIRules is false) and from there straight to step 8.
    try:
        encoded = label.encode("ascii")
    except UnicodeEncodeError:
        encoded = None
    if encoded is not None:
        return size_checked(label, encoded)

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII again; on success skip to step 8.
    try:
        encoded = label.encode("ascii")
    except UnicodeEncodeError:
        encoded = None
    if encoded is not None:
        return size_checked(label, encoded)

    # Step 5: Check ACE prefix
    if label.lower().startswith(sace_prefix):
        raise UnicodeEncodeError(
            "idna", label, 0, len(sace_prefix), "Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    encoded = ace_prefix + label.encode("punycode")

    # Step 8: Check size
    # do not check for empty as we prepend ace_prefix.
    if len(encoded) >= 64:
        raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")
    return encoded
116
def ToUnicode(label):
    """Convert a single label (str or bytes) to its Unicode form.

    Labels without the ACE prefix are returned as their ASCII text.  Labels
    with the prefix are punycode-decoded and accepted only if running
    ToASCII on the result reproduces the (lower-cased) input.

    Raises UnicodeDecodeError for labels longer than 1024 items, for
    invalid punycode and for labels that do not round-trip; raises
    UnicodeEncodeError when a non-ASCII label is still non-ASCII after
    nameprep.
    """
    if len(label) > 1024:
        # Protection from https://github.com/python/cpython/issues/98433.
        # https://datatracker.ietf.org/doc/html/rfc5894#section-6
        # doesn't specify a label size limit prior to NAMEPREP. But having
        # one makes practical sense.
        # This leaves ample room for nameprep() to remove Nothing characters
        # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
        # preventing us from wasting time decoding a big thing that'll just
        # hit the actual <= 63 length limit in Step 6.
        oversized = label
        if isinstance(oversized, str):
            # UnicodeDecodeError needs a bytes object to report on.
            oversized = oversized.encode("utf-8", errors="backslashreplace")
        raise UnicodeDecodeError("idna", oversized, 0, len(oversized),
                                 "label way too long")

    # Step 1: Check for ASCII.  bytes input is taken to be ASCII already.
    needs_nameprep = False
    if not isinstance(label, bytes):
        try:
            label = label.encode("ascii")
        except UnicodeEncodeError:
            needs_nameprep = True

    if needs_nameprep:
        assert isinstance(label, str)
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeEncodeError as exc:
            raise UnicodeEncodeError("idna", label, exc.start, exc.end,
                                     "Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    assert isinstance(label, bytes)
    if not label.lower().startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    # Step 5: Decode using PUNYCODE
    prefix_len = len(ace_prefix)
    try:
        decoded = label[prefix_len:].decode("punycode")
    except UnicodeDecodeError as exc:
        # Shift the reported positions so they refer to the full label.
        raise UnicodeDecodeError("idna", label, prefix_len+exc.start,
                                 prefix_len+exc.end, exc.reason)

    # Step 6: Apply ToASCII
    roundtrip = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3
    # roundtrip will already be in lower case.
    if str(label, "ascii").lower() != str(roundtrip, "ascii"):
        raise UnicodeDecodeError("idna", label, 0, len(label),
                                 f"IDNA does not round-trip, '{label!r}' != '{roundtrip!r}'")

    # Step 8: return the result of step 5
    return decoded
175
176### Codec APIs
177
class Codec(codecs.Codec):
    """Stateless IDNA codec working on whole, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode the name *input* label by label.

        Returns ``(encoded_bytes, len(input))``.  Only ``errors='strict'``
        is accepted.  Label failures are re-raised as UnicodeEncodeError
        with positions relative to *input*.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return b'', 0

        try:
            ascii_name = input.encode('ascii')
        except UnicodeEncodeError:
            ascii_name = None

        if ascii_name is not None:
            # ASCII name: fast path, only the label lengths need checking.
            parts = ascii_name.split(b'.')
            # Offset of each label within the input (labels + separators).
            starts = []
            position = 0
            for part in parts:
                starts.append(position)
                position += len(part) + 1
            # An empty label is only tolerated in last position (trailing
            # dot); zip() stops before the last start for that reason.
            for start, part in zip(starts, parts[:-1]):
                if not part:
                    raise UnicodeEncodeError("idna", input, start, start+1,
                                             "label empty")
            for start, part in zip(starts, parts):
                if len(part) >= 64:
                    raise UnicodeEncodeError("idna", input, start,
                                             start+len(part),
                                             "label too long")
            return ascii_name, len(input)

        names = dots.split(input)
        if names and not names[-1]:
            # Remember the trailing dot and encode the real labels only.
            names.pop()
            tail = b'.'
        else:
            tail = b''

        encoded = bytearray()
        offset = 0  # position of the current label within input
        for name in names:
            if encoded:
                # Join with U+002E
                encoded.extend(b'.')
            try:
                encoded.extend(ToASCII(name))
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeEncodeError(
                    "idna",
                    input,
                    offset + exc.start,
                    offset + exc.end,
                    exc.reason,
                )
            offset += len(name) + 1
        return bytes(encoded+tail), len(input)

    def decode(self, input, errors='strict'):
        """Decode the IDNA-encoded name *input* label by label.

        Returns ``(text, len(input))``.  Only ``errors='strict'`` is
        accepted.  Label failures are re-raised as UnicodeDecodeError with
        positions relative to *input*.
        """
        if errors != 'strict':
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            input = bytes(input)

        if ace_prefix not in input.lower():
            # Fast path: no ACE label anywhere, plain ASCII decoding will do
            # unless the data turns out not to be ASCII.
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        pieces = input.split(b".")
        if pieces and not pieces[-1]:
            pieces.pop()
            tail = '.'
        else:
            tail = ''

        decoded = []
        offset = 0  # position of the current label within input
        for piece in pieces:
            try:
                decoded.append(ToUnicode(piece))
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeDecodeError(
                    "idna", input, offset+exc.start, offset+exc.end, exc.reason)
            offset += len(piece) + 1

        return ".".join(decoded)+tail, len(input)
271
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        Returns ``(encoded_bytes, consumed)`` where *consumed* is the number
        of input characters that were used up.  Unless *final* is true, the
        last label is left unconsumed because more of it may still arrive.
        Only ``errors='strict'`` is accepted.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return (b'', 0)

        pieces = dots.split(input)
        suffix = b''
        if pieces and (not pieces[-1] or not final):
            # Either the input ends with a dot (empty last piece), or the
            # last label may be unfinished and is kept until the next call.
            held_back = pieces.pop()
            if pieces or not held_back:
                suffix = b'.'

        encoded = bytearray()
        consumed = 0
        for piece in pieces:
            if consumed:
                # Join with U+002E
                encoded.extend(b'.')
                consumed += 1
            try:
                encoded.extend(ToASCII(piece))
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeEncodeError(
                    "idna",
                    input,
                    consumed + exc.start,
                    consumed + exc.end,
                    exc.reason,
                )
            consumed += len(piece)

        encoded += suffix
        consumed += len(suffix)
        return (bytes(encoded), consumed)
315
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        *input* may be str or an ASCII-only bytes-like object.  Returns
        ``(text, consumed)`` where *consumed* is the number of input items
        that were used up.  Unless *final* is true, the last label is left
        unconsumed because more of it may still arrive.

        Raises UnicodeError for any ``errors`` value other than 'strict',
        and UnicodeDecodeError (codec name "idna") for non-ASCII bytes or
        for a label that cannot be converted; reported positions are
        relative to the whole input.
        """
        if errors != 'strict':
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            try:
                input = str(input, "ascii")
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeDecodeError("idna", input,
                                         exc.start, exc.end, exc.reason)
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for index, label in enumerate(labels):
            if index:
                # Count the separator in front of this label *before*
                # decoding it, so that error positions point into the label
                # itself.  Keying on the index (not on size) also counts the
                # separator that follows an empty first label.
                size += 1
            try:
                u_label = ToUnicode(label)
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeDecodeError(
                    "idna",
                    # input is str at this point; the exception needs bytes.
                    input.encode("ascii", errors="backslashreplace"),
                    size + exc.start,
                    size + exc.end,
                    exc.reason,
                )
            result.append(u_label)
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
369
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer built from Codec and codecs.StreamWriter; adds nothing of its own."""
372
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader built from Codec and codecs.StreamReader; adds nothing of its own."""
375
376### encodings module API
377
def getregentry():
    """Return the codecs.CodecInfo record describing the 'idna' codec."""
    # The stateless entry points are bound methods of two separate Codec
    # instances.
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
388