# This module implements RFC 3490 (IDNA) and RFC 3491 (Nameprep).

import codecs
import re
import stringprep
from unicodedata import ucd_3_2_0 as unicodedata

# IDNA section 3.1: the code points that are recognised as label
# separators -- U+002E, U+3002, U+FF0E and U+FF61.
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, once as bytes (for labels that are
# already encoded) and once as str (for labels that are still text).
ace_prefix = b"xn--"
sace_prefix = "xn--"

# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Run the Nameprep steps (map, normalize, prohibit, bidi) on *label*.

    *label* is a str.  The prepared str is returned.

    Raises UnicodeError if, after mapping and normalization, the label
    contains a character from one of the checked stringprep "C" tables,
    or if it breaks bidi requirement 2 or 3.
    """
    # Map: characters in table B.1 are mapped to nothing; everything
    # else goes through the B.2 mapping.
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    prohibited_tables = (
        stringprep.in_table_c12,
        stringprep.in_table_c22,
        stringprep.in_table_c3,
        stringprep.in_table_c4,
        stringprep.in_table_c5,
        stringprep.in_table_c6,
        stringprep.in_table_c7,
        stringprep.in_table_c8,
        stringprep.in_table_c9,
    )
    for c in label:
        if any(in_table(c) for in_table in prohibited_tables):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    rand_al = [stringprep.in_table_d1(c) for c in label]
    if any(rand_al):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")
        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not (rand_al[0] and rand_al[-1]):
            raise UnicodeError("Violation of BIDI requirement 3")

    return label

def ToASCII(label):
    """Convert a single str label to its ASCII form, returned as bytes.

    A label that is already ASCII is returned encoded as-is.  Otherwise
    it is passed through nameprep(); if it is still not ASCII it is
    punycode-encoded and given the ACE prefix.

    Raises UnicodeError if the resulting label is empty or 64 or more
    bytes long, if the prepared non-ASCII label already starts with the
    ACE prefix, or if nameprep() rejects the label.
    """
    ascii_label = None

    # Step 1: try ASCII
    try:
        ascii_label = label.encode("ascii")
    except UnicodeError:
        pass

    if ascii_label is None:
        # Step 2: nameprep
        label = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII
        try:
            ascii_label = label.encode("ascii")
        except UnicodeError:
            pass

    if ascii_label is None:
        # Step 5: Check ACE prefix
        if label.startswith(sace_prefix):
            raise UnicodeError("Label starts with ACE prefix")

        # Step 6: Encode with PUNYCODE
        # Step 7: Prepend ACE prefix
        ascii_label = ace_prefix + label.encode("punycode")

    # Step 8: Check size (every successful path ends here)
    if 0 < len(ascii_label) < 64:
        return ascii_label
    raise UnicodeError("label empty or too long")

def ToUnicode(label):
    """Convert a single label (bytes or str) to its Unicode form.

    A label without the ACE prefix is returned as a str unchanged
    (after nameprep, if it was a non-ASCII str).  A label with the ACE
    prefix is punycode-decoded, and the decoded text is returned only
    if ToASCII() maps it back to the original label.

    Raises UnicodeError if a non-ASCII str label is still not ASCII
    after nameprep, if an ACE label is too long, if punycode decoding
    fails, or if the result does not round-trip through ToASCII().
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix.  The comparison is case-insensitive,
    # so "XN--..." is recognised as an ACE label as well.
    if not label.lower().startswith(ace_prefix):
        return str(label, "ascii")

    # ToASCII() only ever returns labels shorter than 64 bytes, so an
    # ACE label of 64 bytes or more can never pass the comparison in
    # step 7.  Reject it here instead of punycode-decoding it first.
    if len(label) >= 64:
        raise UnicodeError("label too long")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result

### Codec APIs

class Codec(codecs.Codec):
    """Stateless codec that converts whole domain names, label by label."""

    def encode(self, input, errors='strict'):
        """Encode the str domain name *input*.

        Returns ``(encoded_bytes, len(input))``.  Only ``errors='strict'``
        is accepted.  Raises UnicodeError for any other error handler,
        and for a label that is empty (other than a final empty label,
        i.e. a trailing dot) or that is 64 or more bytes long.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path -- only the label lengths need
            # checking; the last label may be empty (trailing dot).
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        labels = dots.split(input)
        trailing_dot = b''
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]

        # Join with U+002E.  ToASCII() never returns an empty label, so
        # a plain join produces exactly one dot between labels.
        encoded = b'.'.join(ToASCII(label) for label in labels)
        return encoded + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode the domain name *input* (bytes-like or str).

        Returns ``(decoded_str, len(input))``.  Only ``errors='strict'``
        is accepted; any other error handler raises UnicodeError, as
        does a label that ToUnicode() rejects.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            # bytes(str) raises TypeError, so text input is split on
            # the IDNA separators and each label handed to ToUnicode(),
            # which accepts str labels.
            labels = dots.split(input)
        else:
            if not isinstance(input, bytes):
                input = bytes(input)

            # The ACE prefix is matched case-insensitively.
            if ace_prefix not in input.lower():
                # Fast path
                try:
                    return input.decode('ascii'), len(input)
                except UnicodeDecodeError:
                    pass

            labels = input.split(b".")

        trailing_dot = ''
        if labels and not labels[-1]:
            trailing_dot = '.'
            del labels[-1]

        decoded = ".".join(ToUnicode(label) for label in labels)
        return decoded + trailing_dot, len(input)

class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental encoder that only emits labels known to be complete."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels found in the str *input*.

        Returns ``(encoded_bytes, consumed)`` where *consumed* is the
        number of characters of *input* that were encoded.  Unless
        *final* is true, a last label that is not followed by a
        separator is left unconsumed.  Raises UnicodeError if *errors*
        is not ``'strict'`` or if ToASCII() rejects a label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                # Input ends with a separator: every label is complete.
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    # The separator in front of the dropped label is
                    # consumed together with the labels before it.
                    trailing_dot = b'.'

        result = bytearray()
        size = 0
        for label in labels:
            if size:
                # Join with U+002E
                result.extend(b'.')
                size += 1
            result.extend(ToASCII(label))
            size += len(label)

        result += trailing_dot
        size += len(trailing_dot)
        return (bytes(result), size)

class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that only emits labels known to be complete."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels found in *input*.

        Returns ``(decoded_str, consumed)`` where *consumed* is the
        number of input units that were decoded.  Unless *final* is
        true, a last label that is not followed by a separator is left
        unconsumed.  Raises UnicodeError if *errors* is not ``'strict'``
        or if a label cannot be decoded.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                # Input ends with a separator: every label is complete.
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    # The separator in front of the dropped label is
                    # consumed together with the labels before it.
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToUnicode(label))
            if size:
                # Account for the separator between two labels.
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)

class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer that takes encode() from Codec."""

class StreamReader(Codec,codecs.StreamReader):
    """Stream reader that takes decode() from Codec."""

### encodings module API

def getregentry():
    """Return the codecs.CodecInfo that describes the 'idna' codec."""
    return codecs.CodecInfo(
        name='idna',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
