# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

import codecs
import re
import stringprep
from unicodedata import ucd_3_2_0 as unicodedata

# IDNA section 3.1
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
ace_prefix = b"xn--"
sace_prefix = "xn--"


# This assumes query strings, so AllowUnassigned is true
def nameprep(label):  # type: (str) -> str
    """Apply the Nameprep steps to a single label and return the result.

    The label is mapped (characters in stringprep table B.1 are dropped,
    the rest go through table B.2), normalized to NFKC with the Unicode
    3.2.0 database, checked against the prohibited tables and finally
    checked against the bidirectional requirements.

    Raises UnicodeEncodeError when a prohibited character is found or a
    bidi requirement is violated; the error positions index into the
    mapped and normalized label.
    """
    # Map
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = "".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for i, c in enumerate(label):
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}")

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        for i, x in enumerate(label):
            if stringprep.in_table_d2(x):
                raise UnicodeEncodeError("idna", label, i, i+1,
                                         "Violation of BIDI requirement 2")
        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not RandAL[0]:
            raise UnicodeEncodeError("idna", label, 0, 1,
                                     "Violation of BIDI requirement 3")
        if not RandAL[-1]:
            raise UnicodeEncodeError("idna", label, len(label)-1, len(label),
                                     "Violation of BIDI requirement 3")

    return label


def ToASCII(label):  # type: (str) -> bytes
    """Convert one label to its ASCII form and return it as bytes.

    A label that is already ASCII is returned as-is (encoded); otherwise
    it is passed through nameprep(), and if still non-ASCII it is
    punycode-encoded and prefixed with the ACE prefix.

    Raises UnicodeEncodeError when the label is empty, when the result is
    64 bytes or longer, when a non-ASCII label already starts with the ACE
    prefix, or when nameprep() rejects it.
    """
    try:
        # Step 1: try ASCII
        label_ascii = label.encode("ascii")
    except UnicodeEncodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label_ascii) < 64:
            return label_ascii
        if len(label) == 0:
            raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
        else:
            raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label_ascii = label.encode("ascii")
    except UnicodeEncodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label_ascii
        if len(label) == 0:
            raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
        else:
            raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

    # Step 5: Check ACE prefix
    if label.lower().startswith(sace_prefix):
        raise UnicodeEncodeError(
            "idna", label, 0, len(sace_prefix), "Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label_ascii = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label_ascii = ace_prefix + label_ascii

    # Step 8: Check size
    # do not check for empty as we prepend ace_prefix.
    if len(label_ascii) < 64:
        return label_ascii
    raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")


def ToUnicode(label):
    """Convert one label (str or bytes) to its Unicode form, returned as str.

    Labels without the ACE prefix are returned as their ASCII text.
    Labels with the prefix are punycode-decoded, and the decoded text must
    convert back through ToASCII() to the same (lower-cased) label.

    Raises UnicodeDecodeError when the label is longer than 1024 items,
    when punycode decoding fails or when the round-trip check fails, and
    UnicodeEncodeError when a non-ASCII str label is still non-ASCII after
    nameprep() (or is rejected by nameprep()/ToASCII()).
    """
    if len(label) > 1024:
        # Protection from https://github.com/python/cpython/issues/98433.
        # https://datatracker.ietf.org/doc/html/rfc5894#section-6
        # doesn't specify a label size limit prior to NAMEPREP. But having
        # one makes practical sense.
        # This leaves ample room for nameprep() to remove Nothing characters
        # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
        # preventing us from wasting time decoding a big thing that'll just
        # hit the actual <= 63 length limit in Step 6.
        if isinstance(label, str):
            label = label.encode("utf-8", errors="backslashreplace")
        raise UnicodeDecodeError("idna", label, 0, len(label), "label way too long")
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeEncodeError:
            pure_ascii = False
    if not pure_ascii:
        assert isinstance(label, str)
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeEncodeError as exc:
            raise UnicodeEncodeError("idna", label, exc.start, exc.end,
                                     "Invalid character in IDN label")
    # Step 3: Check for ACE prefix
    assert isinstance(label, bytes)
    if not label.lower().startswith(ace_prefix):
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    try:
        result = label1.decode("punycode")
    except UnicodeDecodeError as exc:
        # Report positions relative to the full label, prefix included.
        offset = len(ace_prefix)
        raise UnicodeDecodeError("idna", label, offset+exc.start, offset+exc.end, exc.reason)

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeDecodeError("idna", label, 0, len(label),
                                 f"IDNA does not round-trip, '{label!r}' != '{label2!r}'")

    # Step 8: return the result of step 5
    return result


### Codec APIs

class Codec(codecs.Codec):
    """Stateless IDNA encoder/decoder operating on whole domain names."""

    def encode(self, input, errors='strict'):
        """Encode a dotted name (str) and return ``(bytes, len(input))``.

        Only ``errors='strict'`` is accepted; anything else raises
        UnicodeError.  Raises UnicodeEncodeError, with positions relative
        to *input*, when a label is empty (other than a trailing one), too
        long, or rejected by ToASCII().
        """

        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            # The last label may be empty: that is a trailing dot.
            for i, label in enumerate(labels[:-1]):
                if len(label) == 0:
                    offset = sum(len(l) for l in labels[:i]) + i
                    raise UnicodeEncodeError("idna", input, offset, offset+1,
                                             "label empty")
            for i, label in enumerate(labels):
                if len(label) >= 64:
                    offset = sum(len(l) for l in labels[:i]) + i
                    raise UnicodeEncodeError("idna", input, offset, offset+len(label),
                                             "label too long")
            return result, len(input)

        result = bytearray()
        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for i, label in enumerate(labels):
            if result:
                # Join with U+002E
                result.extend(b'.')
            try:
                result.extend(ToASCII(label))
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                # Shift the label-relative positions to input-relative ones:
                # preceding label lengths plus one separator per label.
                offset = sum(len(l) for l in labels[:i]) + i
                raise UnicodeEncodeError(
                    "idna",
                    input,
                    offset + exc.start,
                    offset + exc.end,
                    exc.reason,
                )
        return bytes(result+trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        """Decode a dotted ASCII name and return ``(str, len(input))``.

        Only ``errors='strict'`` is accepted; anything else raises
        UnicodeError.  Raises UnicodeDecodeError, with positions relative
        to *input*, when a label cannot be converted by ToUnicode().
        """

        if errors != 'strict':
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            input = bytes(input)

        if ace_prefix not in input.lower():
            # Fast path
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = []
        for i, label in enumerate(labels):
            try:
                u_label = ToUnicode(label)
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                # Shift the label-relative positions to input-relative ones.
                offset = sum(len(x) for x in labels[:i]) + len(labels[:i])
                raise UnicodeDecodeError(
                    "idna", input, offset+exc.start, offset+exc.end, exc.reason)
            else:
                result.append(u_label)

        return ".".join(result)+trailing_dot, len(input)


class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels in *input*.

        Returns ``(bytes, consumed)`` where *consumed* is the number of
        input characters that were encoded.  Unless *final* is true, the
        last label is held back because it may still be incomplete.
        Raises UnicodeError for any *errors* value other than 'strict' and
        UnicodeEncodeError when ToASCII() rejects a label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return (b'', 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        result = bytearray()
        size = 0
        for label in labels:
            if size:
                # Join with U+002E
                result.extend(b'.')
                size += 1
            try:
                result.extend(ToASCII(label))
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeEncodeError(
                    "idna",
                    input,
                    size + exc.start,
                    size + exc.end,
                    exc.reason,
                )
            size += len(label)

        result += trailing_dot
        size += len(trailing_dot)
        return (bytes(result), size)


class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels in *input*.

        Returns ``(str, consumed)`` where *consumed* is the number of
        input items that were decoded.  Unless *final* is true, the last
        label is held back because it may still be incomplete.  Raises
        UnicodeError for any *errors* value other than 'strict' and
        UnicodeDecodeError when the input is not ASCII or ToUnicode()
        rejects a label.
        """
        if errors != 'strict':
            # Must be an f-string so the offending handler name is reported,
            # matching the message of the other codec entry points.
            raise UnicodeError(f"Unsupported error handling: {errors}")

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            try:
                input = str(input, "ascii")
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                raise UnicodeDecodeError("idna", input,
                                         exc.start, exc.end, exc.reason)
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            try:
                u_label = ToUnicode(label)
            except (UnicodeEncodeError, UnicodeDecodeError) as exc:
                # UnicodeDecodeError needs a bytes object; input is str here.
                raise UnicodeDecodeError(
                    "idna",
                    input.encode("ascii", errors="backslashreplace"),
                    size + exc.start,
                    size + exc.end,
                    exc.reason,
                )
            else:
                result.append(u_label)
            if size:
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)


class StreamWriter(Codec,codecs.StreamWriter):
    pass


class StreamReader(Codec,codecs.StreamReader):
    pass


### encodings module API

def getregentry():
    """Return the codecs.CodecInfo that describes the 'idna' codec."""
    return codecs.CodecInfo(
        name='idna',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )