# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

import stringprep, re, codecs
from unicodedata import ucd_3_2_0 as unicodedata

# IDNA section 3.1: all of these code points are treated as label separators
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, as bytes and as str
ace_prefix = b"xn--"
sace_prefix = "xn--"


# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    The label is mapped (table B.1 characters dropped, table B.2 case
    folding), NFKC-normalized with the Unicode 3.2 database, checked for
    prohibited characters and finally checked against the bidi rules.

    Returns the prepared label as a str.
    Raises UnicodeError for a prohibited character or a bidi violation.
    """
    # Map
    label = "".join(
        stringprep.map_table_b2(c)
        for c in label
        if not stringprep.in_table_b1(c)  # B.1: map to nothing
    )

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    # The tests below do not depend on *which* RandAL character was found,
    # so run them once for the whole label instead of once per RandAL
    # character (which made this step quadratic in the label length).
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label


def ToASCII(label):
    """Convert one label (str) to its ASCII form (bytes), RFC 3490 4.1.

    Pure-ASCII labels are returned unchanged (as bytes); other labels are
    nameprepped, Punycode-encoded and prefixed with the ACE prefix.

    Raises UnicodeError if the resulting label is empty or longer than
    63 bytes, if nameprep rejects it, or if the prepared non-ASCII label
    already starts with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix
    if label.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")


def ToUnicode(label):
    """Convert one label (bytes or str) to its Unicode form, RFC 3490 4.2.

    Labels that do not carry the ACE prefix are returned as an ASCII str.
    ACE labels are Punycode-decoded and then verified by round-tripping
    through ToASCII.

    Raises UnicodeError if the label is not ASCII after nameprep or if
    the decoded label does not round-trip.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix
    # RFC 3490 section 5 requires the prefix to be recognized
    # case-insensitively, so b"XN--..." is an ACE label as well.
    if label[:len(ace_prefix)].lower() != ace_prefix:
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result


### Codec APIs

class Codec(codecs.Codec):
    """Stateless IDNA codec operating on whole (dotted) domain names."""

    def encode(self, input, errors='strict'):
        """Encode a domain name (str) to ASCII bytes.

        Returns a ``(bytes, consumed_length)`` tuple.  Only the 'strict'
        error handler is supported; anything else raises UnicodeError.
        """

        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            # The last label may be empty (trailing dot), but not too long
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        result = bytearray()
        labels = dots.split(input)
        if labels and not labels[-1]:
            # Preserve a trailing separator as a single U+002E
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
            if result:
                # Join with U+002E
                result.extend(b'.')
            result.extend(ToASCII(label))
        return bytes(result+trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        """Decode an ASCII domain name (bytes-like) to str.

        Returns a ``(str, consumed_length)`` tuple.  Only the 'strict'
        error handler is supported; anything else raises UnicodeError.
        """

        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            # (works for bytes-like objects; a str argument raises TypeError)
            input = bytes(input)

        # The ACE prefix is case-insensitive (RFC 3490 section 5), so the
        # fast path may only be taken if no casing of it is present.
        if ace_prefix not in input.lower():
            # Fast path
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return ".".join(result)+trailing_dot, len(input)


class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder; encodes only complete labels per call."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels in *input*.

        Returns ``(bytes, consumed)`` where *consumed* counts the input
        characters that were encoded.  Unless *final* is true, a last
        label not yet terminated by a separator is left unconsumed.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = b'.'

        result = bytearray()
        size = 0
        for label in labels:
            if size:
                # Join with U+002E
                result.extend(b'.')
                size += 1
            result.extend(ToASCII(label))
            # Count consumed *input* characters, not output bytes
            size += len(label)

        result += trailing_dot
        size += len(trailing_dot)
        return (bytes(result), size)


class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder; decodes only complete labels per call."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels in *input*.

        Returns ``(str, consumed)`` where *consumed* counts the input
        items that were decoded.  Unless *final* is true, a last label
        not yet terminated by a separator is left unconsumed.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToUnicode(label))
            if size:
                # Account for the separator preceding this label
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)


class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer that uses Codec.encode."""
    pass


class StreamReader(Codec,codecs.StreamReader):
    """Stream reader that uses Codec.decode."""
    pass


### encodings module API

def getregentry():
    """Return the CodecInfo entry describing the 'idna' codec."""
    return codecs.CodecInfo(
        name='idna',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )