# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

import stringprep, re, codecs
from unicodedata import ucd_3_2_0 as unicodedata

# IDNA section 3.1
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
ace_prefix = b"xn--"
sace_prefix = "xn--"

# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to *label* and return it.

    The label is mapped (table B.1 characters are dropped, the table B.2
    mapping is applied), normalized with NFKC, and then checked against
    the prohibited-character tables and the bidi requirements.

    Raises UnicodeError if the label contains a prohibited character or
    violates one of the bidi requirements.
    """
    # Map
    newlabel = []
    for c in label:
        if stringprep.in_table_b1(c):
            # Map to nothing
            continue
        newlabel.append(stringprep.map_table_b2(c))
    label = "".join(newlabel)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(x) for x in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(x) for x in label):
            raise UnicodeError("Violation of BIDI requirement 2")
        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label

def ToASCII(label):
    """Convert a single str *label* to its ASCII (ACE) form as bytes.

    A label that is already ASCII is returned encoded as-is; otherwise it
    is nameprepped and, if still not ASCII, punycode-encoded with the ACE
    prefix prepended.

    Raises UnicodeError if the resulting label is empty or 64 or more
    bytes long, if nameprep rejects it, or if the nameprepped non-ASCII
    label already starts with the ACE prefix.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if 0 < len(label) < 64:
            return label
        raise UnicodeError("label empty or too long")

    # Step 5: Check ACE prefix
    if label.startswith(sace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label = ace_prefix + label

    # Step 8: Check size
    if 0 < len(label) < 64:
        return label
    raise UnicodeError("label empty or too long")

def ToUnicode(label):
    """Convert a single *label* (bytes or str) to its Unicode form.

    A label without the ACE prefix is returned decoded as ASCII. A label
    with the ACE prefix (matched case-insensitively) is punycode-decoded,
    and the result is checked to round-trip through ToASCII.

    Raises UnicodeError if the label is not ASCII after nameprep, if the
    punycode payload cannot be decoded, or if the round-trip check fails.
    """
    # Step 1: Check for ASCII
    if isinstance(label, bytes):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")
    # Step 3: Check for ACE prefix
    # The prefix is "xn--" in any capitalization (IDNA section 5), so
    # compare case-insensitively.
    if label[:len(ace_prefix)].lower() != ace_prefix:
        return str(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if str(label, "ascii").lower() != str(label2, "ascii"):
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result

### Codec APIs

class Codec(codecs.Codec):
    """Stateless IDNA encoder/decoder operating on whole domain names."""

    def encode(self, input, errors='strict'):
        """Encode the str *input* and return ``(bytes, len(input))``.

        Only ``errors='strict'`` is accepted. Raises UnicodeError for any
        other error handler or when a label is invalid.
        """

        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return b'', 0

        try:
            result = input.encode('ascii')
        except UnicodeEncodeError:
            pass
        else:
            # ASCII name: fast path
            labels = result.split(b'.')
            for label in labels[:-1]:
                if not (0 < len(label) < 64):
                    raise UnicodeError("label empty or too long")
            # The last label may be empty (trailing dot), so only its
            # upper bound is checked.
            if len(labels[-1]) >= 64:
                raise UnicodeError("label too long")
            return result, len(input)

        result = bytearray()
        labels = dots.split(input)
        if labels and not labels[-1]:
            trailing_dot = b'.'
            del labels[-1]
        else:
            trailing_dot = b''
        for label in labels:
            if result:
                # Join with U+002E
                result.extend(b'.')
            result.extend(ToASCII(label))
        return bytes(result+trailing_dot), len(input)

    def decode(self, input, errors='strict'):
        """Decode *input* and return ``(str, len(input))``.

        Only ``errors='strict'`` is accepted. Raises UnicodeError for any
        other error handler or when a label cannot be decoded.
        """

        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return "", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if not isinstance(input, bytes):
            # XXX obviously wrong, see #3232
            input = bytes(input)

        # Lower-case before searching so that an ACE prefix written in
        # any capitalization is not missed by the fast path.
        if ace_prefix not in input.lower():
            # Fast path
            try:
                return input.decode('ascii'), len(input)
            except UnicodeDecodeError:
                pass

        labels = input.split(b".")

        if labels and len(labels[-1]) == 0:
            trailing_dot = '.'
            del labels[-1]
        else:
            trailing_dot = ''

        result = []
        for label in labels:
            result.append(ToUnicode(label))

        return ".".join(result)+trailing_dot, len(input)

class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits complete labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels in *input*.

        Returns ``(bytes, consumed)`` where *consumed* is the number of
        input characters that were encoded. Unless *final* is true, a
        last label not followed by a dot is left unconsumed.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return (b'', 0)

        labels = dots.split(input)
        trailing_dot = b''
        if labels:
            if not labels[-1]:
                trailing_dot = b'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    # Account for the dot that preceded the kept label
                    trailing_dot = b'.'

        result = bytearray()
        size = 0
        for label in labels:
            if size:
                # Join with U+002E
                result.extend(b'.')
                size += 1
            result.extend(ToASCII(label))
            size += len(label)

        result += trailing_dot
        size += len(trailing_dot)
        return (bytes(result), size)

class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits complete labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels in *input*.

        Returns ``(str, consumed)`` where *consumed* is the number of
        input items that were decoded. Unless *final* is true, a last
        label not followed by a dot is left unconsumed.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return ("", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, str):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input, "ascii")
            labels = input.split(".")

        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    # Account for the dot that preceded the kept label
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToUnicode(label))
            if size:
                size += 1
            size += len(label)

        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)

class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    """Return the codecs.CodecInfo entry describing the 'idna' codec."""
    return codecs.CodecInfo(
        name='idna',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )