"""Codec for the Punycode encoding, as specified in RFC 3492.

Written by Martin v. Löwis.
"""

import codecs

##################### Encoding #####################################

def segregate(str):
    """3.1 Basic code point segregation.

    Return a pair ``(base, extended)``: ``base`` is a bytes object holding
    the code points below 128 in their original order, and ``extended`` is
    the sorted list of the distinct remaining characters.
    """
    base = bytearray()
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(ord(c))
        else:
            extended.add(c)
    return bytes(base), sorted(extended)


def selective_len(str, max):
    """Return the length of str, considering only characters below max."""
    return sum(1 for c in str if ord(c) < max)


def selective_find(str, char, index, pos):
    """Return a pair (index, pos), indicating the next occurrence of
    char in str. index is the position of the character considering
    only ordinals up to and including char, and pos is the position in
    the full string. index/pos is the starting position in the full
    string.

    The search starts at ``pos + 1``.  ``(-1, -1)`` is returned when char
    does not occur again.
    """
    for pos in range(pos + 1, len(str)):
        c = str[pos]
        if c == char:
            return index + 1, pos
        if c < char:
            # Only characters smaller than char count towards index.
            index += 1
    return -1, -1


def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding.

    Return the list of deltas for the characters in ``extended`` (which is
    iterated in the order given; callers pass the sorted list produced by
    segregate()), one delta per occurrence in ``str``.
    """
    oldchar = 0x80
    result = []
    oldindex = -1
    for c in extended:
        index = pos = -1
        char = ord(c)
        curlen = selective_len(str, char)
        # Skipping from oldchar to char costs one full pass over the
        # (curlen + 1) possible insertion points per skipped code point.
        delta = (curlen + 1) * (char - oldchar)
        while True:
            index, pos = selective_find(str, c, index, pos)
            if index == -1:
                break
            delta += index - oldindex
            result.append(delta - 1)
            oldindex = index
            delta = 0
        oldchar = char

    return result


def T(j, bias):
    """Return the threshold for digit position j, clamped to [tmin, tmax]."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    res = 36 * (j + 1) - bias
    if res < 1:
        return 1
    if res > 26:
        return 26
    return res


digits = b"abcdefghijklmnopqrstuvwxyz0123456789"


def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers.

    Return the bytes encoding the non-negative integer N under the given
    bias.
    """
    result = bytearray()
    j = 0
    while True:
        t = T(j, bias)
        if N < t:
            # A digit below the threshold terminates the number.
            result.append(digits[N])
            return bytes(result)
        result.append(digits[t + ((N - t) % (36 - t))])
        N = (N - t) // (36 - t)
        j += 1


def adapt(delta, first, numchars):
    """Return the new bias after a delta has been coded.

    ``first`` selects the damping divisor (700 for the first delta, 2
    afterwards); ``numchars`` is the number of code points handled so far.
    """
    if first:
        delta //= 700
    else:
        delta //= 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta = delta // 35  # base - tmin
        divisions += 36
    bias = divisions + (36 * delta // (delta + 38))
    return bias


def generate_integers(baselen, deltas):
    """3.4 Bias adaptation.

    Encode every delta as a generalized variable-length integer, adapting
    the bias after each one, and return the concatenated bytes.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    result = bytearray()
    bias = 72
    for points, delta in enumerate(deltas):
        s = generate_generalized_integer(delta, bias)
        result.extend(s)
        bias = adapt(delta, points == 0, baselen + points + 1)
    return bytes(result)


def punycode_encode(text):
    """Return the Punycode encoding of the string ``text`` as bytes.

    When ``text`` contains basic (ASCII) code points they are emitted first,
    followed by a ``-`` delimiter and the encoded extended code points.
    """
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + b"-" + extended
    return extended

##################### Decoding #####################################

def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers.

    Decode one integer from the (upper-cased) string ``extended`` starting
    at ``extpos``.  Return ``(new_extpos, value)``; ``value`` is None when
    the input is truncated or contains an invalid digit and ``errors`` is
    not "strict".

    Raises UnicodeError in "strict" mode on truncated input or on a
    character that is neither A-Z nor 0-9.
    """
    result = 0
    w = 1
    j = 0
    while True:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punycode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A:  # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22  # 0x30-26
        elif errors == "strict":
            # extpos was already advanced, so the offending character is
            # the one just before it.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            return extpos, result
        w = w * (36 - t)
        j += 1


def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding.

    Decode the deltas in ``extended`` and insert the resulting characters
    into the string ``base``; return the completed string.  If a delta
    cannot be decoded in a non-strict mode, the string built so far is
    returned.

    Raises UnicodeError in "strict" mode when a decoded code point exceeds
    U+10FFFF (in other modes '?' is inserted instead).
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta + 1
        # Each complete pass over the len(base) + 1 insertion points moves
        # on to the next code point.
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + chr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base


def punycode_decode(text, errors):
    """Return the string decoded from the Punycode input ``text``.

    ``text`` may be a str (encoded to ASCII first), a memoryview (copied to
    bytes) or a bytes-like object supporting ``rfind``.  Everything before
    the last ``-`` is the basic part; the rest is the encoded extended
    part, which is upper-cased before decoding.
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    if isinstance(text, memoryview):
        text = bytes(text)
    pos = text.rfind(b"-")
    if pos == -1:
        base = ""
        extended = str(text, "ascii").upper()
    else:
        base = str(text[:pos], "ascii", errors)
        extended = str(text[pos+1:], "ascii").upper()
    return insertion_sort(base, extended, errors)

### Codec APIs

class Codec(codecs.Codec):
    """Stateless Punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Return ``(encoded bytes, number of characters consumed)``.

        ``errors`` is accepted for interface compatibility and not used.
        """
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        """Return ``(decoded string, length of input)``.

        Raises UnicodeError when ``errors`` is not 'strict', 'replace' or
        'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+errors)
        res = punycode_decode(input, errors)
        return res, len(input)


class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        """Encode ``input`` in one step; ``final`` is not used."""
        return punycode_encode(input)


class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        """Decode ``input`` in one step; ``final`` is not used.

        Raises UnicodeError when ``self.errors`` is not 'strict', 'replace'
        or 'ignore'.
        """
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+self.errors)
        return punycode_decode(input, self.errors)


class StreamWriter(Codec, codecs.StreamWriter):
    pass


class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    """Return the codecs.CodecInfo describing the punycode codec."""
    return codecs.CodecInfo(
        name='punycode',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )