• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""
5
6import codecs
7
8##################### Encoding #####################################
9
def segregate(str):
    """3.1 Basic code point segregation

    Split the text into its basic (ordinal < 128) and extended code points.

    Returns a pair ``(base, extended)``: ``base`` is a bytes object holding
    the basic code points in their original order, and ``extended`` is a
    sorted list of the distinct non-basic characters.
    """
    ascii_part = bytes(ord(ch) for ch in str if ord(ch) < 128)
    # A set comprehension drops duplicates before sorting.
    non_ascii = sorted({ch for ch in str if ord(ch) >= 128})
    return ascii_part, non_ascii
21
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
29
def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    The scan starts at ``pos + 1``.  ``index`` is a running counter that
    is advanced for every scanned character that compares below ``char``
    and once more for the match itself, i.e. it is the position of the
    match when only characters up to and including ``char`` are
    considered.  ``pos`` is the position in the full string.

    Returns the pair ``(index, pos)`` of the match, or ``(-1, -1)`` when
    the end of the string is reached without finding ``char``.
    """
    end = len(str)
    while True:
        pos += 1
        if pos == end:
            # Ran off the end of the string: no further occurrence.
            return (-1, -1)
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            # Only characters below char take part in the selective index.
            index += 1
47
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    For every character in ``extended`` (processed in the given order),
    emit one delta per occurrence of that character in ``str``.

    Returns the list of deltas.
    """
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for ch in extended:
        code = ord(ch)
        # Moving on to a new code point: account for every insertion slot
        # among the characters below it, once per skipped code point.
        pending = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, ch, -1, -1)
        while index != -1:
            pending += index - prev_index
            deltas.append(pending - 1)
            prev_index = index
            pending = 0
            index, pos = selective_find(str, ch, index, pos)
        prev_code = code

    return deltas
69
def T(j, bias):
    """Return the threshold for digit position j under the given bias.

    The value ``36 * (j + 1) - bias`` is clamped to the range 1..26.
    """
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    raw = 36 * (j + 1) - bias
    return min(max(raw, 1), 26)
76
# Digit alphabet: values 0-25 map to a-z, values 26-35 map to 0-9.
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the integer N as a sequence of base-36 digits, using the
    thresholds given by T() for the supplied bias.

    Returns the encoded digits as a bytes object.
    """
    encoded = bytearray()
    position = 0
    threshold = T(position, bias)
    while N >= threshold:
        # Emit one digit at or above the threshold and carry the quotient
        # on to the next digit position.
        N, remainder = divmod(N - threshold, 36 - threshold)
        encoded.append(digits[threshold + remainder])
        position += 1
        threshold = T(position, bias)
    # The final digit is below its threshold, which terminates the number.
    encoded.append(digits[N])
    return bytes(encoded)
90
def adapt(delta, first, numchars):
    """Compute the bias to use for the next delta.

    ``delta`` is the delta just processed, ``first`` is true for the very
    first delta (which is damped by 700 instead of halved), and
    ``numchars`` is the divisor used to scale the delta.

    Returns the new bias.
    """
    delta //= 700 if first else 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    bias = 0
    while delta > 455:
        delta //= 35 # base - tmin
        bias += 36
    bias += 36 * delta // (delta + 38)
    return bias
104
105
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta as a generalized variable-length integer, adapting
    the bias after each one.  ``baselen`` is the number of basic code
    points.

    Returns the concatenated encodings as a bytes object.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    bias = 72
    chunks = []
    for count, delta in enumerate(deltas):
        chunks.append(generate_generalized_integer(delta, bias))
        bias = adapt(delta, count == 0, baselen + count + 1)
    return b"".join(chunks)
116
def punycode_encode(text):
    """Encode text to punycode and return the result as bytes.

    The basic code points come first; when there are any, they are
    separated from the encoded extended part by a ``-`` delimiter.
    """
    base, extended = segregate(text)
    encoded = generate_integers(len(base), insertion_unsort(text, extended))
    if not base:
        return encoded
    return b"-".join((base, encoded))
124
125##################### Decoding #####################################
126
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one generalized variable-length integer from ``extended``,
    starting at index ``extpos``.  Only the digits ``A``-``Z`` (values
    0-25) and ``0``-``9`` (values 26-35) are recognised, so the caller is
    expected to pass upper-cased text.

    Returns a pair ``(newpos, value)``.  On success ``newpos`` is the
    index just past the last digit consumed.  When decoding fails and
    ``errors`` is not ``"strict"``, ``value`` is ``None``.

    Raises UnicodeError when ``errors`` is ``"strict"`` and the input ends
    in the middle of a number or contains a character that is not a valid
    digit.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punycode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced past the offending
            # character, so it lives at extpos - 1.  Indexing with extpos
            # would report the wrong character, or raise IndexError when
            # the bad character is the last one.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
155
156
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas in ``extended`` one at a time and insert the
    resulting characters into ``base``.

    Returns the decoded string.  If a delta cannot be decoded (and the
    decoder did not raise), the string built so far is returned.

    Raises UnicodeError when ``errors`` is ``"strict"`` and a decoded code
    point exceeds U+10FFFF; otherwise ``?`` is substituted.
    """
    code_point = 0x80
    insert_at = -1
    bias = 72
    cursor = 0
    total = len(extended)
    while cursor < total:
        next_cursor, delta = decode_generalized_number(extended, cursor,
                                                       bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        insert_at += delta + 1
        # Every full pass over the insertion slots advances the code
        # point by one; the remainder is the slot to insert at.
        wraps, insert_at = divmod(insert_at, len(base) + 1)
        code_point += wraps
        if code_point > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % code_point)
            code_point = ord('?')
        base = base[:insert_at] + chr(code_point) + base[insert_at:]
        bias = adapt(delta, (cursor == 0), len(base))
        cursor = next_cursor
    return base
181
def punycode_decode(text, errors):
    """Decode punycode input and return the resulting string.

    ``text`` may be a str (encoded to ASCII first), a memoryview
    (converted to bytes first) or a bytes-like object offering ``rfind``.
    Everything before the last ``-`` is the basic part, decoded as ASCII
    using ``errors``; everything after it is the extended part, which is
    decoded as ASCII and upper-cased before being processed.
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    if isinstance(text, memoryview):
        text = bytes(text)
    delimiter = text.rfind(b"-")
    if delimiter == -1:
        # No delimiter: the whole input is the extended part.
        base = ""
        tail = text
    else:
        base = str(text[:delimiter], "ascii", errors)
        tail = text[delimiter+1:]
    extended = str(tail, "ascii").upper()
    return insertion_sort(base, extended, errors)
195
196### Codec APIs
197
class Codec(codecs.Codec):
    """Stateless punycode encoder/decoder."""

    def encode(self, input, errors='strict'):
        """Encode input and return ``(encoded bytes, len(input))``.

        ``errors`` is accepted but not used by the encoder.
        """
        return punycode_encode(input), len(input)

    def decode(self, input, errors='strict'):
        """Decode input and return ``(decoded str, len(input))``.

        Raises UnicodeError when ``errors`` is not one of 'strict',
        'replace' or 'ignore'.
        """
        supported = ('strict', 'replace', 'ignore')
        if errors in supported:
            return punycode_decode(input, errors), len(input)
        raise UnicodeError("Unsupported error handling "+errors)
209
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each input chunk on its own."""

    def encode(self, input, final=False):
        """Return the punycode encoding of input; ``final`` is not used."""
        encoded = punycode_encode(input)
        return encoded
213
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each input chunk on its own."""

    def decode(self, input, final=False):
        """Return the decoded form of input; ``final`` is not used.

        Raises UnicodeError when ``self.errors`` is not one of 'strict',
        'replace' or 'ignore'.
        """
        supported = ('strict', 'replace', 'ignore')
        if self.errors in supported:
            return punycode_decode(input, self.errors)
        raise UnicodeError("Unsupported error handling "+self.errors)
219
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer that takes its encode/decode methods from Codec."""
222
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader that takes its encode/decode methods from Codec."""
225
226### encodings module API
227
def getregentry():
    """Return the codecs.CodecInfo entry describing the punycode codec."""
    # The stateless entry points are bound methods of fresh Codec
    # instances, one instance per direction.
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='punycode',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
238