• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/env python3
2
3"""Base16, Base32, Base64 (RFC 3548), Base85 and Ascii85 data encodings"""
4
5# Modified 04-Oct-1995 by Jack Jansen to use binascii module
6# Modified 30-Dec-2003 by Barry Warsaw to add full RFC 3548 support
7# Modified 22-May-2007 by Guido van Rossum to use bytes everywhere
8
9import re
10import struct
11import binascii
12
13
# Public API of this module, grouped by encoding family.
__all__ = [
    # Legacy interface exports traditional RFC 2045 Base64 encodings
    'encode', 'decode', 'encodebytes', 'decodebytes',
    # Generalized interface for other encodings
    'b64encode', 'b64decode', 'b32encode', 'b32decode',
    'b32hexencode', 'b32hexdecode', 'b16encode', 'b16decode',
    # Base85 and Ascii85 encodings
    'b85encode', 'b85decode', 'a85encode', 'a85decode',
    # Standard Base64 encoding
    'standard_b64encode', 'standard_b64decode',
    # Some common Base64 alternatives.  As referenced by RFC 3548, see thread
    # starting at:
    #
    # http://zgp.org/pipermail/p2p-hackers/2001-September/000316.html
    'urlsafe_b64encode', 'urlsafe_b64decode',
    ]
30
31
32bytes_types = (bytes, bytearray)  # Types acceptable as binary data
33
34def _bytes_from_decode_data(s):
35    if isinstance(s, str):
36        try:
37            return s.encode('ascii')
38        except UnicodeEncodeError:
39            raise ValueError('string argument should contain only ASCII characters')
40    if isinstance(s, bytes_types):
41        return s
42    try:
43        return memoryview(s).tobytes()
44    except TypeError:
45        raise TypeError("argument should be a bytes-like object or ASCII "
46                        "string, not %r" % s.__class__.__name__) from None
47
48
49# Base64 encoding/decoding uses binascii
50
def b64encode(s, altchars=None):
    """Return the Base64 encoding of the bytes-like object s as bytes.

    altchars, when given, is a byte string of length 2 whose two characters
    replace '+' and '/' in the output.  This makes it possible to produce,
    for example, URL- or filesystem-safe Base64 strings.
    """
    encoded = binascii.b2a_base64(s, newline=False)
    if altchars is None:
        return encoded
    assert len(altchars) == 2, repr(altchars)
    substitution = bytes.maketrans(b'+/', altchars)
    return encoded.translate(substitution)
63
64
def b64decode(s, altchars=None, validate=False):
    """Decode Base64 data given as a bytes-like object or ASCII string s.

    altchars, when given, must be a bytes-like object or ASCII string of
    length 2 naming the characters that were used in place of '+' and '/'.

    A bytes object is returned.  binascii.Error is raised when s is
    incorrectly padded.

    With validate false (the default), characters outside both the normal
    base-64 alphabet and the alternative alphabet are dropped before the
    padding check.  With validate true, such characters cause a
    binascii.Error.
    """
    data = _bytes_from_decode_data(s)
    if altchars is not None:
        altchars = _bytes_from_decode_data(altchars)
        assert len(altchars) == 2, repr(altchars)
        # Map the alternative characters back onto the standard alphabet.
        to_standard = bytes.maketrans(altchars, b'+/')
        data = data.translate(to_standard)
    if validate:
        if re.fullmatch(b'[A-Za-z0-9+/]*={0,2}', data) is None:
            raise binascii.Error('Non-base64 digit found')
    return binascii.a2b_base64(data)
88
89
def standard_b64encode(s):
    """Return the bytes-like object s encoded with the standard Base64
    alphabet, as a bytes object.
    """
    encoded = b64encode(s)
    return encoded
96
def standard_b64decode(s):
    """Decode data that was encoded with the standard Base64 alphabet.

    s is a bytes-like object or ASCII string.  A bytes object is returned.
    binascii.Error is raised for incorrectly padded input.  Characters
    outside the standard alphabet are dropped before the padding check.
    """
    decoded = b64decode(s)
    return decoded
106
107
# Byte translation tables between the standard ('+', '/') and the URL-safe
# ('-', '_') Base64 alphabets, built once at import time.
_urlsafe_encode_translation = bytes.maketrans(b'+/', b'-_')
_urlsafe_decode_translation = bytes.maketrans(b'-_', b'+/')
110
def urlsafe_b64encode(s):
    """Encode the bytes-like object s with the URL- and filesystem-safe
    Base64 alphabet and return a bytes object.

    In this alphabet '-' takes the place of '+' and '_' takes the place
    of '/'.
    """
    standard = b64encode(s)
    return standard.translate(_urlsafe_encode_translation)
119
def urlsafe_b64decode(s):
    """Decode data encoded with the URL- and filesystem-safe Base64 alphabet.

    s is a bytes-like object or ASCII string; a bytes object is returned.
    binascii.Error is raised for incorrectly padded input.  Characters that
    are neither in the URL-safe base-64 alphabet nor a plus '+' or slash
    '/' are dropped before the padding check.

    In this alphabet '-' takes the place of '+' and '_' takes the place
    of '/'.
    """
    data = _bytes_from_decode_data(s)
    return b64decode(data.translate(_urlsafe_decode_translation))
134
135
136
137# Base32 encoding/decoding must be done in Python
138_B32_ENCODE_DOCSTRING = '''
139Encode the bytes-like objects using {encoding} and return a bytes object.
140'''
141_B32_DECODE_DOCSTRING = '''
142Decode the {encoding} encoded bytes-like object or ASCII string s.
143
144Optional casefold is a flag specifying whether a lowercase alphabet is
145acceptable as input.  For security purposes, the default is False.
146{extra_args}
147The result is returned as a bytes object.  A binascii.Error is raised if
148the input is incorrectly padded or if there are non-alphabet
149characters present in the input.
150'''
151_B32_DECODE_MAP01_DOCSTRING = '''
152RFC 3548 allows for optional mapping of the digit 0 (zero) to the
153letter O (oh), and for optional mapping of the digit 1 (one) to
154either the letter I (eye) or letter L (el).  The optional argument
155map01 when not None, specifies which letter the digit 1 should be
156mapped to (when map01 is not None, the digit 0 is always mapped to
157the letter O).  For security purposes the default is None, so that
1580 and 1 are not allowed in the input.
159'''
# The 32-character alphabets used by the base32 and base32hex functions.
_b32alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'
_b32hexalphabet = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
# Caches keyed by alphabet and filled on first use:
#   _b32tab2[alphabet] -> list of all two-character byte strings, used by
#                         _b32encode to emit 10 bits per lookup
#   _b32rev[alphabet]  -> dict mapping a character code to its 5-bit value,
#                         used by _b32decode
_b32tab2 = {}
_b32rev = {}
164
def _b32encode(alphabet, s):
    """Encode the bytes-like object s with the given 32-character alphabet.

    The input is processed in 5-byte quanta, each producing 8 output
    characters; a short final quantum is zero-padded and the surplus
    output characters are replaced by '=' signs.  Returns bytes.
    """
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    pair_table = _b32tab2.get(alphabet)
    if pair_table is None:
        singles = [bytes((code,)) for code in alphabet]
        pair_table = [first + second for first in singles for second in singles]
        _b32tab2[alphabet] = pair_table

    if not isinstance(s, bytes_types):
        s = memoryview(s).tobytes()
    remainder = len(s) % 5
    # Pad the last quantum with zero bits if necessary
    if remainder:
        s = s + b'\0' * (5 - remainder)  # Don't use += !
    out = bytearray()
    for start in range(0, len(s), 5):
        quantum = int.from_bytes(s[start: start + 5], 'big')
        # Each table lookup yields the two characters for 10 of the 40 bits.
        out += (pair_table[quantum >> 30] +
                pair_table[(quantum >> 20) & 0x3ff] +
                pair_table[(quantum >> 10) & 0x3ff] +
                pair_table[quantum & 0x3ff])
    # Adjust for any leftover partial quanta: leftover bytes -> pad characters
    pad_len = {1: 6, 2: 4, 3: 3, 4: 1}.get(remainder)
    if pad_len:
        out[-pad_len:] = b'=' * pad_len
    return bytes(out)
200
def _b32decode(alphabet, s, casefold=False, map01=None):
    """Decode base32 data in s using the given 32-character alphabet.

    s is a bytes-like object or ASCII string.  If casefold is true the
    input is upper-cased before decoding.  If map01 is not None, the digit
    0 is translated to the letter O and the digit 1 to the single
    character given by map01 before decoding.

    Returns bytes.  Raises binascii.Error if the input length is not a
    multiple of 8, if the number of trailing '=' characters is not one of
    0, 1, 3, 4 or 6, or if a character outside the alphabet is found.
    """
    global _b32rev
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    if alphabet not in _b32rev:
        _b32rev[alphabet] = {v: k for k, v in enumerate(alphabet)}
    s = _bytes_from_decode_data(s)
    if len(s) % 8:
        raise binascii.Error('Incorrect padding')
    # Handle section 2.4 zero and one mapping.  The argument map01 will be
    # either None, or the character to map the digit 1 (one) to.  It should
    # be either L (el) or I (eye).
    if map01 is not None:
        map01 = _bytes_from_decode_data(map01)
        assert len(map01) == 1, repr(map01)
        s = s.translate(bytes.maketrans(b'01', b'O' + map01))
    if casefold:
        s = s.upper()
    # Strip off pad characters from the right.  We need to count the pad
    # characters because this will tell us how many null bytes to remove from
    # the end of the decoded string.
    l = len(s)
    s = s.rstrip(b'=')
    padchars = l - len(s)
    # Now decode the full quanta
    decoded = bytearray()
    b32rev = _b32rev[alphabet]
    for i in range(0, len(s), 8):
        quanta = s[i: i + 8]
        acc = 0
        try:
            for c in quanta:
                acc = (acc << 5) + b32rev[c]
        except KeyError:
            raise binascii.Error('Non-base32 digit found') from None
        # For a final group shorter than 8 characters this appends a
        # provisional value; it is replaced below once the padding is known.
        decoded += acc.to_bytes(5, 'big')
    # Process the last, partial quanta
    if l % 8 or padchars not in {0, 1, 3, 4, 6}:
        raise binascii.Error('Incorrect padding')
    if padchars and decoded:
        # acc still holds the value of the last (partial) group from the
        # loop above; shift in zero bits for the stripped pad characters,
        # then keep only the bytes that carry real data.
        acc <<= 5 * padchars
        last = acc.to_bytes(5, 'big')
        leftover = (43 - 5 * padchars) // 8  # 1: 4, 3: 3, 4: 2, 6: 1
        decoded[-5:] = last[:leftover]
    return bytes(decoded)
246
247
def b32encode(s):
    # Thin wrapper: the real work happens in _b32encode.
    encoded = _b32encode(_b32alphabet, s)
    return encoded
# The docstring is generated from the shared template.
b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32')
251
def b32decode(s, casefold=False, map01=None):
    # Thin wrapper: the real work happens in _b32decode.
    return _b32decode(_b32alphabet, s, casefold=casefold, map01=map01)
# The docstring is generated from the shared template, including the
# paragraph that explains map01.
b32decode.__doc__ = _B32_DECODE_DOCSTRING.format(
    encoding='base32',
    extra_args=_B32_DECODE_MAP01_DOCSTRING)
256
def b32hexencode(s):
    # Same algorithm as b32encode, using the base32hex alphabet.
    encoded = _b32encode(_b32hexalphabet, s)
    return encoded
# The docstring is generated from the shared template.
b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex')
260
def b32hexdecode(s, casefold=False):
    # base32hex does not have the 01 mapping
    return _b32decode(_b32hexalphabet, s, casefold=casefold)
# The docstring is generated from the shared template, with no extra
# argument description.
b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(
    encoding='base32hex',
    extra_args='')
266
267
268# RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
269# lowercase.  The RFC also recommends against accepting input case
270# insensitively.
def b16encode(s):
    """Return the Base16 encoding of the bytes-like object s as bytes."""
    # hexlify() produces lowercase digits; Base16 output is uppercase.
    lowercase_hex = binascii.hexlify(s)
    return lowercase_hex.upper()
275
276
def b16decode(s, casefold=False):
    """Decode Base16 data given as a bytes-like object or ASCII string s.

    casefold is an optional flag; when true, lowercase hex digits are
    accepted as input.  For security purposes, the default is False.

    A bytes object is returned.  binascii.Error is raised if s is
    incorrectly padded or contains characters outside the alphabet.
    """
    data = _bytes_from_decode_data(s)
    if casefold:
        data = data.upper()
    # Anything other than 0-9 and uppercase A-F is rejected at this point.
    if re.search(b'[^0-9A-F]', data) is not None:
        raise binascii.Error('Non-base16 digit found')
    return binascii.unhexlify(data)
293
294#
295# Ascii85 encoding/decoding
296#
297
# Encoding tables built on the first call to a85encode(): the single-byte
# digits and all two-digit combinations.
_a85chars = None
_a85chars2 = None
# Markers that frame the data when the adobe flag of a85encode()/a85decode()
# is set.
_A85START = b"<~"
_A85END = b"~>"
302
def _85encode(b, chars, chars2, pad=False, foldnuls=False, foldspaces=False):
    """Helper function for a85encode and b85encode.

    chars is the table of single encoded digits and chars2 the table of
    all digit pairs.  The input is zero-padded to a multiple of 4 bytes
    and each 32-bit big-endian word becomes 5 digits.  With foldnuls an
    all-zero word is written as b'z'; with foldspaces a word of four
    spaces is written as b'y'.  Unless pad is true, the output digits
    that correspond to padding bytes are removed again.
    """
    if not isinstance(b, bytes_types):
        b = memoryview(b).tobytes()

    padding = (-len(b)) % 4
    if padding:
        b = b + b'\0' * padding
    words = struct.Struct('!%dI' % (len(b) // 4)).unpack(b)

    chunks = []
    for word in words:
        if foldnuls and not word:
            chunks.append(b'z')
        elif foldspaces and word == 0x20202020:
            chunks.append(b'y')
        else:
            # 614125 == 85**3 and 7225 == 85**2: two digits, two digits,
            # then the final digit.
            chunks.append(chars2[word // 614125] +
                          chars2[word // 85 % 7225] +
                          chars[word % 85])

    if padding and not pad:
        # A folded 'z' must be expanded before it can be truncated.
        if chunks[-1] == b'z':
            chunks[-1] = chars[0] * 5
        chunks[-1] = chunks[-1][:-padding]

    return b''.join(chunks)
326
def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
    """Return the Ascii85 encoding of the bytes-like object b as bytes.

    foldspaces is an optional flag; when true, 4 consecutive spaces (ASCII
    0x20) are written as the special short sequence 'y', as supported by
    'btoa'.  The "standard" Adobe encoding does not support this feature.

    wrapcol, when non-zero, causes newline (b'\\n') characters to be
    inserted so that each output line is at most that many characters long.

    pad controls whether the input is padded to a multiple of 4 before
    encoding.  Note that the btoa implementation always pads.

    adobe controls whether the encoded data is framed with <~ and ~>, as
    done by the Adobe implementation.
    """
    global _a85chars, _a85chars2
    # Delay the initialization of tables to not waste memory
    # if the function is never called
    if _a85chars2 is None:
        _a85chars = [bytes((code,)) for code in range(33, 118)]
        _a85chars2 = [(first + second)
                      for first in _a85chars for second in _a85chars]

    body = _85encode(b, _a85chars, _a85chars2,
                     pad=pad, foldnuls=True, foldspaces=foldspaces)

    if adobe:
        body = _A85START + body
    if wrapcol:
        min_width = 2 if adobe else 1
        width = max(min_width, wrapcol)
        lines = [body[pos: pos + width]
                 for pos in range(0, len(body), width)]
        # Start a fresh line when the end marker would not fit on the last.
        if adobe and len(lines[-1]) + 2 > width:
            lines.append(b'')
        body = b'\n'.join(lines)
    if adobe:
        body += _A85END

    return body
367
def a85decode(b, *, foldspaces=False, adobe=False, ignorechars=b' \t\n\r\v'):
    """Decode the Ascii85 encoded bytes-like object or ASCII string b.

    foldspaces is a flag that specifies whether the 'y' short sequence should be
    accepted as shorthand for 4 consecutive spaces (ASCII 0x20). This feature is
    not supported by the "standard" Adobe encoding.

    adobe controls whether the input sequence is in Adobe Ascii85 format (i.e.
    is framed with <~ and ~>).

    ignorechars should be a byte string containing characters to ignore from the
    input. This should only contain whitespace characters, and by default
    contains all whitespace characters in ASCII.

    The result is returned as a bytes object.

    A ValueError is raised if adobe is true and the input does not end with
    ~>, if a 5-character group encodes a value that does not fit in 32 bits,
    if a 'z' (or, with foldspaces, a 'y') appears inside a 5-character group,
    or if a character is neither an Ascii85 digit nor in ignorechars.
    """
    b = _bytes_from_decode_data(b)
    if adobe:
        if not b.endswith(_A85END):
            raise ValueError(
                "Ascii85 encoded byte sequences must end "
                "with {!r}".format(_A85END)
                )
        # The end marker is mandatory; the start marker is stripped only
        # when it is present.
        if b.startswith(_A85START):
            b = b[2:-2]  # Strip off start/end markers
        else:
            b = b[:-2]
    #
    # We have to go through this stepwise, so as to ignore spaces and handle
    # special short sequences
    #
    packI = struct.Struct('!I').pack
    decoded = []
    decoded_append = decoded.append
    curr = []  # digits of the 5-character group collected so far
    curr_append = curr.append
    curr_clear = curr.clear
    # Four 'u' characters (the highest digit) are appended so that a trailing
    # partial group is completed and decoded; the surplus is removed below.
    for x in b + b'u' * 4:
        if b'!'[0] <= x <= b'u'[0]:
            curr_append(x)
            if len(curr) == 5:
                acc = 0
                for x in curr:
                    acc = 85 * acc + (x - 33)
                try:
                    decoded_append(packI(acc))
                except struct.error:
                    # packI rejects values that do not fit an unsigned 32-bit int
                    raise ValueError('Ascii85 overflow') from None
                curr_clear()
        elif x == b'z'[0]:
            if curr:
                raise ValueError('z inside Ascii85 5-tuple')
            decoded_append(b'\0\0\0\0')
        elif foldspaces and x == b'y'[0]:
            if curr:
                raise ValueError('y inside Ascii85 5-tuple')
            decoded_append(b'\x20\x20\x20\x20')
        elif x in ignorechars:
            # Skip whitespace
            continue
        else:
            raise ValueError('Non-Ascii85 digit found: %c' % x)

    result = b''.join(decoded)
    # curr now holds only those appended 'u' characters that did not complete
    # a group; the remaining 4 - len(curr) were consumed as padding.
    padding = 4 - len(curr)
    if padding:
        # Throw away the extra padding
        result = result[:-padding]
    return result
437
438# The following code is originally taken (with permission) from Mercurial
439
# The 85 digits used by b85encode()/b85decode(), in value order.
_b85alphabet = (b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                b"abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~")
# Tables built on first use: single digits and digit pairs for encoding
# (b85encode), and a 256-entry byte -> digit value table for decoding
# (b85decode).
_b85chars = None
_b85chars2 = None
_b85dec = None
445
def b85encode(b, pad=False):
    """Return the base85 encoding of the bytes-like object b as bytes.

    If pad is true, the input is padded with b'\\0' so that its length is a
    multiple of 4 bytes before encoding.
    """
    global _b85chars, _b85chars2
    # Delay the initialization of tables to not waste memory
    # if the function is never called
    if _b85chars2 is None:
        _b85chars = [bytes((code,)) for code in _b85alphabet]
        _b85chars2 = [(first + second)
                      for first in _b85chars for second in _b85chars]
    encoded = _85encode(b, _b85chars, _b85chars2, pad)
    return encoded
459
def b85decode(b):
    """Decode base85-encoded data given as a bytes-like object or ASCII
    string b.

    A bytes object is returned.  ValueError is raised for a character
    outside the base85 alphabet or for a 5-character group whose value
    does not fit in 32 bits.
    """
    global _b85dec
    # Delay the initialization of tables to not waste memory
    # if the function is never called
    if _b85dec is None:
        table = [None] * 256
        for value, code in enumerate(_b85alphabet):
            table[code] = value
        _b85dec = table
    lookup = _b85dec

    data = _bytes_from_decode_data(b)
    # Complete the final group with '~', the highest digit; the bytes this
    # produces are cut off again at the end.
    padding = (-len(data)) % 5
    data = data + b'~' * padding
    pack_word = struct.Struct('!I').pack
    pieces = []
    for offset in range(0, len(data), 5):
        group = data[offset:offset + 5]
        word = 0
        try:
            for code in group:
                word = word * 85 + lookup[code]
        except TypeError:
            # A None table entry was hit: find the offending character so
            # its position can be reported.
            for index, code in enumerate(group):
                if lookup[code] is None:
                    raise ValueError('bad base85 character at position %d'
                                    % (offset + index)) from None
            raise
        try:
            pieces.append(pack_word(word))
        except struct.error:
            raise ValueError('base85 overflow in hunk starting at byte %d'
                             % offset) from None

    decoded = b''.join(pieces)
    if padding:
        decoded = decoded[:-padding]
    return decoded
500
501# Legacy interface.  This code could be cleaned up since I don't believe
502# binascii has any line length limitations.  It just doesn't seem worth it
503# though.  The files should be opened in binary mode.
504
MAXLINESIZE = 76 # Excluding the CRLF
MAXBINSIZE = (MAXLINESIZE//4)*3  # input bytes that encode to one full line

def encode(input, output):
    """Encode a file; input and output are binary files."""
    while True:
        chunk = input.read(MAXBINSIZE)
        if not chunk:
            return
        # A read may return less than requested; keep reading until the
        # chunk is full or the input is exhausted.
        missing = MAXBINSIZE - len(chunk)
        while missing > 0:
            extra = input.read(missing)
            if not extra:
                break
            chunk += extra
            missing = MAXBINSIZE - len(chunk)
        output.write(binascii.b2a_base64(chunk))
521
522
def decode(input, output):
    """Decode a file; input and output are binary files."""
    # Each input line is decoded on its own and written out immediately.
    line = input.readline()
    while line:
        output.write(binascii.a2b_base64(line))
        line = input.readline()
531
532def _input_type_check(s):
533    try:
534        m = memoryview(s)
535    except TypeError as err:
536        msg = "expected bytes-like object, not %s" % s.__class__.__name__
537        raise TypeError(msg) from err
538    if m.format not in ('c', 'b', 'B'):
539        msg = ("expected single byte elements, not %r from %s" %
540                                          (m.format, s.__class__.__name__))
541        raise TypeError(msg)
542    if m.ndim != 1:
543        msg = ("expected 1-D data, not %d-D data from %s" %
544                                          (m.ndim, s.__class__.__name__))
545        raise TypeError(msg)
546
547
def encodebytes(s):
    """Encode a bytestring into a bytes object containing multiple lines
    of base-64 data."""
    _input_type_check(s)
    # One output line per MAXBINSIZE input bytes; b2a_base64 appends the
    # newline to each piece.
    lines = [binascii.b2a_base64(s[start : start + MAXBINSIZE])
             for start in range(0, len(s), MAXBINSIZE)]
    return b"".join(lines)
557
558
def decodebytes(s):
    """Decode a bytestring of base-64 data into a bytes object."""
    # Reject anything that is not 1-D single-byte data before decoding.
    _input_type_check(s)
    decoded = binascii.a2b_base64(s)
    return decoded
563
564
565# Usable as a script...
def main():
    """Small main program.

    Command line options:
        -e      encode (default)
        -d, -u  decode
        -t      run the built-in round-trip self test and return

    The optional positional argument names the input file; when it is
    missing or '-', standard input is read.  Output goes to standard
    output in binary mode.  On an invalid option the error and a usage
    message are written to standard error and the program exits with
    status 2.
    """
    import getopt
    import sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'deut')
    except getopt.error as msg:
        # Write to stderr explicitly instead of rebinding sys.stdout, so
        # that the process-wide stream is not left altered if the
        # SystemExit raised below is caught by the caller.
        print(msg, file=sys.stderr)
        print("""usage: %s [-d|-e|-u|-t] [file|-]
        -d, -u: decode
        -e: encode (default)
        -t: encode and decode string 'Aladdin:open sesame'"""%sys.argv[0],
              file=sys.stderr)
        sys.exit(2)
    func = encode
    for o, a in opts:
        if o == '-e':
            func = encode
        elif o in ('-d', '-u'):
            func = decode
        elif o == '-t':
            test()
            return
    if args and args[0] != '-':
        with open(args[0], 'rb') as f:
            func(f, sys.stdout.buffer)
    else:
        func(sys.stdin.buffer, sys.stdout.buffer)
590
591
def test():
    # Round-trip a sample string through encodebytes()/decodebytes(),
    # printing each stage, and check that the original comes back.
    original = b"Aladdin:open sesame"
    print(repr(original))
    encoded = encodebytes(original)
    print(repr(encoded))
    roundtrip = decodebytes(encoded)
    print(repr(roundtrip))
    assert original == roundtrip
600
601
# Run the command line interface only when executed as a script, not when
# imported as a module.
if __name__ == '__main__':
    main()
604