• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Standard "encodings" Package
2
3    Standard Python encoding modules are stored in this package
4    directory.
5
6    Codec modules must have names corresponding to normalized encoding
7    names as defined in the normalize_encoding() function below, e.g.
8    'utf-8' must be implemented by the module 'utf_8.py'.
9
10    Each codec module must export the following interface:
11
12    * getregentry() -> codecs.CodecInfo object
13    The getregentry() API must return a CodecInfo object with encoder, decoder,
14    incrementalencoder, incrementaldecoder, streamwriter and streamreader
15    attributes which adhere to the Python Codec Interface Standard.
16
17    In addition, a module may optionally also define the following
18    APIs which are then used by the package's codec search function:
19
20    * getaliases() -> sequence of encoding name strings to use as aliases
21
22    Alias names returned by getaliases() must be normalized encoding
23    names as defined by normalize_encoding().
24
25Written by Marc-Andre Lemburg (mal@lemburg.com).
26
27(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
28
29"""#"
30
31import codecs
32from encodings import aliases
33import __builtin__
34
# Results of earlier lookups, keyed by the encoding name exactly as it was
# passed to search_function().  The value is the codec's registry entry, or
# None for a name that could not be resolved to a codec module.
35_cache = {}
# Sentinel used as the default for _cache.get() and compared by identity, so
# that a cached None (a known miss) can be told apart from "never looked up".
36_unknown = '--unknown--'
# Passed as the fromlist argument of __import__() in search_function(); a
# non-empty fromlist makes __import__() return the submodule itself rather
# than the top-level package.
37_import_tail = ['*']
# Character translation table used by normalize_encoding(): '.', the digits
# and the ASCII letters sit at their own character codes (and so map to
# themselves); every other position holds a space.
38_norm_encoding_map = ('                                              . '
39                      '0123456789       ABCDEFGHIJKLMNOPQRSTUVWXYZ     '
40                      ' abcdefghijklmnopqrstuvwxyz                     '
41                      '                                                '
42                      '                                                '
43                      '                ')
# The alias dictionary of the encodings.aliases module.  search_function()
# both reads it and adds entries to it, so it is shared, mutable state.
44_aliases = aliases.aliases
45
class CodecRegistryError(LookupError, SystemError):
    """ Raised by the package's codec search function when a codec
        module returns a registry entry that cannot be used.

        Derives from both LookupError and SystemError, so handlers for
        either base class also catch it.

    """
48
def normalize_encoding(encoding):

    """ Return the normalized form of an encoding name.

        Every run of characters other than ASCII letters, digits and
        the dot (kept because it is used in Python package names) is
        replaced by a single underscore, e.g. '  -;#' becomes '_'.
        Underscores are never produced at the start or the end of the
        result.

        Encoding names should be ASCII only; a Unicode name that uses
        non-ASCII characters must at least be encodable as Latin-1.

    """
    # The translation table below is only usable on 8-bit strings, since
    # .translate() has different semantics for Unicode strings.
    needs_narrowing = (hasattr(__builtin__, "unicode")
                       and isinstance(encoding, unicode))
    if needs_narrowing:
        # .encode('latin-1') does *not* go through the codec registry,
        # so this cannot recurse into the search function.  (See
        # unicodeobject.c PyUnicode_AsEncodedString() for details)
        encoding = encoding.encode('latin-1')

    # Unwanted characters become spaces; split() then collapses each run
    # of them and discards leading/trailing ones.
    words = encoding.translate(_norm_encoding_map).split()
    return '_'.join(words)
70
def search_function(encoding):

    """ Codec search function of the "encodings" package.

        Looks up the codec module for the given encoding name and
        returns its registry entry as a codecs.CodecInfo object, or
        None if no usable codec module could be found.  Both hits and
        misses are stored in _cache under the name exactly as passed
        in, so repeated lookups return the cached result.

        The module name is derived from the normalized encoding name:
        a registered alias is tried first (also with dots replaced by
        underscores), then the normalized name itself.  Candidates
        that are empty or contain a dot are skipped, and modules are
        only ever imported from inside the 'encodings' package.

        A module's getregentry() may return a CodecInfo object or a
        legacy sequence of 4 to 7 items; the latter is validated and
        converted.  If the module also defines getaliases(), the
        returned names are added to the alias table unless they are
        already registered.

        Raises CodecRegistryError if a legacy entry has the wrong
        length or contains codec slots that are not callable.

    """
    # Cache lookup; the sentinel distinguishes "cached None" (a known
    # miss) from "not looked up yet".
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the standard import scheme, i.e. first
    # try in the encodings package, then at top-level.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding, norm_encoding]
    else:
        modnames = [norm_encoding]

    for modname in modnames:
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            pass
        else:
            break
    else:
        # No candidate could be imported.
        mod = None

    try:
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module (this also covers mod being None)
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        # Legacy entry: a plain sequence instead of a CodecInfo object.
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError(
                'module "%s" (%s) failed to register' %
                (mod.__name__, mod.__file__))
        # Items 0 and 1 must be callable; items 2 to 5 may be None or
        # absent, but must be callable when given.  Item 6, if present,
        # is the codec name and is not checked here.
        if not all(hasattr(entry[i], '__call__') for i in (0, 1)) or \
           not all(entry[i] is None or hasattr(entry[i], '__call__')
                   for i in range(2, min(len(entry), 6))):
            raise CodecRegistryError(
                'incompatible codecs in module "%s" (%s)' %
                (mod.__name__, mod.__file__))
        if len(entry) < 7 or entry[6] is None:
            # Pad the optional slots with None and use the module's name
            # (without the package prefix) as the codec name.  The entry
            # is rebuilt from its first six items so that a 7-item entry
            # whose name slot is None gets that slot replaced; appending
            # to it would produce eight items, which CodecInfo rejects.
            codec_name = mod.__name__.split(".", 1)[1]
            entry = (tuple(entry[:6]) + (None,) * (6 - len(entry)) +
                     (codec_name,))
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                _aliases[alias] = modname

    # Return the registry entry
    return entry
155
# Register search_function with the codecs module.  This runs as a side
# effect of importing the package.
157codecs.register(search_function)
158