• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Standard "encodings" Package
2
3    Standard Python encoding modules are stored in this package
4    directory.
5
6    Codec modules must have names corresponding to normalized encoding
7    names as defined in the normalize_encoding() function below, e.g.
8    'utf-8' must be implemented by the module 'utf_8.py'.
9
10    Each codec module must export the following interface:
11
12    * getregentry() -> codecs.CodecInfo object
13    The getregentry() API must return a CodecInfo object with encoder, decoder,
14    incrementalencoder, incrementaldecoder, streamwriter and streamreader
15    attributes which adhere to the Python Codec Interface Standard.
16
17    In addition, a module may optionally also define the following
18    APIs which are then used by the package's codec search function:
19
20    * getaliases() -> sequence of encoding name strings to use as aliases
21
22    Alias names returned by getaliases() must be normalized encoding
23    names as defined by normalize_encoding().
24
25Written by Marc-Andre Lemburg (mal@lemburg.com).
26
27(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
28
29"""#"
30
31import codecs
32import sys
33from . import aliases
34
# Maps the encoding name exactly as passed to search_function() to the
# codec registry entry found for it, or to None when no usable codec
# module was found (so failed lookups are cached as well).
_cache = {}
# Sentinel used as the default for _cache.get(); it distinguishes "never
# looked up" from a cached None result.
_unknown = '--unknown--'
# Passed as the fromlist argument to __import__() in search_function().
_import_tail = ['*']
# The alias table from the aliases module.  search_function() reads it
# and also adds entries reported by a codec module's getaliases().
_aliases = aliases.aliases
39
class CodecRegistryError(LookupError, SystemError):
    """ Raised when a codec module returns an unusable registry entry.

        The search function raises this error when the object returned
        by a module's getregentry() has the wrong number of items or
        contains non-callable codec functions.

    """
42
def normalize_encoding(encoding):

    """ Normalize an encoding name.

        Normalization works as follows: all non-alphanumeric
        characters except the dot used for Python package names are
        collapsed and replaced with a single underscore, e.g. '  -;#'
        becomes '_'. Leading and trailing underscores are removed.

        Note that encoding names should be ASCII only. Non-ASCII
        alphanumeric characters are ignored, so they can never end up
        in the normalized name (which is later used as a module name).

        encoding may be a str or a bytes object; bytes are decoded as
        ASCII, which raises UnicodeDecodeError for non-ASCII bytes.

        Returns the normalized name as a str (possibly empty).

    """
    if isinstance(encoding, bytes):
        encoding = str(encoding, "ascii")

    chars = []
    # True while one or more separator characters have been seen since
    # the last character that was kept.
    punct = False
    for c in encoding:
        if c.isalnum() or c == '.':
            if c > '\x7f':
                # str.isalnum() is also true for non-ASCII letters and
                # digits; drop them without touching the separator state.
                continue
            # Emit a single underscore for the whole separator run, but
            # never as the first character of the result.
            if punct and chars:
                chars.append('_')
            chars.append(c)
            punct = False
        else:
            punct = True
    return ''.join(chars)
69
def search_function(encoding):

    """ Codec search function for the encodings package.

        Looks up the codec module for the given encoding name, first
        via the alias table and then via the normalized name itself,
        and returns the registry entry obtained from the module's
        getregentry() function.

        Returns a codecs.CodecInfo object, or None if no codec module
        could be found.  Both results are cached under the encoding
        name exactly as it was passed in.

        Raises CodecRegistryError if the module returns a legacy
        (non-CodecInfo) entry with the wrong number of items or with
        non-callable codec functions.

    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding
    # name and lookup the module using the aliased name, then try to
    # lookup the module using the standard import scheme, i.e. first
    # try in the encodings package, then at top-level.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        # Skip empty names and names containing a dot.
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            # ImportError may occur because 'encodings.(modname)' does not exist,
            # or because it imports a name that does not exist (see mbcs and oem)
            pass
        else:
            break
    else:
        # No candidate name could be imported
        mod = None

    try:
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module (this also covers mod being None)
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        # Legacy entry: a sequence of 4 to 7 items which is validated
        # here and then converted to a CodecInfo object.
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError('module "%s" (%s) failed to register'
                                     % (mod.__name__, mod.__file__))
        if not callable(entry[0]) or not callable(entry[1]) or \
           (entry[2] is not None and not callable(entry[2])) or \
           (entry[3] is not None and not callable(entry[3])) or \
           (len(entry) > 4 and entry[4] is not None and not callable(entry[4])) or \
           (len(entry) > 5 and entry[5] is not None and not callable(entry[5])):
            raise CodecRegistryError('incompatible codecs in module "%s" (%s)'
                                     % (mod.__name__, mod.__file__))
        if len(entry)<7 or entry[6] is None:
            # Fill in the codec name from the module name.  Only the
            # first six items are kept so that a 7-item entry whose name
            # slot is None gets that slot replaced; appending to it would
            # produce 8 items, which CodecInfo() does not accept.  A new
            # tuple is built so the module's own object is not modified.
            entry = tuple(entry[:6])
            entry += (None,)*(6-len(entry)) + (mod.__name__.split(".", 1)[1],)
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        # getaliases() is optional
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                _aliases[alias] = modname

    # Return the registry entry
    return entry
153
# Register the search_function in the Python codec registry.  This runs
# as a side effect of importing the package.
codecs.register(search_function)
156
if sys.platform == 'win32':
    def _alias_mbcs(encoding):
        """ Search function mapping the ANSI code page name to mbcs.

            Returns the registry entry of the mbcs codec when encoding
            equals "cp" followed by the value of _winapi.GetACP(), and
            None in every other case, including when one of the
            imports fails.

        """
        try:
            import _winapi
            current_page = "cp%s" % _winapi.GetACP()
            if encoding != current_page:
                return None
            import encodings.mbcs
            return encodings.mbcs.getregentry()
        except ImportError:
            # Imports may fail while we are shutting down
            return None

    codecs.register(_alias_mbcs)
170