• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Guess the MIME type of a file.
2
3This module defines two useful functions:
4
5guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.
6
7guess_extension(type, strict=True) -- guess the extension for a given MIME type.
8
9It also contains the following, for tuning the behavior:
10
11Data:
12
13knownfiles -- list of files to parse
14inited -- flag set when init() has been called
15suffix_map -- dictionary mapping suffixes to suffixes
16encodings_map -- dictionary mapping suffixes to encodings
17types_map -- dictionary mapping suffixes to types
18
19Functions:
20
21init([files]) -- parse a list of files, default knownfiles (on Windows, the
22  default values are taken from the registry)
23read_mime_types(file) -- parse one file, return a dictionary or None
24"""
25
26import os
27import sys
28import posixpath
29import urllib.parse
30
31try:
32    from _winapi import _mimetypes_read_windows_registry
33except ImportError:
34    _mimetypes_read_windows_registry = None
35
36try:
37    import winreg as _winreg
38except ImportError:
39    _winreg = None
40
# Public names of this module.
__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]

# Candidate mime.types files.  init() reads, in this order, every entry
# that exists as a regular file.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    # NOTE(review): the next path duplicates the entry two lines above, so
    # init() reads that file twice when it exists.
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Set to True by init(); MimeTypes.__init__() calls init() while it is False.
inited = False
# Shared MimeTypes instance used by the module-level functions; assigned by
# init() once the instance is fully populated.
_db = None
62
63
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- dictionary mapping suffixes to encodings
    suffix_map -- dictionary mapping suffixes to suffixes
    types_map -- pair of dictionaries (non-strict, strict) mapping
      suffixes to types
    types_map_inv -- pair of dictionaries (non-strict, strict) mapping
      types to lists of suffixes
    """

    def __init__(self, filenames=(), strict=True):
        """Fill the datastore from the module defaults, then from files.

        Every name in `filenames' is parsed with read(); `strict' selects
        the table that the entries found in those files are added to.
        """
        if not inited:
            init()
        self.encodings_map = _encodings_map_default.copy()
        self.suffix_map = _suffix_map_default.copy()
        self.types_map = ({}, {}) # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in _types_map_default.items():
            self.add_type(type, ext, True)
        for (ext, type) in _common_types_default.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # The boolean indexes the (non-strict, strict) pair directly.
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file which is either a URL or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are lower-cased before they are
        looked up.

        Suffixes found in the dictionary suffix_map (by default .tgz,
        .taz and .tz, which map to '.tar.gz', among others) are
        replaced before the lookup; they are matched after
        lower-casing as well.

        For a 'data:' URL the declared media type is returned, or
        'text/plain' when none is given; (None, None) is returned if
        the URL contains no comma.

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        url = os.fspath(url)
        scheme, url = urllib.parse._splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None           # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # current suffix is no longer a key of suffix_map.
        while (ext_lower := ext.lower()) in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext_lower])
        # encodings_map is case sensitive
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        ext = ext.lower()
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        if strict:
            return None, encoding
        # Fall back to the non-standard table; a miss yields None.
        return self.types_map[False].get(ext), encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().  The
        list is empty if no extension is known for `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy so that callers cannot mutate the stored list.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        The file is opened as UTF-8 text and parsed with readfp().

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' must provide readline().  Each line holds a type followed
        by whitespace-separated suffixes (without the leading dot); a
        word starting with '#' begins a comment that runs to the end
        of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while line := fp.readline():
            words = line.split()
            # Drop the first word that starts a comment and everything
            # after it.
            for i, word in enumerate(words):
                if word.startswith('#'):
                    del words[i:]
                    break
            if not words:
                continue
            type, *suffixes = words
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when neither the accelerated reader nor the winreg
        module is available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        if not _mimetypes_read_windows_registry and not _winreg:
            return

        # The registry readers call back with (type, ext) only, so bind
        # `strict' here.  Passing self.add_type directly would always use
        # its default of strict=True and ignore strict=False.
        def add_type(type, ext):
            self.add_type(type, ext, strict)

        # Accelerated function if it is available
        if _mimetypes_read_windows_registry:
            _mimetypes_read_windows_registry(add_type)
        elif _winreg:
            self._read_windows_registry(add_type)

    @classmethod
    def _read_windows_registry(cls, add_type):
        """Call add_type(mimetype, ext) for registry file-extension keys.

        Walks the subkeys of HKEY_CLASSES_ROOT whose name starts with a
        dot and reports those carrying a REG_SZ 'Content Type' value.
        """
        def enum_types(mimedb):
            # Yield subkey names until EnumKey() raises OSError, skipping
            # names that contain a NUL character.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; test the name first so that
                # other keys are never opened.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        add_type(mimetype, subkeyname)
                except OSError:
                    continue
286
def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    lower-cased before they are looked up.

    Suffixes found in the dictionary suffix_map (by default .tgz, .taz
    and .tz, which map to ".tar.gz", among others) are replaced before
    the lookup; they are matched after lower-casing as well.

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.

    The lookup is delegated to the module-wide database, which is
    created with init() on first use.
    """
    if _db is None:
        init()
    return _db.guess_type(url, strict)
308
309
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', an
    empty list is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.

    The lookup is delegated to the module-wide database, which is
    created with init() on first use.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)
326
def guess_extension(type, strict=True):
    """Return the preferred filename extension for the MIME type `type'.

    The result includes the leading dot ('.'); None is returned when no
    extension is known for `type'.  The extension is not guaranteed to
    have been associated with any particular data stream, but
    guess_type() would map it to `type'.

    When `strict' is false, commonly found but non-standard types are
    considered as well.
    """
    database = _db
    if database is None:
        # First use: build the module-wide database, then pick it up.
        init()
        database = _db
    return database.guess_extension(type, strict)
342
def add_type(type, ext, strict=True):
    """Register the extension `ext' for the MIME type `type'.

    An extension that is already known is re-mapped to the new type;
    a type that is already known gains `ext' as one more extension.

    The mapping goes to the standard types when `strict' is true and
    to the non-standard types otherwise.
    """
    database = _db
    if database is None:
        # First use: build the module-wide database, then pick it up.
        init()
        database = _db
    return database.add_type(type, ext, strict)
358
359
def init(files=None):
    """Build or extend the module-wide database and publish its tables.

    With `files' set to None, or when no database exists yet, a fresh
    MimeTypes instance is created, the Windows registry is consulted
    where supported, and the files in knownfiles (followed by `files',
    if given) are read.  Otherwise only `files' are read into the
    existing database.  Entries that are not regular files are skipped.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    # Must be set first so that MimeTypes.__init__() doesn't call us again.
    inited = True

    if files is not None and _db is not None:
        # Extend the database that already exists.
        db = _db
    else:
        db = MimeTypes()
        # Quick return if not supported
        db.read_windows_registry()
        files = knownfiles if files is None else knownfiles + list(files)

    for path in filter(os.path.isfile, files):
        db.read(path)

    suffix_map = db.suffix_map
    encodings_map = db.encodings_map
    common_types = db.types_map[False]
    types_map = db.types_map[True]
    # Make the DB a global variable now that it is fully initialized
    _db = db
386
387
def read_mime_types(file):
    """Parse one mime.types-format file named by `file'.

    Returns the dictionary mapping suffixes to standard types of a new
    MimeTypes instance after the file has been read into it, or None
    if the file cannot be opened.
    """
    try:
        stream = open(file, encoding='utf-8')
    except OSError:
        return None
    with stream:
        registry = MimeTypes()
        registry.readfp(stream, True)
    return registry.types_map[True]
397
398
def _default_mime_types():
    """Install the built-in default tables.

    Binds the module-level names suffix_map, encodings_map, types_map
    and common_types to newly built dictionaries.  Each private
    _*_default name is bound to the same dictionary object as its
    public counterpart.
    """
    global suffix_map, _suffix_map_default
    global encodings_map, _encodings_map_default
    global types_map, _types_map_default
    global common_types, _common_types_default

    # Shorthand suffixes and the suffix sequence each one stands for.
    suffix_map = _suffix_map_default = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    # Suffixes that denote an encoding, mapped to the encoding name.
    encodings_map = _encodings_map_default = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        '.br': 'br',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted by mime type.
    # Make sure the entry with the preferred file extension for a particular mime type
    # appears before any others of the same mimetype.
    types_map = _types_map_default = {
        '.js'     : 'application/javascript',
        '.mjs'    : 'application/javascript',
        '.json'   : 'application/json',
        '.webmanifest': 'application/manifest+json',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.wiz'    : 'application/msword',
        '.nq'     : 'application/n-quads',
        '.nt'     : 'application/n-triples',
        '.bin'    : 'application/octet-stream',
        '.a'      : 'application/octet-stream',
        '.dll'    : 'application/octet-stream',
        '.exe'    : 'application/octet-stream',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.so'     : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.pdf'    : 'application/pdf',
        '.p7c'    : 'application/pkcs7-mime',
        '.ps'     : 'application/postscript',
        '.ai'     : 'application/postscript',
        '.eps'    : 'application/postscript',
        '.trig'   : 'application/trig',
        '.m3u'    : 'application/vnd.apple.mpegurl',
        '.m3u8'   : 'application/vnd.apple.mpegurl',
        '.xls'    : 'application/vnd.ms-excel',
        '.xlb'    : 'application/vnd.ms-excel',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.wasm'   : 'application/wasm',
        '.bcpio'  : 'application/x-bcpio',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.dvi'    : 'application/x-dvi',
        '.gtar'   : 'application/x-gtar',
        '.hdf'    : 'application/x-hdf',
        '.h5'     : 'application/x-hdf5',
        '.latex'  : 'application/x-latex',
        '.mif'    : 'application/x-mif',
        '.cdf'    : 'application/x-netcdf',
        '.nc'     : 'application/x-netcdf',
        '.p12'    : 'application/x-pkcs12',
        '.pfx'    : 'application/x-pkcs12',
        '.ram'    : 'application/x-pn-realaudio',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.swf'    : 'application/x-shockwave-flash',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.roff'   : 'application/x-troff',
        '.t'      : 'application/x-troff',
        '.tr'     : 'application/x-troff',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.ms'     : 'application/x-troff-ms',
        '.ustar'  : 'application/x-ustar',
        '.src'    : 'application/x-wais-source',
        '.xsl'    : 'application/xml',
        '.rdf'    : 'application/xml',
        '.wsdl'   : 'application/xml',
        '.xpdl'   : 'application/xml',
        '.zip'    : 'application/zip',
        '.3gp'    : 'audio/3gpp',
        '.3gpp'   : 'audio/3gpp',
        '.3g2'    : 'audio/3gpp2',
        '.3gpp2'  : 'audio/3gpp2',
        '.aac'    : 'audio/aac',
        '.adts'   : 'audio/aac',
        '.loas'   : 'audio/aac',
        '.ass'    : 'audio/aac',
        '.au'     : 'audio/basic',
        '.snd'    : 'audio/basic',
        '.mp3'    : 'audio/mpeg',
        '.mp2'    : 'audio/mpeg',
        '.opus'   : 'audio/opus',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.ra'     : 'audio/x-pn-realaudio',
        '.wav'    : 'audio/x-wav',
        '.avif'   : 'image/avif',
        '.bmp'    : 'image/bmp',
        '.gif'    : 'image/gif',
        '.ief'    : 'image/ief',
        '.jpg'    : 'image/jpeg',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.heic'   : 'image/heic',
        '.heif'   : 'image/heif',
        '.png'    : 'image/png',
        '.svg'    : 'image/svg+xml',
        '.tiff'   : 'image/tiff',
        '.tif'    : 'image/tiff',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.ras'    : 'image/x-cmu-raster',
        '.pnm'    : 'image/x-portable-anymap',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pgm'    : 'image/x-portable-graymap',
        '.ppm'    : 'image/x-portable-pixmap',
        '.rgb'    : 'image/x-rgb',
        '.xbm'    : 'image/x-xbitmap',
        '.xpm'    : 'image/x-xpixmap',
        '.xwd'    : 'image/x-xwindowdump',
        '.eml'    : 'message/rfc822',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.nws'    : 'message/rfc822',
        '.css'    : 'text/css',
        '.csv'    : 'text/csv',
        '.html'   : 'text/html',
        '.htm'    : 'text/html',
        '.n3'     : 'text/n3',
        '.txt'    : 'text/plain',
        '.bat'    : 'text/plain',
        '.c'      : 'text/plain',
        '.h'      : 'text/plain',
        '.ksh'    : 'text/plain',
        '.pl'     : 'text/plain',
        '.srt'    : 'text/plain',
        '.rtx'    : 'text/richtext',
        '.tsv'    : 'text/tab-separated-values',
        '.vtt'    : 'text/vtt',
        '.py'     : 'text/x-python',
        '.etx'    : 'text/x-setext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.vcf'    : 'text/x-vcard',
        '.xml'    : 'text/xml',
        '.mp4'    : 'video/mp4',
        '.mpeg'   : 'video/mpeg',
        '.m1v'    : 'video/mpeg',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.mov'    : 'video/quicktime',
        '.qt'     : 'video/quicktime',
        '.webm'   : 'video/webm',
        '.avi'    : 'video/x-msvideo',
        '.movie'  : 'video/x-sgi-movie',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = _common_types_default = {
        '.rtf' : 'application/rtf',
        '.midi': 'audio/midi',
        '.mid' : 'audio/midi',
        '.jpg' : 'image/jpg',
        '.pict': 'image/pict',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.webp': 'image/webp',
        '.xul' : 'text/xul',
        }


# Build the default tables at import time; MimeTypes.__init__() reads the
# _*_default names bound here.
_default_mime_types()
599
600
def _main():
    """Command-line interface: guess a type or an extension per argument.

    Options are parsed with getopt; --help prints the usage text and
    exits with status 0, an option error prints it together with the
    error and exits with status 1.
    """
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        # Print the usage text, then the optional message, and exit.
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    strict = True
    want_extension = False
    for opt, _value in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = False
        elif opt in ('-e', '--extension'):
            want_extension = True

    for gtype in args:
        if want_extension:
            guess = guess_extension(gtype, strict)
            if guess:
                print(guess)
            else:
                print("I don't know anything about type", gtype)
        else:
            guess, encoding = guess_type(gtype, strict)
            if guess:
                print('type:', guess, 'encoding:', encoding)
            else:
                print("I don't know anything about type", gtype)


if __name__ == '__main__':
    _main()
649