• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Guess the MIME type of a file.
2
3This module defines two useful functions:
4
5guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.
6
7guess_extension(type, strict=True) -- guess the extension for a given MIME type.
8
9It also contains the following, for tuning the behavior:
10
11Data:
12
13knownfiles -- list of files to parse
14inited -- flag set when init() has been called
15suffix_map -- dictionary mapping suffixes to suffixes
16encodings_map -- dictionary mapping suffixes to encodings
17types_map -- dictionary mapping suffixes to types
18
19Functions:
20
21init([files]) -- parse a list of files, default knownfiles (on Windows, the
22  default values are taken from the registry)
23read_mime_types(file) -- parse one file, return a dictionary or None
24"""
25
26import os
27import sys
28import posixpath
29import urllib.parse
30
31try:
32    from _winapi import _mimetypes_read_windows_registry
33except ImportError:
34    _mimetypes_read_windows_registry = None
35
36try:
37    import winreg as _winreg
38except ImportError:
39    _winreg = None
40
# Names exported by a star-import of this module.
__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_file_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]

# Candidate mime.types files.  init() walks this list in order and parses
# every entry that exists as a regular file.
# NOTE(review): "/usr/local/etc/httpd/conf/mime.types" is listed twice below,
# so init() parses that file twice when it exists.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Set to True by init(); MimeTypes.__init__() checks it so that it does not
# call init() a second time.
inited = False
# Shared MimeTypes instance behind the module-level functions; stays None
# until init() has finished building it.
_db = None
62
63
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- maps an encoding suffix (e.g. '.gz') to an encoding name
    suffix_map    -- maps a shorthand suffix (e.g. '.tgz') to its expansion
    types_map     -- pair of dicts (non-strict, strict): suffix -> type
    types_map_inv -- pair of dicts (non-strict, strict): type -> [suffixes]
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded with the module's default tables.

        Every file named in `filenames' is then parsed with read(); `strict'
        is forwarded to read() and selects the table those entries go to.
        """
        if not inited:
            init()
        self.encodings_map = _encodings_map_default.copy()
        self.suffix_map = _suffix_map_default.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for ext, mime_type in _types_map_default.items():
            self.add_type(mime_type, ext, True)
        for ext, mime_type in _common_types_default.items():
            self.add_type(mime_type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # `strict' doubles as the index into the (non-strict, strict) pairs.
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file which is either a URL or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are lower-cased before lookup and so
        match case insensitively.

        Shorthand suffixes such as .tgz, .taz and .tz are expanded to
        '.tar.gz' via the dictionary suffix_map; that lookup is done on
        the lower-cased suffix as well.

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        # TODO: Deprecate accepting file paths (in particular path-like objects).
        url = os.fspath(url)
        p = urllib.parse.urlparse(url)
        if not p.scheme or len(p.scheme) <= 1:
            # No scheme, or a one-character one (presumably a drive letter
            # such as "C:"): treat the argument as a local file path.
            return self.guess_file_type(url, strict=strict)
        scheme = p.scheme
        url = p.path
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            type = url[:semi] if semi >= 0 else url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None           # never compressed, so encoding is None
        return self._guess_file_type(url, strict, posixpath.splitext)

    def guess_file_type(self, path, *, strict=True):
        """Guess the type of a file based on its path.

        Similar to guess_type(), but takes file path instead of URL.
        """
        path = os.fsdecode(path)
        # Drop any drive prefix so it cannot be mistaken for part of the name.
        path = os.path.splitdrive(path)[1]
        return self._guess_file_type(path, strict, os.path.splitext)

    def _guess_file_type(self, path, strict, splitext):
        """Shared suffix lookup behind guess_type() and guess_file_type().

        `splitext' is the function used to split off the suffix
        (posixpath.splitext for URLs, os.path.splitext for file paths).
        Returns a (type, encoding) tuple; either item may be None.
        """
        base, ext = splitext(path)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until none apply.
        while (ext_lower := ext.lower()) in self.suffix_map:
            base, ext = splitext(base + self.suffix_map[ext_lower])
        # encodings_map is case sensitive
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = splitext(base)
        else:
            encoding = None
        ext = ext.lower()
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        elif strict:
            return None, encoding
        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        The list is empty when no extension is known for `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy so that callers cannot mutate the stored list.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        Each non-empty line holds a MIME type followed by zero or more
        suffixes (without the leading dot); a word starting with '#'
        begins a comment that runs to the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while line := fp.readline():
            words = line.split()
            # Cut the line at the first word that starts a comment.
            for index, word in enumerate(words):
                if word.startswith('#'):
                    del words[index:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when neither the accelerated reader nor the winreg
        module is available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        if not _mimetypes_read_windows_registry and not _winreg:
            return

        # The registry readers call back with (type, ext) only, so the
        # requested table has to be bound here.  Passing self.add_type
        # directly would silently fall back to its strict=True default.
        strict = bool(strict)
        add_type = lambda type, ext: self.add_type(type, ext, strict)

        # Accelerated function if it is available
        if _mimetypes_read_windows_registry:
            _mimetypes_read_windows_registry(add_type)
        elif _winreg:
            self._read_windows_registry(add_type)

    @classmethod
    def _read_windows_registry(cls, add_type):
        """Walk HKEY_CLASSES_ROOT with winreg and report extension mappings.

        `add_type' is called as add_type(mimetype, extension) for every
        ".ext" subkey that has a string 'Content Type' value.
        """
        def enum_types(mimedb):
            # Yield subkey names until EnumKey signals the end with OSError,
            # skipping names that contain an embedded NUL.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; filter before opening the key
                # so unrelated subkeys are never opened at all.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                except OSError:
                    continue
                if datatype != _winreg.REG_SZ:
                    continue
                add_type(mimetype, subkeyname)
301
def guess_type(url, strict=True):
    """Guess the MIME type and encoding of a file from its URL.

    The result is a (type, encoding) tuple.  `type' is a string of the
    form type/subtype, suitable for a MIME Content-type header, or None
    when the suffix is missing or unknown.  `encoding' names the program
    used to encode the file (e.g. compress or gzip), or is None when the
    file is not encoded.  All lookups are table driven.

    Encoding suffixes are matched case sensitively.  Type suffixes, and
    the shorthand suffixes in suffix_map (such as .tgz, .taz and .tz,
    which expand to ".tar.gz"), are lower-cased before lookup.

    Passing a false `strict' additionally consults a set of commonly
    found, but non-standard types.
    """
    if _db is None:
        # First use: build the shared database lazily.
        init()
    db = _db
    return db.guess_type(url, strict)
323
324
def guess_file_type(path, *, strict=True):
    """Guess the MIME type and encoding of a file from its path.

    Works like guess_type(), except that the argument is always treated
    as a file path rather than a URL.
    """
    if _db is None:
        # First use: build the shared database lazily.
        init()
    db = _db
    return db.guess_file_type(path, strict=strict)
333
334
def guess_all_extensions(type, strict=True):
    """Return every filename extension known for the MIME type `type'.

    The result is a list of strings, each including the leading dot
    ('.').  The extensions are not guaranteed to have been associated
    with any particular data stream, but guess_type() would map each of
    them to `type'.  The list is empty when nothing is known for `type'.

    Passing a false `strict' additionally consults a set of commonly
    found, but non-standard types.
    """
    if _db is None:
        # First use: build the shared database lazily.
        init()
    db = _db
    return db.guess_all_extensions(type, strict)
351
def guess_extension(type, strict=True):
    """Return one filename extension for the MIME type `type'.

    The result is a string including the leading dot ('.'), or None when
    no extension can be guessed for `type'.  The extension is not
    guaranteed to have been associated with any particular data stream,
    but guess_type() would map it to `type'.

    Passing a false `strict' additionally consults a set of commonly
    found, but non-standard types.
    """
    if _db is None:
        # First use: build the shared database lazily.
        init()
    db = _db
    return db.guess_extension(type, strict)
367
def add_type(type, ext, strict=True):
    """Register a type/extension pair in the shared database.

    An extension that is already known is re-pointed at the new type.
    A type that is already known gains `ext' as one more of its
    extensions.

    A true `strict' files the pair under the standard types; a false
    one files it under the non-standard types.
    """
    if _db is None:
        # First use: build the shared database lazily.
        init()
    db = _db
    return db.add_type(type, ext, strict)
383
384
def init(files=None):
    """(Re)build the module-wide MIME database and publish its tables.

    With files=None, or when no database exists yet, a fresh MimeTypes
    instance is created, the Windows registry is read (a no-op where that
    is unsupported) and every existing file in knownfiles is parsed,
    followed by any extra `files'.  Otherwise the existing database is
    kept and only the given `files' are parsed into it.

    Afterwards the globals encodings_map, suffix_map, types_map and
    common_types refer to the tables of the resulting database.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again

    start_fresh = files is None or _db is None
    if start_fresh:
        db = MimeTypes()
        # Quick return if not supported
        db.read_windows_registry()
        files = knownfiles if files is None else knownfiles + list(files)
    else:
        db = _db

    for candidate in files:
        if os.path.isfile(candidate):
            db.read(candidate)

    suffix_map = db.suffix_map
    encodings_map = db.encodings_map
    common_types, types_map = db.types_map
    # Make the DB a global variable now that it is fully initialized
    _db = db
411
412
def read_mime_types(file):
    """Parse one mime.types-format file and return its suffix -> type map.

    `file' is the path of the file to read.  Returns None when the file
    cannot be opened; otherwise returns the strict types table of a new
    MimeTypes instance after the file has been parsed into it.
    """
    try:
        stream = open(file, encoding='utf-8')
    except OSError:
        return None
    with stream:
        parsed = MimeTypes()
        parsed.readfp(stream, True)
    return parsed.types_map[True]
422
423
def _default_mime_types():
    """Define the built-in default tables.

    Binds each public module global (suffix_map, encodings_map, types_map,
    common_types) and its private ``_*_default`` counterpart to the same
    newly created dict.  Takes no arguments and returns None.
    """
    global suffix_map, _suffix_map_default
    global encodings_map, _encodings_map_default
    global types_map, _types_map_default
    global common_types, _common_types_default

    # Shorthand suffixes and the compound suffix each one stands for.
    suffix_map = _suffix_map_default = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    # Suffixes that denote an encoding, mapped to the encoding's name.
    encodings_map = _encodings_map_default = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        '.br': 'br',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted by mime type.
    # Make sure the entry with the preferred file extension for a particular mime type
    # appears before any others of the same mimetype.
    types_map = _types_map_default = {
        '.js'     : 'text/javascript',
        '.mjs'    : 'text/javascript',
        '.json'   : 'application/json',
        '.webmanifest': 'application/manifest+json',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.wiz'    : 'application/msword',
        '.nq'     : 'application/n-quads',
        '.nt'     : 'application/n-triples',
        '.bin'    : 'application/octet-stream',
        '.a'      : 'application/octet-stream',
        '.dll'    : 'application/octet-stream',
        '.exe'    : 'application/octet-stream',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.so'     : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.pdf'    : 'application/pdf',
        '.p7c'    : 'application/pkcs7-mime',
        '.ps'     : 'application/postscript',
        '.ai'     : 'application/postscript',
        '.eps'    : 'application/postscript',
        '.trig'   : 'application/trig',
        '.m3u'    : 'application/vnd.apple.mpegurl',
        '.m3u8'   : 'application/vnd.apple.mpegurl',
        '.xls'    : 'application/vnd.ms-excel',
        '.xlb'    : 'application/vnd.ms-excel',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.wasm'   : 'application/wasm',
        '.bcpio'  : 'application/x-bcpio',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.dvi'    : 'application/x-dvi',
        '.gtar'   : 'application/x-gtar',
        '.hdf'    : 'application/x-hdf',
        '.h5'     : 'application/x-hdf5',
        '.latex'  : 'application/x-latex',
        '.mif'    : 'application/x-mif',
        '.cdf'    : 'application/x-netcdf',
        '.nc'     : 'application/x-netcdf',
        '.p12'    : 'application/x-pkcs12',
        '.pfx'    : 'application/x-pkcs12',
        '.ram'    : 'application/x-pn-realaudio',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.swf'    : 'application/x-shockwave-flash',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.roff'   : 'application/x-troff',
        '.t'      : 'application/x-troff',
        '.tr'     : 'application/x-troff',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.ms'     : 'application/x-troff-ms',
        '.ustar'  : 'application/x-ustar',
        '.src'    : 'application/x-wais-source',
        '.xsl'    : 'application/xml',
        '.rdf'    : 'application/xml',
        '.wsdl'   : 'application/xml',
        '.xpdl'   : 'application/xml',
        '.zip'    : 'application/zip',
        '.3gp'    : 'audio/3gpp',
        '.3gpp'   : 'audio/3gpp',
        '.3g2'    : 'audio/3gpp2',
        '.3gpp2'  : 'audio/3gpp2',
        '.aac'    : 'audio/aac',
        '.adts'   : 'audio/aac',
        '.loas'   : 'audio/aac',
        '.ass'    : 'audio/aac',
        '.au'     : 'audio/basic',
        '.snd'    : 'audio/basic',
        '.mp3'    : 'audio/mpeg',
        '.mp2'    : 'audio/mpeg',
        '.opus'   : 'audio/opus',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.ra'     : 'audio/x-pn-realaudio',
        '.wav'    : 'audio/x-wav',
        '.avif'   : 'image/avif',
        '.bmp'    : 'image/bmp',
        '.gif'    : 'image/gif',
        '.ief'    : 'image/ief',
        '.jpg'    : 'image/jpeg',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.heic'   : 'image/heic',
        '.heif'   : 'image/heif',
        '.png'    : 'image/png',
        '.svg'    : 'image/svg+xml',
        '.tiff'   : 'image/tiff',
        '.tif'    : 'image/tiff',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.webp'   : 'image/webp',
        '.ras'    : 'image/x-cmu-raster',
        '.pnm'    : 'image/x-portable-anymap',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pgm'    : 'image/x-portable-graymap',
        '.ppm'    : 'image/x-portable-pixmap',
        '.rgb'    : 'image/x-rgb',
        '.xbm'    : 'image/x-xbitmap',
        '.xpm'    : 'image/x-xpixmap',
        '.xwd'    : 'image/x-xwindowdump',
        '.eml'    : 'message/rfc822',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.nws'    : 'message/rfc822',
        '.css'    : 'text/css',
        '.csv'    : 'text/csv',
        '.html'   : 'text/html',
        '.htm'    : 'text/html',
        '.md'     : 'text/markdown',
        '.markdown': 'text/markdown',
        '.n3'     : 'text/n3',
        '.txt'    : 'text/plain',
        '.bat'    : 'text/plain',
        '.c'      : 'text/plain',
        '.h'      : 'text/plain',
        '.ksh'    : 'text/plain',
        '.pl'     : 'text/plain',
        '.srt'    : 'text/plain',
        '.rtx'    : 'text/richtext',
        '.rtf'    : 'text/rtf',
        '.tsv'    : 'text/tab-separated-values',
        '.vtt'    : 'text/vtt',
        '.py'     : 'text/x-python',
        '.rst'    : 'text/x-rst',
        '.etx'    : 'text/x-setext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.vcf'    : 'text/x-vcard',
        '.xml'    : 'text/xml',
        '.mp4'    : 'video/mp4',
        '.mpeg'   : 'video/mpeg',
        '.m1v'    : 'video/mpeg',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.mov'    : 'video/quicktime',
        '.qt'     : 'video/quicktime',
        '.webm'   : 'video/webm',
        '.avi'    : 'video/x-msvideo',
        '.movie'  : 'video/x-sgi-movie',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = _common_types_default = {
        '.rtf' : 'application/rtf',
        '.midi': 'audio/midi',
        '.mid' : 'audio/midi',
        '.jpg' : 'image/jpg',
        '.pict': 'image/pict',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.xul' : 'text/xul',
        }
625
626
# Build the default tables at import time; MimeTypes.__init__() reads the
# _*_default names that this call defines.
_default_mime_types()
628
629
630def _main():
631    import getopt
632
633    USAGE = """\
634Usage: mimetypes.py [options] type
635
636Options:
637    --help / -h       -- print this message and exit
638    --lenient / -l    -- additionally search of some common, but non-standard
639                         types.
640    --extension / -e  -- guess extension instead of type
641
642More than one type argument may be given.
643"""
644
645    def usage(code, msg=''):
646        print(USAGE)
647        if msg: print(msg)
648        sys.exit(code)
649
650    try:
651        opts, args = getopt.getopt(sys.argv[1:], 'hle',
652                                   ['help', 'lenient', 'extension'])
653    except getopt.error as msg:
654        usage(1, msg)
655
656    strict = 1
657    extension = 0
658    for opt, arg in opts:
659        if opt in ('-h', '--help'):
660            usage(0)
661        elif opt in ('-l', '--lenient'):
662            strict = 0
663        elif opt in ('-e', '--extension'):
664            extension = 1
665    for gtype in args:
666        if extension:
667            guess = guess_extension(gtype, strict)
668            if not guess: print("I don't know anything about type", gtype)
669            else: print(guess)
670        else:
671            guess, encoding = guess_type(gtype, strict)
672            if not guess: print("I don't know anything about type", gtype)
673            else: print('type:', guess, 'encoding:', encoding)
674
675
# Run the command-line interface when this file is executed as a script.
if __name__ == '__main__':
    _main()
678