"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=True) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles (on Windows, the
  default values are taken from the registry)
read_mime_types(file) -- parse one file, return a dictionary or None
"""

import os
import sys
import posixpath
import urllib.parse

try:
    from _winapi import _mimetypes_read_windows_registry
except ImportError:
    _mimetypes_read_windows_registry = None

try:
    import winreg as _winreg
except ImportError:
    _winreg = None

__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_file_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]

# Files parsed by init(), in order; later files override earlier ones for
# the same extension.  The list is left exactly as shipped (including the
# repeated entry) because the order of reads determines the order of the
# extension lists used by guess_extension().
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

inited = False
_db = None


class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- suffix -> encoding name (copy of the module defaults)
    suffix_map -- suffix alias -> expanded suffix (copy of the defaults)
    types_map -- pair of dicts (non-strict, strict) mapping suffix -> type
    types_map_inv -- pair of dicts (non-strict, strict) mapping
        type -> list of suffixes, in order of registration
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded with the built-in defaults.

        Each name in `filenames' is then parsed with read(); `strict'
        selects which table those files populate.
        """
        if not inited:
            init()
        self.encodings_map = _encodings_map_default.copy()
        self.suffix_map = _suffix_map_default.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in _types_map_default.items():
            self.add_type(type, ext, True)
        for (ext, type) in _common_types_default.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # The tables are stored as a (non-strict, strict) pair indexed by a
        # bool; coerce so that any truthy/falsy value selects a valid slot,
        # matching how `strict' is interpreted everywhere else.
        strict = bool(strict)
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file which is either a URL or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; suffix aliases and type suffixes are lower-cased
        before lookup, so they match case-insensitively.

        The suffixes .tgz, .taz and .tz are all mapped to '.tar.gz'.
        (This is table-driven too, using the dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        # TODO: Deprecate accepting file paths (in particular path-like objects).
        url = os.fspath(url)
        p = urllib.parse.urlparse(url)
        # A one-character "scheme" is treated as a Windows drive letter,
        # i.e. the argument is a file path rather than a URL.
        if p.scheme and len(p.scheme) > 1:
            scheme = p.scheme
            url = p.path
        else:
            return self.guess_file_type(url, strict=strict)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mime_type = url[:semi]
            else:
                mime_type = url[:comma]
            # A leading parameter (contains '=') or a missing subtype means
            # no media type was given, so the default applies.
            if '=' in mime_type or '/' not in mime_type:
                mime_type = 'text/plain'
            return mime_type, None  # never compressed, so encoding is None
        return self._guess_file_type(url, strict, posixpath.splitext)

    def guess_file_type(self, path, *, strict=True):
        """Guess the type of a file based on its path.

        Similar to guess_type(), but takes file path instead of URL.
        """
        path = os.fsdecode(path)
        # Drop any drive/UNC prefix so it cannot be mistaken for a suffix.
        path = os.path.splitdrive(path)[1]
        return self._guess_file_type(path, strict, os.path.splitext)

    def _guess_file_type(self, path, strict, splitext):
        """Return (type, encoding) for `path' using `splitext' to split it.

        Suffix aliases are expanded first, then at most one encoding
        suffix is stripped, and the remaining suffix is looked up in the
        strict table and, when `strict' is false, in the non-strict one.
        """
        base, ext = splitext(path)
        # Expand aliases such as '.tgz' -> '.tar.gz' (case-insensitively).
        while (ext_lower := ext.lower()) in self.suffix_map:
            base, ext = splitext(base + self.suffix_map[ext_lower])
        # encodings_map is case sensitive
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = splitext(base)
        else:
            encoding = None
        ext = ext.lower()
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        elif strict:
            return None, encoding
        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        An empty list is returned when no extension is known.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy so callers cannot mutate the internal table.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' only needs a readline() method.  Each non-empty line is a
        type followed by zero or more suffixes (without the dot);
        everything from a word starting with '#' onwards is a comment.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while line := fp.readline():
            words = line.split()
            for i, word in enumerate(words):
                if word[0] == '#':
                    # Comment: ignore this word and the rest of the line.
                    words = words[:i]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when neither the accelerated reader nor winreg
        is available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        if not _mimetypes_read_windows_registry and not _winreg:
            return

        # The readers call back with (type, ext) only, so `strict' has to be
        # bound here.  Passing the bare bound method would fall back to
        # add_type()'s default and ignore strict=False.
        def add_type(type, ext):
            self.add_type(type, ext, strict)

        # Accelerated function if it is available
        if _mimetypes_read_windows_registry:
            _mimetypes_read_windows_registry(add_type)
        elif _winreg:
            self._read_windows_registry(add_type)

    @classmethod
    def _read_windows_registry(cls, add_type):
        """Call add_type(mimetype, ext) for each extension key under
        HKEY_CLASSES_ROOT that has a string 'Content Type' value."""
        def enum_types(mimedb):
            # Yield sub-key names until EnumKey runs past the last index.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # Only check file extensions
                        if not subkeyname.startswith("."):
                            continue
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        add_type(mimetype, subkeyname)
                except OSError:
                    continue

def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; suffix aliases and
    type suffixes are lower-cased before lookup, so they match
    case-insensitively.

    The suffixes .tgz, .taz and .tz are all mapped to ".tar.gz".
    (This is table-driven too, using the dictionary suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_type(url, strict)


def guess_file_type(path, *, strict=True):
    """Guess the type of a file based on its path.

    Similar to guess_type(), but takes file path instead of URL.
    """
    if _db is None:
        init()
    return _db.guess_file_type(path, strict=strict)


def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', an empty
    list is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)

def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_extension(type, strict)

def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)


def init(files=None):
    """(Re)build the module-level database and the public map globals.

    With files=None, or on the first call, a fresh MimeTypes is created,
    the Windows registry is consulted where supported, and knownfiles
    (plus `files', if given) are parsed.  On later calls with an explicit
    `files', the existing database is kept and only `files' are parsed.
    Names that are not existing regular files are skipped.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again

    if files is None or _db is None:
        db = MimeTypes()
        # Quick return if not supported
        db.read_windows_registry()

        if files is None:
            files = knownfiles
        else:
            files = knownfiles + list(files)
    else:
        db = _db

    for file in files:
        if os.path.isfile(file):
            db.read(file)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db


def read_mime_types(file):
    """Parse one mime.types-format file.

    Return the strict suffix -> type dictionary of a fresh MimeTypes
    (built-in defaults plus the entries from `file'), or None when the
    file cannot be opened.
    """
    try:
        f = open(file, encoding='utf-8')
    except OSError:
        return None
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]


def _default_mime_types():
    """Install the built-in tables as both the public module globals and
    the private *_default references used to seed every MimeTypes."""
    global suffix_map, _suffix_map_default
    global encodings_map, _encodings_map_default
    global types_map, _types_map_default
    global common_types, _common_types_default

    suffix_map = _suffix_map_default = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    encodings_map = _encodings_map_default = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        '.br': 'br',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted by mime type.
    # Make sure the entry with the preferred file extension for a particular mime type
    # appears before any others of the same mimetype.
    types_map = _types_map_default = {
        '.js'     : 'text/javascript',
        '.mjs'    : 'text/javascript',
        '.json'   : 'application/json',
        '.webmanifest': 'application/manifest+json',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.wiz'    : 'application/msword',
        '.nq'     : 'application/n-quads',
        '.nt'     : 'application/n-triples',
        '.bin'    : 'application/octet-stream',
        '.a'      : 'application/octet-stream',
        '.dll'    : 'application/octet-stream',
        '.exe'    : 'application/octet-stream',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.so'     : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.pdf'    : 'application/pdf',
        '.p7c'    : 'application/pkcs7-mime',
        '.ps'     : 'application/postscript',
        '.ai'     : 'application/postscript',
        '.eps'    : 'application/postscript',
        '.trig'   : 'application/trig',
        '.m3u'    : 'application/vnd.apple.mpegurl',
        '.m3u8'   : 'application/vnd.apple.mpegurl',
        '.xls'    : 'application/vnd.ms-excel',
        '.xlb'    : 'application/vnd.ms-excel',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.wasm'   : 'application/wasm',
        '.bcpio'  : 'application/x-bcpio',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.dvi'    : 'application/x-dvi',
        '.gtar'   : 'application/x-gtar',
        '.hdf'    : 'application/x-hdf',
        '.h5'     : 'application/x-hdf5',
        '.latex'  : 'application/x-latex',
        '.mif'    : 'application/x-mif',
        '.cdf'    : 'application/x-netcdf',
        '.nc'     : 'application/x-netcdf',
        '.p12'    : 'application/x-pkcs12',
        '.pfx'    : 'application/x-pkcs12',
        '.ram'    : 'application/x-pn-realaudio',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.swf'    : 'application/x-shockwave-flash',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.roff'   : 'application/x-troff',
        '.t'      : 'application/x-troff',
        '.tr'     : 'application/x-troff',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.ms'     : 'application/x-troff-ms',
        '.ustar'  : 'application/x-ustar',
        '.src'    : 'application/x-wais-source',
        '.xsl'    : 'application/xml',
        '.rdf'    : 'application/xml',
        '.wsdl'   : 'application/xml',
        '.xpdl'   : 'application/xml',
        '.zip'    : 'application/zip',
        '.3gp'    : 'audio/3gpp',
        '.3gpp'   : 'audio/3gpp',
        '.3g2'    : 'audio/3gpp2',
        '.3gpp2'  : 'audio/3gpp2',
        '.aac'    : 'audio/aac',
        '.adts'   : 'audio/aac',
        '.loas'   : 'audio/aac',
        '.ass'    : 'audio/aac',
        '.au'     : 'audio/basic',
        '.snd'    : 'audio/basic',
        '.mp3'    : 'audio/mpeg',
        '.mp2'    : 'audio/mpeg',
        '.opus'   : 'audio/opus',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.ra'     : 'audio/x-pn-realaudio',
        '.wav'    : 'audio/x-wav',
        '.avif'   : 'image/avif',
        '.bmp'    : 'image/bmp',
        '.gif'    : 'image/gif',
        '.ief'    : 'image/ief',
        '.jpg'    : 'image/jpeg',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.heic'   : 'image/heic',
        '.heif'   : 'image/heif',
        '.png'    : 'image/png',
        '.svg'    : 'image/svg+xml',
        '.tiff'   : 'image/tiff',
        '.tif'    : 'image/tiff',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.webp'   : 'image/webp',
        '.ras'    : 'image/x-cmu-raster',
        '.pnm'    : 'image/x-portable-anymap',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pgm'    : 'image/x-portable-graymap',
        '.ppm'    : 'image/x-portable-pixmap',
        '.rgb'    : 'image/x-rgb',
        '.xbm'    : 'image/x-xbitmap',
        '.xpm'    : 'image/x-xpixmap',
        '.xwd'    : 'image/x-xwindowdump',
        '.eml'    : 'message/rfc822',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.nws'    : 'message/rfc822',
        '.css'    : 'text/css',
        '.csv'    : 'text/csv',
        '.html'   : 'text/html',
        '.htm'    : 'text/html',
        '.md'     : 'text/markdown',
        '.markdown': 'text/markdown',
        '.n3'     : 'text/n3',
        '.txt'    : 'text/plain',
        '.bat'    : 'text/plain',
        '.c'      : 'text/plain',
        '.h'      : 'text/plain',
        '.ksh'    : 'text/plain',
        '.pl'     : 'text/plain',
        '.srt'    : 'text/plain',
        '.rtx'    : 'text/richtext',
        '.rtf'    : 'text/rtf',
        '.tsv'    : 'text/tab-separated-values',
        '.vtt'    : 'text/vtt',
        '.py'     : 'text/x-python',
        '.rst'    : 'text/x-rst',
        '.etx'    : 'text/x-setext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.vcf'    : 'text/x-vcard',
        '.xml'    : 'text/xml',
        '.mp4'    : 'video/mp4',
        '.mpeg'   : 'video/mpeg',
        '.m1v'    : 'video/mpeg',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.mov'    : 'video/quicktime',
        '.qt'     : 'video/quicktime',
        '.webm'   : 'video/webm',
        '.avi'    : 'video/x-msvideo',
        '.movie'  : 'video/x-sgi-movie',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = _common_types_default = {
        '.rtf' : 'application/rtf',
        '.midi': 'audio/midi',
        '.mid' : 'audio/midi',
        '.jpg' : 'image/jpg',
        '.pict': 'image/pict',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.xul' : 'text/xul',
        }


_default_mime_types()


def _main():
    """Command-line entry point: print the guessed type (or, with -e,
    the guessed extension) for each positional argument."""
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        print(USAGE)
        if msg: print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess: print("I don't know anything about type", gtype)
            else: print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess: print("I don't know anything about type", gtype)
            else: print('type:', guess, 'encoding:', encoding)


if __name__ == '__main__':
    _main()