"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=True) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles (on Windows, the
  default values are taken from the registry)
read_mime_types(file) -- parse one file, return a dictionary or None
"""

import os
import sys
import posixpath
import urllib.parse

try:
    from _winapi import _mimetypes_read_windows_registry
except ImportError:
    _mimetypes_read_windows_registry = None

try:
    import winreg as _winreg
except ImportError:
    _winreg = None

__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]

knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

inited = False
_db = None


class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.
    """

    def __init__(self, filenames=(), strict=True):
        """Build a datastore from the built-in defaults plus *filenames*.

        The instance starts from copies of the module's default suffix,
        encoding and type tables; files found by init() are NOT copied
        into it.  Each name in `filenames' is then parsed with read(),
        using `strict' to select the standard or non-standard table.
        """
        if not inited:
            init()
        self.encodings_map = _encodings_map_default.copy()
        self.suffix_map = _suffix_map_default.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in _types_map_default.items():
            self.add_type(type, ext, True)
        for (ext, type) in _common_types_default.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # `strict' doubles as the index into the (non-strict, strict) tuples.
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file which is either a URL or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are lowercased before lookup and so
        match case-insensitively.

        The suffixes .tgz, .taz and .tz are all mapped to '.tar.gz'
        (matched case-insensitively).  This is table-driven too, using
        the dictionary suffix_map.

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        url = os.fspath(url)
        scheme, url = urllib.parse._splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mediatype = url[:semi]
            else:
                mediatype = url[:comma]
            if '=' in mediatype or '/' not in mediatype:
                mediatype = 'text/plain'
            return mediatype, None  # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand aliases such as '.tgz' -> '.tar.gz' until none applies.
        while (ext_lower := ext.lower()) in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext_lower])
        # encodings_map is case sensitive
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        ext = ext.lower()
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        elif strict:
            return None, encoding
        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().  The
        list is empty when no extension is known for `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy so callers cannot mutate the internal inverse table.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        Each non-empty line is a MIME type followed by zero or more
        suffixes (without the leading dot); a word starting with '#'
        begins a comment that runs to the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # readline() is used (rather than iteration) so that any object
        # providing just readline() keeps working.
        while line := fp.readline():
            words = line.split()
            for i, word in enumerate(words):
                if word[0] == '#':
                    # Drop the comment; safe because we stop iterating.
                    del words[i:]
                    break
            if not words:
                continue
            mimetype, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(mimetype, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when neither the accelerated reader nor the winreg
        module is available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        if not _mimetypes_read_windows_registry and not _winreg:
            return

        # Both readers invoke the callback with (type, ext) only, so the
        # requested table must be bound here.  Passing self.add_type
        # unwrapped would silently fall back to its strict=True default.
        strict = bool(strict)

        def add_type(type, ext):
            self.add_type(type, ext, strict)

        # Accelerated function if it is available
        if _mimetypes_read_windows_registry:
            _mimetypes_read_windows_registry(add_type)
        elif _winreg:
            self._read_windows_registry(add_type)

    @classmethod
    def _read_windows_registry(cls, add_type):
        """Walk HKEY_CLASSES_ROOT with winreg and report file extensions.

        For every subkey whose name starts with '.' and which has a
        REG_SZ 'Content Type' value, `add_type(mimetype, extension)' is
        called.  Keys that cannot be opened or queried are skipped.
        """
        def enum_types(mimedb):
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; test before opening so that
                # unrelated keys are never opened at all.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        add_type(mimetype, subkeyname)
                except OSError:
                    continue


def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    lowercased before lookup and so match case-insensitively.

    The suffixes .tgz, .taz and .tz are all mapped to ".tar.gz"
    (matched case-insensitively).  This is table-driven too, using the
    dictionary suffix_map.

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_type(url, strict)


def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', an empty
    list is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)


def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_extension(type, strict)


def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)


def init(files=None):
    """(Re)build the module-level database and publish its tables.

    When `files' is None, or no database exists yet, a fresh MimeTypes
    instance is created, the Windows registry is consulted (a no-op
    where unsupported) and knownfiles -- followed by any names in
    `files' -- are parsed.  Otherwise the existing database is kept
    and only `files' are parsed into it.  Names that are not existing
    files are skipped.

    Afterwards suffix_map, encodings_map, types_map and common_types
    refer to the tables of that database.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again

    if files is None or _db is None:
        db = MimeTypes()
        # Quick return if not supported
        db.read_windows_registry()

        if files is None:
            files = knownfiles
        else:
            files = knownfiles + list(files)
    else:
        db = _db

    for file in files:
        if os.path.isfile(file):
            db.read(file)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db


def read_mime_types(file):
    """Parse one mime.types-format file and return its strict type table.

    Returns None when `file' cannot be opened.  Otherwise the result is
    a dictionary mapping extensions (with leading dot) to MIME types;
    it holds the built-in standard types with the entries from `file'
    added on top.
    """
    try:
        f = open(file, encoding='utf-8')
    except OSError:
        return None
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]


def _default_mime_types():
    """Install the built-in tables as the module-level defaults.

    Sets suffix_map, encodings_map, types_map and common_types together
    with their private `_..._default' aliases, which MimeTypes copies
    from when an instance is created.
    """
    global suffix_map, _suffix_map_default
    global encodings_map, _encodings_map_default
    global types_map, _types_map_default
    global common_types, _common_types_default

    suffix_map = _suffix_map_default = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    encodings_map = _encodings_map_default = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        '.br': 'br',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted by mime type.
    # Make sure the entry with the preferred file extension for a particular mime type
    # appears before any others of the same mimetype.
    types_map = _types_map_default = {
        '.js': 'application/javascript',
        '.mjs': 'application/javascript',
        '.json': 'application/json',
        '.webmanifest': 'application/manifest+json',
        '.doc': 'application/msword',
        '.dot': 'application/msword',
        '.wiz': 'application/msword',
        '.nq': 'application/n-quads',
        '.nt': 'application/n-triples',
        '.bin': 'application/octet-stream',
        '.a': 'application/octet-stream',
        '.dll': 'application/octet-stream',
        '.exe': 'application/octet-stream',
        '.o': 'application/octet-stream',
        '.obj': 'application/octet-stream',
        '.so': 'application/octet-stream',
        '.oda': 'application/oda',
        '.pdf': 'application/pdf',
        '.p7c': 'application/pkcs7-mime',
        '.ps': 'application/postscript',
        '.ai': 'application/postscript',
        '.eps': 'application/postscript',
        '.trig': 'application/trig',
        '.m3u': 'application/vnd.apple.mpegurl',
        '.m3u8': 'application/vnd.apple.mpegurl',
        '.xls': 'application/vnd.ms-excel',
        '.xlb': 'application/vnd.ms-excel',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pot': 'application/vnd.ms-powerpoint',
        '.ppa': 'application/vnd.ms-powerpoint',
        '.pps': 'application/vnd.ms-powerpoint',
        '.pwz': 'application/vnd.ms-powerpoint',
        '.wasm': 'application/wasm',
        '.bcpio': 'application/x-bcpio',
        '.cpio': 'application/x-cpio',
        '.csh': 'application/x-csh',
        '.dvi': 'application/x-dvi',
        '.gtar': 'application/x-gtar',
        '.hdf': 'application/x-hdf',
        '.h5': 'application/x-hdf5',
        '.latex': 'application/x-latex',
        '.mif': 'application/x-mif',
        '.cdf': 'application/x-netcdf',
        '.nc': 'application/x-netcdf',
        '.p12': 'application/x-pkcs12',
        '.pfx': 'application/x-pkcs12',
        '.ram': 'application/x-pn-realaudio',
        '.pyc': 'application/x-python-code',
        '.pyo': 'application/x-python-code',
        '.sh': 'application/x-sh',
        '.shar': 'application/x-shar',
        '.swf': 'application/x-shockwave-flash',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc': 'application/x-sv4crc',
        '.tar': 'application/x-tar',
        '.tcl': 'application/x-tcl',
        '.tex': 'application/x-tex',
        '.texi': 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.roff': 'application/x-troff',
        '.t': 'application/x-troff',
        '.tr': 'application/x-troff',
        '.man': 'application/x-troff-man',
        '.me': 'application/x-troff-me',
        '.ms': 'application/x-troff-ms',
        '.ustar': 'application/x-ustar',
        '.src': 'application/x-wais-source',
        '.xsl': 'application/xml',
        '.rdf': 'application/xml',
        '.wsdl': 'application/xml',
        '.xpdl': 'application/xml',
        '.zip': 'application/zip',
        '.3gp': 'audio/3gpp',
        '.3gpp': 'audio/3gpp',
        '.3g2': 'audio/3gpp2',
        '.3gpp2': 'audio/3gpp2',
        '.aac': 'audio/aac',
        '.adts': 'audio/aac',
        '.loas': 'audio/aac',
        '.ass': 'audio/aac',
        '.au': 'audio/basic',
        '.snd': 'audio/basic',
        '.mp3': 'audio/mpeg',
        '.mp2': 'audio/mpeg',
        '.opus': 'audio/opus',
        '.aif': 'audio/x-aiff',
        '.aifc': 'audio/x-aiff',
        '.aiff': 'audio/x-aiff',
        '.ra': 'audio/x-pn-realaudio',
        '.wav': 'audio/x-wav',
        '.avif': 'image/avif',
        '.bmp': 'image/bmp',
        '.gif': 'image/gif',
        '.ief': 'image/ief',
        '.jpg': 'image/jpeg',
        '.jpe': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.heic': 'image/heic',
        '.heif': 'image/heif',
        '.png': 'image/png',
        '.svg': 'image/svg+xml',
        '.tiff': 'image/tiff',
        '.tif': 'image/tiff',
        '.ico': 'image/vnd.microsoft.icon',
        '.ras': 'image/x-cmu-raster',
        '.pnm': 'image/x-portable-anymap',
        '.pbm': 'image/x-portable-bitmap',
        '.pgm': 'image/x-portable-graymap',
        '.ppm': 'image/x-portable-pixmap',
        '.rgb': 'image/x-rgb',
        '.xbm': 'image/x-xbitmap',
        '.xpm': 'image/x-xpixmap',
        '.xwd': 'image/x-xwindowdump',
        '.eml': 'message/rfc822',
        '.mht': 'message/rfc822',
        '.mhtml': 'message/rfc822',
        '.nws': 'message/rfc822',
        '.css': 'text/css',
        '.csv': 'text/csv',
        '.html': 'text/html',
        '.htm': 'text/html',
        '.n3': 'text/n3',
        '.txt': 'text/plain',
        '.bat': 'text/plain',
        '.c': 'text/plain',
        '.h': 'text/plain',
        '.ksh': 'text/plain',
        '.pl': 'text/plain',
        '.srt': 'text/plain',
        '.rtx': 'text/richtext',
        '.tsv': 'text/tab-separated-values',
        '.vtt': 'text/vtt',
        '.py': 'text/x-python',
        '.etx': 'text/x-setext',
        '.sgm': 'text/x-sgml',
        '.sgml': 'text/x-sgml',
        '.vcf': 'text/x-vcard',
        '.xml': 'text/xml',
        '.mp4': 'video/mp4',
        '.mpeg': 'video/mpeg',
        '.m1v': 'video/mpeg',
        '.mpa': 'video/mpeg',
        '.mpe': 'video/mpeg',
        '.mpg': 'video/mpeg',
        '.mov': 'video/quicktime',
        '.qt': 'video/quicktime',
        '.webm': 'video/webm',
        '.avi': 'video/x-msvideo',
        '.movie': 'video/x-sgi-movie',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = _common_types_default = {
        '.rtf': 'application/rtf',
        '.midi': 'audio/midi',
        '.mid': 'audio/midi',
        '.jpg': 'image/jpg',
        '.pict': 'image/pict',
        '.pct': 'image/pict',
        '.pic': 'image/pict',
        '.webp': 'image/webp',
        '.xul': 'text/xul',
        }


_default_mime_types()


def _main():
    """Command-line entry point: guess a type or extension per argument.

    Parses -h/--help, -l/--lenient and -e/--extension from sys.argv and
    prints one result line for every remaining argument.
    """
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    strict = True
    extension = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = False
        elif opt in ('-e', '--extension'):
            extension = True
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess:
                print("I don't know anything about type", gtype)
            else:
                print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess:
                print("I don't know anything about type", gtype)
            else:
                print('type:', guess, 'encoding:', encoding)


if __name__ == '__main__':
    _main()