"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=True) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles (on Windows, the
  default values are taken from the registry)
read_mime_types(file) -- parse one file, return a dictionary or None
"""

import os
import sys
import posixpath
import urllib.parse

try:
    from _winapi import _mimetypes_read_windows_registry
except ImportError:
    _mimetypes_read_windows_registry = None

try:
    import winreg as _winreg
except ImportError:
    _winreg = None

__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]

knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

inited = False
_db = None


class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.
    """

    def __init__(self, filenames=(), strict=True):
        if not inited:
            init()
        self.encodings_map = _encodings_map_default.copy()
        self.suffix_map = _suffix_map_default.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in _types_map_default.items():
            self.add_type(type, ext, True)
        for (ext, type) in _common_types_default.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file which is either a URL or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        url = os.fspath(url)
        scheme, url = urllib.parse._splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                type = url[:semi]
            else:
                type = url[:comma]
            if '=' in type or '/' not in type:
                type = 'text/plain'
            return type, None           # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until stable.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        # Standard types first: exact-case match, then lower-cased match.
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        elif strict:
            return None, encoding
        # Lenient mode: fall back to the non-standard table.
        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().  The
        list is empty when no extension is known for `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Copy so callers cannot mutate the internal inverse table.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        Each line is split on whitespace; the first word is the MIME type
        and the remaining words are extensions (without the leading dot).
        Any word starting with '#' begins a comment that runs to the end
        of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop the first word that starts a comment and everything after.
            for i, word in enumerate(words):
                if word[0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when neither the accelerated reader nor the winreg
        module is available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        if not _mimetypes_read_windows_registry and not _winreg:
            return

        # The registry readers invoke the callback with (type, ext) only, so
        # the caller's strict flag must be bound here.  Passing the bare
        # self.add_type would silently fall back to its strict=True default
        # and file non-strict entries in the standard table.
        strict = bool(strict)

        def add_type(type, ext):
            self.add_type(type, ext, strict)

        # Accelerated function if it is available
        if _mimetypes_read_windows_registry:
            _mimetypes_read_windows_registry(add_type)
        elif _winreg:
            self._read_windows_registry(add_type)

    @classmethod
    def _read_windows_registry(cls, add_type):
        """Walk HKEY_CLASSES_ROOT and report each extension's 'Content Type'.

        add_type is called as add_type(mimetype, extension) for every
        subkey whose name starts with '.' and which has a REG_SZ
        'Content Type' value.
        """
        def enum_types(mimedb):
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; test the name before opening
                # the key so unrelated subkeys are never opened.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        add_type(mimetype, subkeyname)
                except OSError:
                    continue


def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_type(url, strict)


def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', an empty
    list is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)


def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_extension(type, strict)


def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)


def init(files=None):
    """Initialize (or extend) the module-level MIME types database.

    When files is None, or no database exists yet, a fresh MimeTypes
    datastore is built, the Windows registry is consulted where supported,
    and every existing file in knownfiles (followed by any given files) is
    read.  Otherwise the existing database is reused and only the given
    files are read into it.

    The module globals encodings_map, suffix_map, types_map and
    common_types are rebound to the resulting database's tables.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again

    if files is None or _db is None:
        db = MimeTypes()
        # Quick return if not supported
        db.read_windows_registry()

        if files is None:
            files = knownfiles
        else:
            files = knownfiles + list(files)
    else:
        db = _db

    for file in files:
        if os.path.isfile(file):
            db.read(file)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db


def read_mime_types(file):
    """Parse one mime.types-format file and return its mapping.

    Returns a dictionary mapping extensions (with leading dot) to MIME
    types, built on top of the default strict types, or None when the
    file cannot be opened.
    """
    try:
        f = open(file, encoding='utf-8')
    except OSError:
        return None
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]


def _default_mime_types():
    """Populate the module-level default tables.

    Each public table (suffix_map, encodings_map, types_map, common_types)
    initially shares its dict with the matching private *_default name that
    MimeTypes.__init__ copies from.
    """
    global suffix_map, _suffix_map_default
    global encodings_map, _encodings_map_default
    global types_map, _types_map_default
    global common_types, _common_types_default

    suffix_map = _suffix_map_default = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    encodings_map = _encodings_map_default = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        '.br': 'br',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted by mime type.
    # Make sure the entry with the preferred file extension for a particular mime type
    # appears before any others of the same mimetype.
    #
    # NOTE: every extension key must appear only once.  A repeated key in a
    # dict literal silently overrides the earlier entry (this previously
    # made '.bmp' resolve to 'image/x-ms-bmp' instead of 'image/bmp').
    types_map = _types_map_default = {
        '.js': 'application/javascript',
        '.mjs': 'application/javascript',
        '.json': 'application/json',
        '.webmanifest': 'application/manifest+json',
        '.doc': 'application/msword',
        '.dot': 'application/msword',
        '.wiz': 'application/msword',
        '.bin': 'application/octet-stream',
        '.a': 'application/octet-stream',
        '.dll': 'application/octet-stream',
        '.exe': 'application/octet-stream',
        '.o': 'application/octet-stream',
        '.obj': 'application/octet-stream',
        '.so': 'application/octet-stream',
        '.oda': 'application/oda',
        '.pdf': 'application/pdf',
        '.p7c': 'application/pkcs7-mime',
        '.ps': 'application/postscript',
        '.ai': 'application/postscript',
        '.eps': 'application/postscript',
        '.m3u': 'application/vnd.apple.mpegurl',
        '.m3u8': 'application/vnd.apple.mpegurl',
        '.xls': 'application/vnd.ms-excel',
        '.xlb': 'application/vnd.ms-excel',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pot': 'application/vnd.ms-powerpoint',
        '.ppa': 'application/vnd.ms-powerpoint',
        '.pps': 'application/vnd.ms-powerpoint',
        '.pwz': 'application/vnd.ms-powerpoint',
        '.wasm': 'application/wasm',
        '.bcpio': 'application/x-bcpio',
        '.cpio': 'application/x-cpio',
        '.csh': 'application/x-csh',
        '.dvi': 'application/x-dvi',
        '.gtar': 'application/x-gtar',
        '.hdf': 'application/x-hdf',
        '.h5': 'application/x-hdf5',
        '.latex': 'application/x-latex',
        '.mif': 'application/x-mif',
        '.cdf': 'application/x-netcdf',
        '.nc': 'application/x-netcdf',
        '.p12': 'application/x-pkcs12',
        '.pfx': 'application/x-pkcs12',
        '.ram': 'application/x-pn-realaudio',
        '.pyc': 'application/x-python-code',
        '.pyo': 'application/x-python-code',
        '.sh': 'application/x-sh',
        '.shar': 'application/x-shar',
        '.swf': 'application/x-shockwave-flash',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc': 'application/x-sv4crc',
        '.tar': 'application/x-tar',
        '.tcl': 'application/x-tcl',
        '.tex': 'application/x-tex',
        '.texi': 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.roff': 'application/x-troff',
        '.t': 'application/x-troff',
        '.tr': 'application/x-troff',
        '.man': 'application/x-troff-man',
        '.me': 'application/x-troff-me',
        '.ms': 'application/x-troff-ms',
        '.ustar': 'application/x-ustar',
        '.src': 'application/x-wais-source',
        '.xsl': 'application/xml',
        '.rdf': 'application/xml',
        '.wsdl': 'application/xml',
        '.xpdl': 'application/xml',
        '.zip': 'application/zip',
        '.3gp': 'audio/3gpp',
        '.3gpp': 'audio/3gpp',
        '.3g2': 'audio/3gpp2',
        '.3gpp2': 'audio/3gpp2',
        '.aac': 'audio/aac',
        '.adts': 'audio/aac',
        '.loas': 'audio/aac',
        '.ass': 'audio/aac',
        '.au': 'audio/basic',
        '.snd': 'audio/basic',
        '.mp3': 'audio/mpeg',
        '.mp2': 'audio/mpeg',
        '.opus': 'audio/opus',
        '.aif': 'audio/x-aiff',
        '.aifc': 'audio/x-aiff',
        '.aiff': 'audio/x-aiff',
        '.ra': 'audio/x-pn-realaudio',
        '.wav': 'audio/x-wav',
        '.bmp': 'image/bmp',
        '.gif': 'image/gif',
        '.ief': 'image/ief',
        '.jpg': 'image/jpeg',
        '.jpe': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.heic': 'image/heic',
        '.heif': 'image/heif',
        '.png': 'image/png',
        '.svg': 'image/svg+xml',
        '.tiff': 'image/tiff',
        '.tif': 'image/tiff',
        '.ico': 'image/vnd.microsoft.icon',
        '.ras': 'image/x-cmu-raster',
        '.pnm': 'image/x-portable-anymap',
        '.pbm': 'image/x-portable-bitmap',
        '.pgm': 'image/x-portable-graymap',
        '.ppm': 'image/x-portable-pixmap',
        '.rgb': 'image/x-rgb',
        '.xbm': 'image/x-xbitmap',
        '.xpm': 'image/x-xpixmap',
        '.xwd': 'image/x-xwindowdump',
        '.eml': 'message/rfc822',
        '.mht': 'message/rfc822',
        '.mhtml': 'message/rfc822',
        '.nws': 'message/rfc822',
        '.css': 'text/css',
        '.csv': 'text/csv',
        '.html': 'text/html',
        '.htm': 'text/html',
        '.txt': 'text/plain',
        '.bat': 'text/plain',
        '.c': 'text/plain',
        '.h': 'text/plain',
        '.ksh': 'text/plain',
        '.pl': 'text/plain',
        '.rtx': 'text/richtext',
        '.tsv': 'text/tab-separated-values',
        '.py': 'text/x-python',
        '.etx': 'text/x-setext',
        '.sgm': 'text/x-sgml',
        '.sgml': 'text/x-sgml',
        '.vcf': 'text/x-vcard',
        '.xml': 'text/xml',
        '.mp4': 'video/mp4',
        '.mpeg': 'video/mpeg',
        '.m1v': 'video/mpeg',
        '.mpa': 'video/mpeg',
        '.mpe': 'video/mpeg',
        '.mpg': 'video/mpeg',
        '.mov': 'video/quicktime',
        '.qt': 'video/quicktime',
        '.webm': 'video/webm',
        '.avi': 'video/x-msvideo',
        '.movie': 'video/x-sgi-movie',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = _common_types_default = {
        '.rtf': 'application/rtf',
        '.midi': 'audio/midi',
        '.mid': 'audio/midi',
        '.jpg': 'image/jpg',
        '.pict': 'image/pict',
        '.pct': 'image/pict',
        '.pic': 'image/pict',
        '.xul': 'text/xul',
        }


_default_mime_types()


def _main():
    """Command-line entry point.

    Parses sys.argv and prints, for each positional argument, either the
    guessed type and encoding or (with -e/--extension) the guessed
    extension.  -l/--lenient turns off strict matching.
    """
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess:
                print("I don't know anything about type", gtype)
            else:
                print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess:
                print("I don't know anything about type", gtype)
            else:
                print('type:', guess, 'encoding:', encoding)


if __name__ == '__main__':
    _main()