"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=True) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles (on Windows, the
  default values are taken from the registry)
read_mime_types(file) -- parse one file, return a dictionary or None
"""

import os
import sys
import posixpath
import urllib.parse
try:
    import winreg as _winreg
except ImportError:
    _winreg = None

__all__ = [
    "knownfiles", "inited", "MimeTypes",
    "guess_type", "guess_all_extensions", "guess_extension",
    "add_type", "init", "read_mime_types",
    "suffix_map", "encodings_map", "types_map", "common_types"
]

knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    # NOTE(review): same path as two entries above; kept so that the order
    # in which files override each other in init() is unchanged.
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

inited = False
_db = None


class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded with the module's default tables.

        Each file named in `filenames' is then loaded with read(); `strict'
        selects whether those entries go to the standard or the non-standard
        table.
        """
        if not inited:
            init()
        self.encodings_map = _encodings_map_default.copy()
        self.suffix_map = _suffix_map_default.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for ext, mime_type in _types_map_default.items():
            self.add_type(mime_type, ext, True)
        for ext, mime_type in _common_types_default.items():
            self.add_type(mime_type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # `strict' is used directly as a tuple index (False -> 0, True -> 1).
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file which is either a URL or a path-like object.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        url = os.fspath(url)
        scheme, url = urllib.parse._splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mime_type = url[:semi]
            else:
                mime_type = url[:comma]
            if '=' in mime_type or '/' not in mime_type:
                mime_type = 'text/plain'
            return mime_type, None  # never compressed, so encoding is None

        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until stable.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        # Peel off one encoding suffix, if any, to expose the type suffix.
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None

        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        elif strict:
            return None, encoding

        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        The list is empty when no extension is known for `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Work on a copy: appending the non-standard extensions to the stored
        # list would leak them into every later strict lookup, and returning
        # the stored list would let callers mutate the datastore.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename, encoding='utf-8') as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # Only readline() is required of `fp'.
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop everything from the first word that starts a comment.
            for i, word in enumerate(words):
                if word[0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            mime_type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(mime_type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        # Windows only
        if not _winreg:
            return

        def enum_types(mimedb):
            """Yield the subkey names of `mimedb', skipping any with a NUL."""
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except OSError:
                    # EnumKey raises once the index runs past the last subkey.
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # Only check file extensions
                        if not subkeyname.startswith("."):
                            continue
                        # raises OSError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        self.add_type(mimetype, subkeyname, strict)
                except OSError:
                    continue


def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_type(url, strict)


def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', an empty
    list is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)


def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_extension(type, strict)


def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)


def init(files=None):
    """Initialize (or extend) the module-level MIME types database.

    With `files' None, or when no database exists yet, a fresh MimeTypes
    datastore is built, the Windows registry is read when winreg is
    available, and the files in knownfiles (followed by `files', if given)
    that exist on disk are loaded.  Otherwise the existing database is
    kept and only `files' are loaded into it.

    The module globals suffix_map, encodings_map, types_map and
    common_types are rebound to the resulting database's tables.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again

    if files is None or _db is None:
        db = MimeTypes()
        if _winreg:
            db.read_windows_registry()

        if files is None:
            files = knownfiles
        else:
            files = knownfiles + list(files)
    else:
        db = _db

    for file in files:
        if os.path.isfile(file):
            db.read(file)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db


def read_mime_types(file):
    """Parse one mime.types-format file and return its suffix -> type map.

    The returned dictionary is the standard-types table of a fresh
    MimeTypes datastore after loading `file', so it also contains the
    built-in default entries.  None is returned if the file cannot be
    opened.
    """
    try:
        # Same encoding as MimeTypes.read(), so both entry points parse a
        # given file identically regardless of the locale.
        f = open(file, encoding='utf-8')
    except OSError:
        return None
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]


def _default_mime_types():
    """Install the built-in default tables as module globals.

    Each public table (suffix_map, encodings_map, types_map, common_types)
    initially refers to the same dict as its private *_default twin, which
    MimeTypes.__init__() uses as the seed for new datastores.
    """
    global suffix_map, _suffix_map_default
    global encodings_map, _encodings_map_default
    global types_map, _types_map_default
    global common_types, _common_types_default

    suffix_map = _suffix_map_default = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    encodings_map = _encodings_map_default = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.iana.org/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted by mime type.
    # Make sure the entry with the preferred file extension for a particular mime type
    # appears before any others of the same mimetype.
    types_map = _types_map_default = {
        '.js': 'application/javascript',
        '.mjs': 'application/javascript',
        '.json': 'application/json',
        '.webmanifest': 'application/manifest+json',
        '.doc': 'application/msword',
        '.dot': 'application/msword',
        '.wiz': 'application/msword',
        '.bin': 'application/octet-stream',
        '.a': 'application/octet-stream',
        '.dll': 'application/octet-stream',
        '.exe': 'application/octet-stream',
        '.o': 'application/octet-stream',
        '.obj': 'application/octet-stream',
        '.so': 'application/octet-stream',
        '.oda': 'application/oda',
        '.pdf': 'application/pdf',
        '.p7c': 'application/pkcs7-mime',
        '.ps': 'application/postscript',
        '.ai': 'application/postscript',
        '.eps': 'application/postscript',
        '.m3u': 'application/vnd.apple.mpegurl',
        '.m3u8': 'application/vnd.apple.mpegurl',
        '.xls': 'application/vnd.ms-excel',
        '.xlb': 'application/vnd.ms-excel',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pot': 'application/vnd.ms-powerpoint',
        '.ppa': 'application/vnd.ms-powerpoint',
        '.pps': 'application/vnd.ms-powerpoint',
        '.pwz': 'application/vnd.ms-powerpoint',
        '.wasm': 'application/wasm',
        '.bcpio': 'application/x-bcpio',
        '.cpio': 'application/x-cpio',
        '.csh': 'application/x-csh',
        '.dvi': 'application/x-dvi',
        '.gtar': 'application/x-gtar',
        '.hdf': 'application/x-hdf',
        '.latex': 'application/x-latex',
        '.mif': 'application/x-mif',
        '.cdf': 'application/x-netcdf',
        '.nc': 'application/x-netcdf',
        '.p12': 'application/x-pkcs12',
        '.pfx': 'application/x-pkcs12',
        '.ram': 'application/x-pn-realaudio',
        '.pyc': 'application/x-python-code',
        '.pyo': 'application/x-python-code',
        '.sh': 'application/x-sh',
        '.shar': 'application/x-shar',
        '.swf': 'application/x-shockwave-flash',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc': 'application/x-sv4crc',
        '.tar': 'application/x-tar',
        '.tcl': 'application/x-tcl',
        '.tex': 'application/x-tex',
        '.texi': 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.roff': 'application/x-troff',
        '.t': 'application/x-troff',
        '.tr': 'application/x-troff',
        '.man': 'application/x-troff-man',
        '.me': 'application/x-troff-me',
        '.ms': 'application/x-troff-ms',
        '.ustar': 'application/x-ustar',
        '.src': 'application/x-wais-source',
        '.xsl': 'application/xml',
        '.rdf': 'application/xml',
        '.wsdl': 'application/xml',
        '.xpdl': 'application/xml',
        '.zip': 'application/zip',
        '.au': 'audio/basic',
        '.snd': 'audio/basic',
        '.mp3': 'audio/mpeg',
        '.mp2': 'audio/mpeg',
        '.aif': 'audio/x-aiff',
        '.aifc': 'audio/x-aiff',
        '.aiff': 'audio/x-aiff',
        '.ra': 'audio/x-pn-realaudio',
        '.wav': 'audio/x-wav',
        '.gif': 'image/gif',
        '.ief': 'image/ief',
        '.jpg': 'image/jpeg',
        '.jpe': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.svg': 'image/svg+xml',
        '.tiff': 'image/tiff',
        '.tif': 'image/tiff',
        '.ico': 'image/vnd.microsoft.icon',
        '.ras': 'image/x-cmu-raster',
        # NOTE(review): this literal used to contain a second, earlier
        # "'.bmp': 'image/bmp'" entry that was silently overridden by this
        # one (duplicate dict key).  The dead entry was dropped so the
        # effective mapping is unchanged; decide separately whether '.bmp'
        # should map to 'image/bmp' instead.
        '.bmp': 'image/x-ms-bmp',
        '.pnm': 'image/x-portable-anymap',
        '.pbm': 'image/x-portable-bitmap',
        '.pgm': 'image/x-portable-graymap',
        '.ppm': 'image/x-portable-pixmap',
        '.rgb': 'image/x-rgb',
        '.xbm': 'image/x-xbitmap',
        '.xpm': 'image/x-xpixmap',
        '.xwd': 'image/x-xwindowdump',
        '.eml': 'message/rfc822',
        '.mht': 'message/rfc822',
        '.mhtml': 'message/rfc822',
        '.nws': 'message/rfc822',
        '.css': 'text/css',
        '.csv': 'text/csv',
        '.html': 'text/html',
        '.htm': 'text/html',
        '.txt': 'text/plain',
        '.bat': 'text/plain',
        '.c': 'text/plain',
        '.h': 'text/plain',
        '.ksh': 'text/plain',
        '.pl': 'text/plain',
        '.rtx': 'text/richtext',
        '.tsv': 'text/tab-separated-values',
        '.py': 'text/x-python',
        '.etx': 'text/x-setext',
        '.sgm': 'text/x-sgml',
        '.sgml': 'text/x-sgml',
        '.vcf': 'text/x-vcard',
        '.xml': 'text/xml',
        '.mp4': 'video/mp4',
        '.mpeg': 'video/mpeg',
        '.m1v': 'video/mpeg',
        '.mpa': 'video/mpeg',
        '.mpe': 'video/mpeg',
        '.mpg': 'video/mpeg',
        '.mov': 'video/quicktime',
        '.qt': 'video/quicktime',
        '.webm': 'video/webm',
        '.avi': 'video/x-msvideo',
        '.movie': 'video/x-sgi-movie',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = _common_types_default = {
        '.rtf': 'application/rtf',
        '.midi': 'audio/midi',
        '.mid': 'audio/midi',
        '.jpg': 'image/jpg',
        '.pict': 'image/pict',
        '.pct': 'image/pict',
        '.pic': 'image/pict',
        '.xul': 'text/xul',
        }


_default_mime_types()


if __name__ == '__main__':
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        """Print the usage text (and `msg', if any), then exit with `code'."""
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess:
                print("I don't know anything about type", gtype)
            else:
                print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess:
                print("I don't know anything about type", gtype)
            else:
                print('type:', guess, 'encoding:', encoding)