"""Guess the MIME type of a file.

This module defines two useful functions:

guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.

guess_extension(type, strict=1) -- guess the extension for a given MIME type.

It also contains the following, for tuning the behavior:

Data:

knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types

Functions:

init([files]) -- parse a list of files, default knownfiles (on Windows, the
  default values are taken from the registry)
read_mime_types(file) -- parse one file, return a dictionary or None
"""

import os
import sys
import posixpath
import urllib
try:
    import _winreg
except ImportError:
    _winreg = None

__all__ = [
    "guess_type","guess_extension","guess_all_extensions",
    "add_type","read_mime_types","init"
]

# NOTE: "/usr/local/etc/httpd/conf/mime.types" is listed twice.  It is left
# as-is on purpose: the files are read in order and later files override
# earlier ones, so dropping either occurrence could change which mapping wins
# or the order of extensions reported for a type.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

inited = False      # set to True by init()
_db = None          # module-wide MimeTypes instance, created by init()


class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded from the module-level tables.

        `filenames' is a sequence of mime.types-style files that are read
        (in order) on top of the defaults; `strict' says whether the types
        found in those files count as standard (true) or non-standard.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = ({}, {}) # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, mtype) in types_map.items():
            self.add_type(mtype, ext, True)
        for (ext, mtype) in common_types.items():
            self.add_type(mtype, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mtype = url[:semi]
            else:
                mtype = url[:comma]
            if '=' in mtype or '/' not in mtype:
                mtype = 'text/plain'
            return mtype, None          # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') and re-split.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        # Peel off an encoding suffix so the type is taken from what remains.
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        types_map = self.types_map[True]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        elif strict:
            return None, encoding
        types_map = self.types_map[False]
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        The list is empty when no extension is known for `type'; it is a
        fresh list that the caller may modify freely.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Work on a copy: appending to the stored list would leak
        # non-standard extensions into later strict lookups and would let
        # callers corrupt the database through the returned list.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename) as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        Each non-empty line is "type ext1 ext2 ..."; a word starting with
        '#' begins a comment that runs to the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # readline() is used (rather than iterating fp) so that objects
        # which only provide readline() keep working.
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            for i, word in enumerate(words):
                if word[0] == '#':
                    # Drop the comment; safe because we stop iterating.
                    del words[i:]
                    break
            if not words:
                continue
            mtype, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(mtype, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when the _winreg module is unavailable.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """

        # Windows only
        if not _winreg:
            return

        def enum_types(mimedb):
            # Yield the names of all subkeys of `mimedb', skipping names
            # that contain a NUL character.
            i = 0
            while True:
                try:
                    ctype = _winreg.EnumKey(mimedb, i)
                except EnvironmentError:
                    break
                else:
                    if '\0' not in ctype:
                        yield ctype
                i += 1

        default_encoding = sys.getdefaultencoding()
        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # Only check file extensions
                        if not subkeyname.startswith("."):
                            continue
                        # raises EnvironmentError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                        if datatype != _winreg.REG_SZ:
                            continue
                        try:
                            mimetype = mimetype.encode(default_encoding)
                        except UnicodeEncodeError:
                            continue
                        self.add_type(mimetype, subkeyname, strict)
                except EnvironmentError:
                    continue

def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_type(url, strict)


def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', an empty
    list is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)

def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_extension(type, strict)

def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)


def init(files=None):
    """(Re)build the module-wide database and the module-level tables.

    A new MimeTypes instance is created from the current module-level
    tables.  When `files' is None, the Windows registry is consulted (if
    _winreg is available) and `knownfiles' is used as the file list.
    Every entry of the file list that is an existing file is then read.
    Finally suffix_map, encodings_map, types_map and common_types are
    rebound to the new instance's tables and the instance becomes the
    database used by the module-level functions.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again
    db = MimeTypes()
    if files is None:
        if _winreg:
            db.read_windows_registry()
        files = knownfiles
    for file in files:
        if os.path.isfile(file):
            db.read(file)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db


def read_mime_types(file):
    """Parse one mime.types-format file and return its type mapping.

    Returns None if `file' cannot be opened (IOError).  Otherwise returns
    the extension -> type dictionary of standard types of a fresh
    MimeTypes instance after reading the file into it, so the result also
    contains the module's current default mappings.
    """
    try:
        f = open(file)
    except IOError:
        return None
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]


def _default_mime_types():
    """Install the built-in default tables as module-level globals."""
    global suffix_map
    global encodings_map
    global types_map
    global common_types

    suffix_map = {
        '.svgz': '.svg.gz',
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
        }

    encodings_map = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.isi.edu/in-notes/iana/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted!
    types_map = {
        '.a'      : 'application/octet-stream',
        '.ai'     : 'application/postscript',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.au'     : 'audio/basic',
        '.avi'    : 'video/x-msvideo',
        '.bat'    : 'text/plain',
        '.bcpio'  : 'application/x-bcpio',
        '.bin'    : 'application/octet-stream',
        '.bmp'    : 'image/x-ms-bmp',
        '.c'      : 'text/plain',
        # '.cdf' is also known as 'application/x-cdf'; a dict can hold only
        # one value per key, and 'application/x-netcdf' is the one that has
        # always been in effect.
        '.cdf'    : 'application/x-netcdf',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.css'    : 'text/css',
        '.csv'    : 'text/csv',
        '.dll'    : 'application/octet-stream',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.dvi'    : 'application/x-dvi',
        '.eml'    : 'message/rfc822',
        '.eps'    : 'application/postscript',
        '.etx'    : 'text/x-setext',
        '.exe'    : 'application/octet-stream',
        '.gif'    : 'image/gif',
        '.gtar'   : 'application/x-gtar',
        '.h'      : 'text/plain',
        '.hdf'    : 'application/x-hdf',
        '.htm'    : 'text/html',
        '.html'   : 'text/html',
        '.ico'    : 'image/vnd.microsoft.icon',
        '.ief'    : 'image/ief',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.jpg'    : 'image/jpeg',
        '.js'     : 'application/javascript',
        '.ksh'    : 'text/plain',
        '.latex'  : 'application/x-latex',
        '.m1v'    : 'video/mpeg',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.mif'    : 'application/x-mif',
        '.mov'    : 'video/quicktime',
        '.movie'  : 'video/x-sgi-movie',
        '.mp2'    : 'audio/mpeg',
        '.mp3'    : 'audio/mpeg',
        '.mp4'    : 'video/mp4',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpeg'   : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.ms'     : 'application/x-troff-ms',
        '.nc'     : 'application/x-netcdf',
        '.nws'    : 'message/rfc822',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.p12'    : 'application/x-pkcs12',
        '.p7c'    : 'application/pkcs7-mime',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pdf'    : 'application/pdf',
        '.pfx'    : 'application/x-pkcs12',
        '.pgm'    : 'image/x-portable-graymap',
        '.pl'     : 'text/plain',
        '.png'    : 'image/png',
        '.pnm'    : 'image/x-portable-anymap',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.ppm'    : 'image/x-portable-pixmap',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.ps'     : 'application/postscript',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.py'     : 'text/x-python',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.qt'     : 'video/quicktime',
        '.ra'     : 'audio/x-pn-realaudio',
        '.ram'    : 'application/x-pn-realaudio',
        '.ras'    : 'image/x-cmu-raster',
        '.rdf'    : 'application/xml',
        '.rgb'    : 'image/x-rgb',
        '.roff'   : 'application/x-troff',
        '.rtx'    : 'text/richtext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.snd'    : 'audio/basic',
        '.so'     : 'application/octet-stream',
        '.src'    : 'application/x-wais-source',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.svg'    : 'image/svg+xml',
        '.swf'    : 'application/x-shockwave-flash',
        '.t'      : 'application/x-troff',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.tif'    : 'image/tiff',
        '.tiff'   : 'image/tiff',
        '.tr'     : 'application/x-troff',
        '.tsv'    : 'text/tab-separated-values',
        '.txt'    : 'text/plain',
        '.ustar'  : 'application/x-ustar',
        '.vcf'    : 'text/x-vcard',
        '.wav'    : 'audio/x-wav',
        '.webm'   : 'video/webm',
        '.wiz'    : 'application/msword',
        '.wsdl'   : 'application/xml',
        '.xbm'    : 'image/x-xbitmap',
        '.xlb'    : 'application/vnd.ms-excel',
        # '.xls' is also known as 'application/excel'; as with '.cdf' above,
        # only 'application/vnd.ms-excel' has ever been in effect.
        '.xls'    : 'application/vnd.ms-excel',
        '.xml'    : 'text/xml',
        '.xpdl'   : 'application/xml',
        '.xpm'    : 'image/x-xpixmap',
        '.xsl'    : 'application/xml',
        '.xwd'    : 'image/x-xwindowdump',
        '.zip'    : 'application/zip',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = {
        '.jpg' : 'image/jpg',
        '.mid' : 'audio/midi',
        '.midi': 'audio/midi',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.pict': 'image/pict',
        '.rtf' : 'application/rtf',
        '.xul' : 'text/xul'
        }


_default_mime_types()


if __name__ == '__main__':
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    # The print calls below pass a single pre-formatted argument so that the
    # script parses, and prints the same text, under both the print
    # statement and the print function.
    def usage(code, msg=''):
        print(USAGE)
        if msg: print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess: print("I don't know anything about type %s" % gtype)
            else: print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess: print("I don't know anything about type %s" % gtype)
            else: print('type: %s encoding: %s' % (guess, encoding))