1"""PyPI and direct package downloading""" 2import sys 3import os 4import re 5import shutil 6import socket 7import base64 8import hashlib 9import itertools 10from functools import wraps 11 12from setuptools.extern import six 13from setuptools.extern.six.moves import urllib, http_client, configparser, map 14 15import setuptools 16from pkg_resources import ( 17 CHECKOUT_DIST, Distribution, BINARY_DIST, normalize_path, SOURCE_DIST, 18 Environment, find_distributions, safe_name, safe_version, 19 to_filename, Requirement, DEVELOP_DIST, EGG_DIST, 20) 21from setuptools import ssl_support 22from distutils import log 23from distutils.errors import DistutilsError 24from fnmatch import translate 25from setuptools.py27compat import get_all_headers 26from setuptools.py33compat import unescape 27from setuptools.wheel import Wheel 28 29EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.+!]+)$') 30HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I) 31# this is here to fix emacs' cruddy broken syntax highlighting 32PYPI_MD5 = re.compile( 33 '<a href="([^"#]+)">([^<]+)</a>\n\\s+\\(<a (?:title="MD5 hash"\n\\s+)' 34 'href="[^?]+\\?:action=show_md5&digest=([0-9a-f]{32})">md5</a>\\)' 35) 36URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match 37EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split() 38 39__all__ = [ 40 'PackageIndex', 'distros_for_url', 'parse_bdist_wininst', 41 'interpret_distro_name', 42] 43 44_SOCKET_TIMEOUT = 15 45 46_tmpl = "setuptools/{setuptools.__version__} Python-urllib/{py_major}" 47user_agent = _tmpl.format(py_major=sys.version[:3], setuptools=setuptools) 48 49 50def parse_requirement_arg(spec): 51 try: 52 return Requirement.parse(spec) 53 except ValueError: 54 raise DistutilsError( 55 "Not a URL, existing file, or requirement spec: %r" % (spec,) 56 ) 57 58 59def parse_bdist_wininst(name): 60 """Return (base,pyversion) or (None,None) for possible .exe name""" 61 62 lower = name.lower() 63 base, py_ver, plat = None, None, None 64 65 if lower.endswith('.exe'): 66 if lower.endswith('.win32.exe'): 67 base = name[:-10] 68 plat = 'win32' 69 elif lower.startswith('.win32-py', -16): 70 py_ver = name[-7:-4] 71 base = name[:-16] 72 plat = 'win32' 73 elif lower.endswith('.win-amd64.exe'): 74 base = name[:-14] 75 plat = 'win-amd64' 76 elif lower.startswith('.win-amd64-py', -20): 77 py_ver = name[-7:-4] 78 base = name[:-20] 79 plat = 'win-amd64' 80 return base, py_ver, plat 81 82 83def egg_info_for_url(url): 84 parts = urllib.parse.urlparse(url) 85 scheme, server, path, parameters, query, fragment = parts 86 base = urllib.parse.unquote(path.split('/')[-1]) 87 if server == 'sourceforge.net' and base == 'download': # XXX Yuck 88 base = urllib.parse.unquote(path.split('/')[-2]) 89 if '#' in base: 90 base, fragment = base.split('#', 1) 91 return base, fragment 92 93 94def distros_for_url(url, metadata=None): 95 """Yield egg or source distribution objects that might be found at a URL""" 96 base, fragment = egg_info_for_url(url) 97 for dist in distros_for_location(url, base, metadata): 98 yield dist 99 if fragment: 100 match = EGG_FRAGMENT.match(fragment) 101 if match: 102 for dist in interpret_distro_name( 103 url, match.group(1), metadata, precedence=CHECKOUT_DIST 104 ): 105 yield dist 106 107 108def distros_for_location(location, basename, metadata=None): 109 """Yield egg or source distribution objects based on basename""" 110 if basename.endswith('.egg.zip'): 111 basename = basename[:-4] # strip the .zip 112 if basename.endswith('.egg') and '-' in basename: 113 # only one, unambiguous interpretation 114 return [Distribution.from_location(location, basename, metadata)] 115 if basename.endswith('.whl') and '-' in basename: 116 wheel = Wheel(basename) 117 if not wheel.is_compatible(): 118 return [] 119 return [Distribution( 120 location=location, 121 project_name=wheel.project_name, 122 version=wheel.version, 123 # Increase priority over eggs. 124 precedence=EGG_DIST + 1, 125 )] 126 if basename.endswith('.exe'): 127 win_base, py_ver, platform = parse_bdist_wininst(basename) 128 if win_base is not None: 129 return interpret_distro_name( 130 location, win_base, metadata, py_ver, BINARY_DIST, platform 131 ) 132 # Try source distro extensions (.zip, .tgz, etc.) 133 # 134 for ext in EXTENSIONS: 135 if basename.endswith(ext): 136 basename = basename[:-len(ext)] 137 return interpret_distro_name(location, basename, metadata) 138 return [] # no extension matched 139 140 141def distros_for_filename(filename, metadata=None): 142 """Yield possible egg or source distribution objects based on a filename""" 143 return distros_for_location( 144 normalize_path(filename), os.path.basename(filename), metadata 145 ) 146 147 148def interpret_distro_name( 149 location, basename, metadata, py_version=None, precedence=SOURCE_DIST, 150 platform=None 151): 152 """Generate alternative interpretations of a source distro name 153 154 Note: if `location` is a filesystem filename, you should call 155 ``pkg_resources.normalize_path()`` on it before passing it to this 156 routine! 157 """ 158 # Generate alternative interpretations of a source distro name 159 # Because some packages are ambiguous as to name/versions split 160 # e.g. "adns-python-1.1.0", "egenix-mx-commercial", etc. 161 # So, we generate each possible interepretation (e.g. "adns, python-1.1.0" 162 # "adns-python, 1.1.0", and "adns-python-1.1.0, no version"). In practice, 163 # the spurious interpretations should be ignored, because in the event 164 # there's also an "adns" package, the spurious "python-1.1.0" version will 165 # compare lower than any numeric version number, and is therefore unlikely 166 # to match a request for it. It's still a potential problem, though, and 167 # in the long run PyPI and the distutils should go for "safe" names and 168 # versions in distribution archive names (sdist and bdist). 169 170 parts = basename.split('-') 171 if not py_version and any(re.match(r'py\d\.\d$', p) for p in parts[2:]): 172 # it is a bdist_dumb, not an sdist -- bail out 173 return 174 175 for p in range(1, len(parts) + 1): 176 yield Distribution( 177 location, metadata, '-'.join(parts[:p]), '-'.join(parts[p:]), 178 py_version=py_version, precedence=precedence, 179 platform=platform 180 ) 181 182 183# From Python 2.7 docs 184def unique_everseen(iterable, key=None): 185 "List unique elements, preserving order. Remember all elements ever seen." 186 # unique_everseen('AAAABBBCCDAABBB') --> A B C D 187 # unique_everseen('ABBCcAD', str.lower) --> A B C D 188 seen = set() 189 seen_add = seen.add 190 if key is None: 191 for element in six.moves.filterfalse(seen.__contains__, iterable): 192 seen_add(element) 193 yield element 194 else: 195 for element in iterable: 196 k = key(element) 197 if k not in seen: 198 seen_add(k) 199 yield element 200 201 202def unique_values(func): 203 """ 204 Wrap a function returning an iterable such that the resulting iterable 205 only ever yields unique items. 206 """ 207 208 @wraps(func) 209 def wrapper(*args, **kwargs): 210 return unique_everseen(func(*args, **kwargs)) 211 212 return wrapper 213 214 215REL = re.compile(r"""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I) 216# this line is here to fix emacs' cruddy broken syntax highlighting 217 218 219@unique_values 220def find_external_links(url, page): 221 """Find rel="homepage" and rel="download" links in `page`, yielding URLs""" 222 223 for match in REL.finditer(page): 224 tag, rel = match.groups() 225 rels = set(map(str.strip, rel.lower().split(','))) 226 if 'homepage' in rels or 'download' in rels: 227 for match in HREF.finditer(tag): 228 yield urllib.parse.urljoin(url, htmldecode(match.group(1))) 229 230 for tag in ("<th>Home Page", "<th>Download URL"): 231 pos = page.find(tag) 232 if pos != -1: 233 match = HREF.search(page, pos) 234 if match: 235 yield urllib.parse.urljoin(url, htmldecode(match.group(1))) 236 237 238class ContentChecker(object): 239 """ 240 A null content checker that defines the interface for checking content 241 """ 242 243 def feed(self, block): 244 """ 245 Feed a block of data to the hash. 246 """ 247 return 248 249 def is_valid(self): 250 """ 251 Check the hash. Return False if validation fails. 252 """ 253 return True 254 255 def report(self, reporter, template): 256 """ 257 Call reporter with information about the checker (hash name) 258 substituted into the template. 259 """ 260 return 261 262 263class HashChecker(ContentChecker): 264 pattern = re.compile( 265 r'(?P<hash_name>sha1|sha224|sha384|sha256|sha512|md5)=' 266 r'(?P<expected>[a-f0-9]+)' 267 ) 268 269 def __init__(self, hash_name, expected): 270 self.hash_name = hash_name 271 self.hash = hashlib.new(hash_name) 272 self.expected = expected 273 274 @classmethod 275 def from_url(cls, url): 276 "Construct a (possibly null) ContentChecker from a URL" 277 fragment = urllib.parse.urlparse(url)[-1] 278 if not fragment: 279 return ContentChecker() 280 match = cls.pattern.search(fragment) 281 if not match: 282 return ContentChecker() 283 return cls(**match.groupdict()) 284 285 def feed(self, block): 286 self.hash.update(block) 287 288 def is_valid(self): 289 return self.hash.hexdigest() == self.expected 290 291 def report(self, reporter, template): 292 msg = template % self.hash_name 293 return reporter(msg) 294 295 296class PackageIndex(Environment): 297 """A distribution index that scans web pages for download URLs""" 298 299 def __init__( 300 self, index_url="https://pypi.org/simple/", hosts=('*',), 301 ca_bundle=None, verify_ssl=True, *args, **kw 302 ): 303 Environment.__init__(self, *args, **kw) 304 self.index_url = index_url + "/" [:not index_url.endswith('/')] 305 self.scanned_urls = {} 306 self.fetched_urls = {} 307 self.package_pages = {} 308 self.allows = re.compile('|'.join(map(translate, hosts))).match 309 self.to_scan = [] 310 use_ssl = ( 311 verify_ssl 312 and ssl_support.is_available 313 and (ca_bundle or ssl_support.find_ca_bundle()) 314 ) 315 if use_ssl: 316 self.opener = ssl_support.opener_for(ca_bundle) 317 else: 318 self.opener = urllib.request.urlopen 319 320 def process_url(self, url, retrieve=False): 321 """Evaluate a URL as a possible download, and maybe retrieve it""" 322 if url in self.scanned_urls and not retrieve: 323 return 324 self.scanned_urls[url] = True 325 if not URL_SCHEME(url): 326 self.process_filename(url) 327 return 328 else: 329 dists = list(distros_for_url(url)) 330 if dists: 331 if not self.url_ok(url): 332 return 333 self.debug("Found link: %s", url) 334 335 if dists or not retrieve or url in self.fetched_urls: 336 list(map(self.add, dists)) 337 return # don't need the actual page 338 339 if not self.url_ok(url): 340 self.fetched_urls[url] = True 341 return 342 343 self.info("Reading %s", url) 344 self.fetched_urls[url] = True # prevent multiple fetch attempts 345 tmpl = "Download error on %s: %%s -- Some packages may not be found!" 346 f = self.open_url(url, tmpl % url) 347 if f is None: 348 return 349 self.fetched_urls[f.url] = True 350 if 'html' not in f.headers.get('content-type', '').lower(): 351 f.close() # not html, we can't process it 352 return 353 354 base = f.url # handle redirects 355 page = f.read() 356 if not isinstance(page, str): 357 # In Python 3 and got bytes but want str. 358 if isinstance(f, urllib.error.HTTPError): 359 # Errors have no charset, assume latin1: 360 charset = 'latin-1' 361 else: 362 charset = f.headers.get_param('charset') or 'latin-1' 363 page = page.decode(charset, "ignore") 364 f.close() 365 for match in HREF.finditer(page): 366 link = urllib.parse.urljoin(base, htmldecode(match.group(1))) 367 self.process_url(link) 368 if url.startswith(self.index_url) and getattr(f, 'code', None) != 404: 369 page = self.process_index(url, page) 370 371 def process_filename(self, fn, nested=False): 372 # process filenames or directories 373 if not os.path.exists(fn): 374 self.warn("Not found: %s", fn) 375 return 376 377 if os.path.isdir(fn) and not nested: 378 path = os.path.realpath(fn) 379 for item in os.listdir(path): 380 self.process_filename(os.path.join(path, item), True) 381 382 dists = distros_for_filename(fn) 383 if dists: 384 self.debug("Found: %s", fn) 385 list(map(self.add, dists)) 386 387 def url_ok(self, url, fatal=False): 388 s = URL_SCHEME(url) 389 is_file = s and s.group(1).lower() == 'file' 390 if is_file or self.allows(urllib.parse.urlparse(url)[1]): 391 return True 392 msg = ( 393 "\nNote: Bypassing %s (disallowed host; see " 394 "http://bit.ly/2hrImnY for details).\n") 395 if fatal: 396 raise DistutilsError(msg % url) 397 else: 398 self.warn(msg, url) 399 400 def scan_egg_links(self, search_path): 401 dirs = filter(os.path.isdir, search_path) 402 egg_links = ( 403 (path, entry) 404 for path in dirs 405 for entry in os.listdir(path) 406 if entry.endswith('.egg-link') 407 ) 408 list(itertools.starmap(self.scan_egg_link, egg_links)) 409 410 def scan_egg_link(self, path, entry): 411 with open(os.path.join(path, entry)) as raw_lines: 412 # filter non-empty lines 413 lines = list(filter(None, map(str.strip, raw_lines))) 414 415 if len(lines) != 2: 416 # format is not recognized; punt 417 return 418 419 egg_path, setup_path = lines 420 421 for dist in find_distributions(os.path.join(path, egg_path)): 422 dist.location = os.path.join(path, *lines) 423 dist.precedence = SOURCE_DIST 424 self.add(dist) 425 426 def process_index(self, url, page): 427 """Process the contents of a PyPI page""" 428 429 def scan(link): 430 # Process a URL to see if it's for a package page 431 if link.startswith(self.index_url): 432 parts = list(map( 433 urllib.parse.unquote, link[len(self.index_url):].split('/') 434 )) 435 if len(parts) == 2 and '#' not in parts[1]: 436 # it's a package page, sanitize and index it 437 pkg = safe_name(parts[0]) 438 ver = safe_version(parts[1]) 439 self.package_pages.setdefault(pkg.lower(), {})[link] = True 440 return to_filename(pkg), to_filename(ver) 441 return None, None 442 443 # process an index page into the package-page index 444 for match in HREF.finditer(page): 445 try: 446 scan(urllib.parse.urljoin(url, htmldecode(match.group(1)))) 447 except ValueError: 448 pass 449 450 pkg, ver = scan(url) # ensure this page is in the page index 451 if pkg: 452 # process individual package page 453 for new_url in find_external_links(url, page): 454 # Process the found URL 455 base, frag = egg_info_for_url(new_url) 456 if base.endswith('.py') and not frag: 457 if ver: 458 new_url += '#egg=%s-%s' % (pkg, ver) 459 else: 460 self.need_version_info(url) 461 self.scan_url(new_url) 462 463 return PYPI_MD5.sub( 464 lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2), page 465 ) 466 else: 467 return "" # no sense double-scanning non-package pages 468 469 def need_version_info(self, url): 470 self.scan_all( 471 "Page at %s links to .py file(s) without version info; an index " 472 "scan is required.", url 473 ) 474 475 def scan_all(self, msg=None, *args): 476 if self.index_url not in self.fetched_urls: 477 if msg: 478 self.warn(msg, *args) 479 self.info( 480 "Scanning index of all packages (this may take a while)" 481 ) 482 self.scan_url(self.index_url) 483 484 def find_packages(self, requirement): 485 self.scan_url(self.index_url + requirement.unsafe_name + '/') 486 487 if not self.package_pages.get(requirement.key): 488 # Fall back to safe version of the name 489 self.scan_url(self.index_url + requirement.project_name + '/') 490 491 if not self.package_pages.get(requirement.key): 492 # We couldn't find the target package, so search the index page too 493 self.not_found_in_index(requirement) 494 495 for url in list(self.package_pages.get(requirement.key, ())): 496 # scan each page that might be related to the desired package 497 self.scan_url(url) 498 499 def obtain(self, requirement, installer=None): 500 self.prescan() 501 self.find_packages(requirement) 502 for dist in self[requirement.key]: 503 if dist in requirement: 504 return dist 505 self.debug("%s does not match %s", requirement, dist) 506 return super(PackageIndex, self).obtain(requirement, installer) 507 508 def check_hash(self, checker, filename, tfp): 509 """ 510 checker is a ContentChecker 511 """ 512 checker.report( 513 self.debug, 514 "Validating %%s checksum for %s" % filename) 515 if not checker.is_valid(): 516 tfp.close() 517 os.unlink(filename) 518 raise DistutilsError( 519 "%s validation failed for %s; " 520 "possible download problem?" 521 % (checker.hash.name, os.path.basename(filename)) 522 ) 523 524 def add_find_links(self, urls): 525 """Add `urls` to the list that will be prescanned for searches""" 526 for url in urls: 527 if ( 528 self.to_scan is None # if we have already "gone online" 529 or not URL_SCHEME(url) # or it's a local file/directory 530 or url.startswith('file:') 531 or list(distros_for_url(url)) # or a direct package link 532 ): 533 # then go ahead and process it now 534 self.scan_url(url) 535 else: 536 # otherwise, defer retrieval till later 537 self.to_scan.append(url) 538 539 def prescan(self): 540 """Scan urls scheduled for prescanning (e.g. --find-links)""" 541 if self.to_scan: 542 list(map(self.scan_url, self.to_scan)) 543 self.to_scan = None # from now on, go ahead and process immediately 544 545 def not_found_in_index(self, requirement): 546 if self[requirement.key]: # we've seen at least one distro 547 meth, msg = self.info, "Couldn't retrieve index page for %r" 548 else: # no distros seen for this name, might be misspelled 549 meth, msg = ( 550 self.warn, 551 "Couldn't find index page for %r (maybe misspelled?)") 552 meth(msg, requirement.unsafe_name) 553 self.scan_all() 554 555 def download(self, spec, tmpdir): 556 """Locate and/or download `spec` to `tmpdir`, returning a local path 557 558 `spec` may be a ``Requirement`` object, or a string containing a URL, 559 an existing local filename, or a project/version requirement spec 560 (i.e. the string form of a ``Requirement`` object). If it is the URL 561 of a .py file with an unambiguous ``#egg=name-version`` tag (i.e., one 562 that escapes ``-`` as ``_`` throughout), a trivial ``setup.py`` is 563 automatically created alongside the downloaded file. 564 565 If `spec` is a ``Requirement`` object or a string containing a 566 project/version requirement spec, this method returns the location of 567 a matching distribution (possibly after downloading it to `tmpdir`). 568 If `spec` is a locally existing file or directory name, it is simply 569 returned unchanged. If `spec` is a URL, it is downloaded to a subpath 570 of `tmpdir`, and the local filename is returned. Various errors may be 571 raised if a problem occurs during downloading. 572 """ 573 if not isinstance(spec, Requirement): 574 scheme = URL_SCHEME(spec) 575 if scheme: 576 # It's a url, download it to tmpdir 577 found = self._download_url(scheme.group(1), spec, tmpdir) 578 base, fragment = egg_info_for_url(spec) 579 if base.endswith('.py'): 580 found = self.gen_setup(found, fragment, tmpdir) 581 return found 582 elif os.path.exists(spec): 583 # Existing file or directory, just return it 584 return spec 585 else: 586 spec = parse_requirement_arg(spec) 587 return getattr(self.fetch_distribution(spec, tmpdir), 'location', None) 588 589 def fetch_distribution( 590 self, requirement, tmpdir, force_scan=False, source=False, 591 develop_ok=False, local_index=None): 592 """Obtain a distribution suitable for fulfilling `requirement` 593 594 `requirement` must be a ``pkg_resources.Requirement`` instance. 595 If necessary, or if the `force_scan` flag is set, the requirement is 596 searched for in the (online) package index as well as the locally 597 installed packages. If a distribution matching `requirement` is found, 598 the returned distribution's ``location`` is the value you would have 599 gotten from calling the ``download()`` method with the matching 600 distribution's URL or filename. If no matching distribution is found, 601 ``None`` is returned. 602 603 If the `source` flag is set, only source distributions and source 604 checkout links will be considered. Unless the `develop_ok` flag is 605 set, development and system eggs (i.e., those using the ``.egg-info`` 606 format) will be ignored. 607 """ 608 # process a Requirement 609 self.info("Searching for %s", requirement) 610 skipped = {} 611 dist = None 612 613 def find(req, env=None): 614 if env is None: 615 env = self 616 # Find a matching distribution; may be called more than once 617 618 for dist in env[req.key]: 619 620 if dist.precedence == DEVELOP_DIST and not develop_ok: 621 if dist not in skipped: 622 self.warn( 623 "Skipping development or system egg: %s", dist, 624 ) 625 skipped[dist] = 1 626 continue 627 628 test = ( 629 dist in req 630 and (dist.precedence <= SOURCE_DIST or not source) 631 ) 632 if test: 633 loc = self.download(dist.location, tmpdir) 634 dist.download_location = loc 635 if os.path.exists(dist.download_location): 636 return dist 637 638 if force_scan: 639 self.prescan() 640 self.find_packages(requirement) 641 dist = find(requirement) 642 643 if not dist and local_index is not None: 644 dist = find(requirement, local_index) 645 646 if dist is None: 647 if self.to_scan is not None: 648 self.prescan() 649 dist = find(requirement) 650 651 if dist is None and not force_scan: 652 self.find_packages(requirement) 653 dist = find(requirement) 654 655 if dist is None: 656 self.warn( 657 "No local packages or working download links found for %s%s", 658 (source and "a source distribution of " or ""), 659 requirement, 660 ) 661 else: 662 self.info("Best match: %s", dist) 663 return dist.clone(location=dist.download_location) 664 665 def fetch(self, requirement, tmpdir, force_scan=False, source=False): 666 """Obtain a file suitable for fulfilling `requirement` 667 668 DEPRECATED; use the ``fetch_distribution()`` method now instead. For 669 backward compatibility, this routine is identical but returns the 670 ``location`` of the downloaded distribution instead of a distribution 671 object. 672 """ 673 dist = self.fetch_distribution(requirement, tmpdir, force_scan, source) 674 if dist is not None: 675 return dist.location 676 return None 677 678 def gen_setup(self, filename, fragment, tmpdir): 679 match = EGG_FRAGMENT.match(fragment) 680 dists = match and [ 681 d for d in 682 interpret_distro_name(filename, match.group(1), None) if d.version 683 ] or [] 684 685 if len(dists) == 1: # unambiguous ``#egg`` fragment 686 basename = os.path.basename(filename) 687 688 # Make sure the file has been downloaded to the temp dir. 689 if os.path.dirname(filename) != tmpdir: 690 dst = os.path.join(tmpdir, basename) 691 from setuptools.command.easy_install import samefile 692 if not samefile(filename, dst): 693 shutil.copy2(filename, dst) 694 filename = dst 695 696 with open(os.path.join(tmpdir, 'setup.py'), 'w') as file: 697 file.write( 698 "from setuptools import setup\n" 699 "setup(name=%r, version=%r, py_modules=[%r])\n" 700 % ( 701 dists[0].project_name, dists[0].version, 702 os.path.splitext(basename)[0] 703 ) 704 ) 705 return filename 706 707 elif match: 708 raise DistutilsError( 709 "Can't unambiguously interpret project/version identifier %r; " 710 "any dashes in the name or version should be escaped using " 711 "underscores. %r" % (fragment, dists) 712 ) 713 else: 714 raise DistutilsError( 715 "Can't process plain .py files without an '#egg=name-version'" 716 " suffix to enable automatic setup script generation." 717 ) 718 719 dl_blocksize = 8192 720 721 def _download_to(self, url, filename): 722 self.info("Downloading %s", url) 723 # Download the file 724 fp = None 725 try: 726 checker = HashChecker.from_url(url) 727 fp = self.open_url(url) 728 if isinstance(fp, urllib.error.HTTPError): 729 raise DistutilsError( 730 "Can't download %s: %s %s" % (url, fp.code, fp.msg) 731 ) 732 headers = fp.info() 733 blocknum = 0 734 bs = self.dl_blocksize 735 size = -1 736 if "content-length" in headers: 737 # Some servers return multiple Content-Length headers :( 738 sizes = get_all_headers(headers, 'Content-Length') 739 size = max(map(int, sizes)) 740 self.reporthook(url, filename, blocknum, bs, size) 741 with open(filename, 'wb') as tfp: 742 while True: 743 block = fp.read(bs) 744 if block: 745 checker.feed(block) 746 tfp.write(block) 747 blocknum += 1 748 self.reporthook(url, filename, blocknum, bs, size) 749 else: 750 break 751 self.check_hash(checker, filename, tfp) 752 return headers 753 finally: 754 if fp: 755 fp.close() 756 757 def reporthook(self, url, filename, blocknum, blksize, size): 758 pass # no-op 759 760 def open_url(self, url, warning=None): 761 if url.startswith('file:'): 762 return local_open(url) 763 try: 764 return open_with_auth(url, self.opener) 765 except (ValueError, http_client.InvalidURL) as v: 766 msg = ' '.join([str(arg) for arg in v.args]) 767 if warning: 768 self.warn(warning, msg) 769 else: 770 raise DistutilsError('%s %s' % (url, msg)) 771 except urllib.error.HTTPError as v: 772 return v 773 except urllib.error.URLError as v: 774 if warning: 775 self.warn(warning, v.reason) 776 else: 777 raise DistutilsError("Download error for %s: %s" 778 % (url, v.reason)) 779 except http_client.BadStatusLine as v: 780 if warning: 781 self.warn(warning, v.line) 782 else: 783 raise DistutilsError( 784 '%s returned a bad status line. The server might be ' 785 'down, %s' % 786 (url, v.line) 787 ) 788 except (http_client.HTTPException, socket.error) as v: 789 if warning: 790 self.warn(warning, v) 791 else: 792 raise DistutilsError("Download error for %s: %s" 793 % (url, v)) 794 795 def _download_url(self, scheme, url, tmpdir): 796 # Determine download filename 797 # 798 name, fragment = egg_info_for_url(url) 799 if name: 800 while '..' in name: 801 name = name.replace('..', '.').replace('\\', '_') 802 else: 803 name = "__downloaded__" # default if URL has no path contents 804 805 if name.endswith('.egg.zip'): 806 name = name[:-4] # strip the extra .zip before download 807 808 filename = os.path.join(tmpdir, name) 809 810 # Download the file 811 # 812 if scheme == 'svn' or scheme.startswith('svn+'): 813 return self._download_svn(url, filename) 814 elif scheme == 'git' or scheme.startswith('git+'): 815 return self._download_git(url, filename) 816 elif scheme.startswith('hg+'): 817 return self._download_hg(url, filename) 818 elif scheme == 'file': 819 return urllib.request.url2pathname(urllib.parse.urlparse(url)[2]) 820 else: 821 self.url_ok(url, True) # raises error if not allowed 822 return self._attempt_download(url, filename) 823 824 def scan_url(self, url): 825 self.process_url(url, True) 826 827 def _attempt_download(self, url, filename): 828 headers = self._download_to(url, filename) 829 if 'html' in headers.get('content-type', '').lower(): 830 return self._download_html(url, headers, filename) 831 else: 832 return filename 833 834 def _download_html(self, url, headers, filename): 835 file = open(filename) 836 for line in file: 837 if line.strip(): 838 # Check for a subversion index page 839 if re.search(r'<title>([^- ]+ - )?Revision \d+:', line): 840 # it's a subversion index page: 841 file.close() 842 os.unlink(filename) 843 return self._download_svn(url, filename) 844 break # not an index page 845 file.close() 846 os.unlink(filename) 847 raise DistutilsError("Unexpected HTML page found at " + url) 848 849 def _download_svn(self, url, filename): 850 url = url.split('#', 1)[0] # remove any fragment for svn's sake 851 creds = '' 852 if url.lower().startswith('svn:') and '@' in url: 853 scheme, netloc, path, p, q, f = urllib.parse.urlparse(url) 854 if not netloc and path.startswith('//') and '/' in path[2:]: 855 netloc, path = path[2:].split('/', 1) 856 auth, host = urllib.parse.splituser(netloc) 857 if auth: 858 if ':' in auth: 859 user, pw = auth.split(':', 1) 860 creds = " --username=%s --password=%s" % (user, pw) 861 else: 862 creds = " --username=" + auth 863 netloc = host 864 parts = scheme, netloc, url, p, q, f 865 url = urllib.parse.urlunparse(parts) 866 self.info("Doing subversion checkout from %s to %s", url, filename) 867 os.system("svn checkout%s -q %s %s" % (creds, url, filename)) 868 return filename 869 870 @staticmethod 871 def _vcs_split_rev_from_url(url, pop_prefix=False): 872 scheme, netloc, path, query, frag = urllib.parse.urlsplit(url) 873 874 scheme = scheme.split('+', 1)[-1] 875 876 # Some fragment identification fails 877 path = path.split('#', 1)[0] 878 879 rev = None 880 if '@' in path: 881 path, rev = path.rsplit('@', 1) 882 883 # Also, discard fragment 884 url = urllib.parse.urlunsplit((scheme, netloc, path, query, '')) 885 886 return url, rev 887 888 def _download_git(self, url, filename): 889 filename = filename.split('#', 1)[0] 890 url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True) 891 892 self.info("Doing git clone from %s to %s", url, filename) 893 os.system("git clone --quiet %s %s" % (url, filename)) 894 895 if rev is not None: 896 self.info("Checking out %s", rev) 897 os.system("(cd %s && git checkout --quiet %s)" % ( 898 filename, 899 rev, 900 )) 901 902 return filename 903 904 def _download_hg(self, url, filename): 905 filename = filename.split('#', 1)[0] 906 url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True) 907 908 self.info("Doing hg clone from %s to %s", url, filename) 909 os.system("hg clone --quiet %s %s" % (url, filename)) 910 911 if rev is not None: 912 self.info("Updating to %s", rev) 913 os.system("(cd %s && hg up -C -r %s -q)" % ( 914 filename, 915 rev, 916 )) 917 918 return filename 919 920 def debug(self, msg, *args): 921 log.debug(msg, *args) 922 923 def info(self, msg, *args): 924 log.info(msg, *args) 925 926 def warn(self, msg, *args): 927 log.warn(msg, *args) 928 929 930# This pattern matches a character entity reference (a decimal numeric 931# references, a hexadecimal numeric reference, or a named reference). 932entity_sub = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub 933 934 935def decode_entity(match): 936 what = match.group(1) 937 return unescape(what) 938 939 940def htmldecode(text): 941 """Decode HTML entities in the given text.""" 942 return entity_sub(decode_entity, text) 943 944 945def socket_timeout(timeout=15): 946 def _socket_timeout(func): 947 def _socket_timeout(*args, **kwargs): 948 old_timeout = socket.getdefaulttimeout() 949 socket.setdefaulttimeout(timeout) 950 try: 951 return func(*args, **kwargs) 952 finally: 953 socket.setdefaulttimeout(old_timeout) 954 955 return _socket_timeout 956 957 return _socket_timeout 958 959 960def _encode_auth(auth): 961 """ 962 A function compatible with Python 2.3-3.3 that will encode 963 auth from a URL suitable for an HTTP header. 964 >>> str(_encode_auth('username%3Apassword')) 965 'dXNlcm5hbWU6cGFzc3dvcmQ=' 966 967 Long auth strings should not cause a newline to be inserted. 968 >>> long_auth = 'username:' + 'password'*10 969 >>> chr(10) in str(_encode_auth(long_auth)) 970 False 971 """ 972 auth_s = urllib.parse.unquote(auth) 973 # convert to bytes 974 auth_bytes = auth_s.encode() 975 # use the legacy interface for Python 2.3 support 976 encoded_bytes = base64.encodestring(auth_bytes) 977 # convert back to a string 978 encoded = encoded_bytes.decode() 979 # strip the trailing carriage return 980 return encoded.replace('\n', '') 981 982 983class Credential(object): 984 """ 985 A username/password pair. Use like a namedtuple. 986 """ 987 988 def __init__(self, username, password): 989 self.username = username 990 self.password = password 991 992 def __iter__(self): 993 yield self.username 994 yield self.password 995 996 def __str__(self): 997 return '%(username)s:%(password)s' % vars(self) 998 999 1000class PyPIConfig(configparser.RawConfigParser): 1001 def __init__(self): 1002 """ 1003 Load from ~/.pypirc 1004 """ 1005 defaults = dict.fromkeys(['username', 'password', 'repository'], '') 1006 configparser.RawConfigParser.__init__(self, defaults) 1007 1008 rc = os.path.join(os.path.expanduser('~'), '.pypirc') 1009 if os.path.exists(rc): 1010 self.read(rc) 1011 1012 @property 1013 def creds_by_repository(self): 1014 sections_with_repositories = [ 1015 section for section in self.sections() 1016 if self.get(section, 'repository').strip() 1017 ] 1018 1019 return dict(map(self._get_repo_cred, sections_with_repositories)) 1020 1021 def _get_repo_cred(self, section): 1022 repo = self.get(section, 'repository').strip() 1023 return repo, Credential( 1024 self.get(section, 'username').strip(), 1025 self.get(section, 'password').strip(), 1026 ) 1027 1028 def find_credential(self, url): 1029 """ 1030 If the URL indicated appears to be a repository defined in this 1031 config, return the credential for that repository. 1032 """ 1033 for repository, cred in self.creds_by_repository.items(): 1034 if url.startswith(repository): 1035 return cred 1036 1037 1038def open_with_auth(url, opener=urllib.request.urlopen): 1039 """Open a urllib2 request, handling HTTP authentication""" 1040 1041 scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url) 1042 1043 # Double scheme does not raise on Mac OS X as revealed by a 1044 # failing test. We would expect "nonnumeric port". Refs #20. 1045 if netloc.endswith(':'): 1046 raise http_client.InvalidURL("nonnumeric port: ''") 1047 1048 if scheme in ('http', 'https'): 1049 auth, host = urllib.parse.splituser(netloc) 1050 else: 1051 auth = None 1052 1053 if not auth: 1054 cred = PyPIConfig().find_credential(url) 1055 if cred: 1056 auth = str(cred) 1057 info = cred.username, url 1058 log.info('Authenticating as %s for %s (from .pypirc)', *info) 1059 1060 if auth: 1061 auth = "Basic " + _encode_auth(auth) 1062 parts = scheme, host, path, params, query, frag 1063 new_url = urllib.parse.urlunparse(parts) 1064 request = urllib.request.Request(new_url) 1065 request.add_header("Authorization", auth) 1066 else: 1067 request = urllib.request.Request(url) 1068 1069 request.add_header('User-Agent', user_agent) 1070 fp = opener(request) 1071 1072 if auth: 1073 # Put authentication info back into request URL if same host, 1074 # so that links found on the page will work 1075 s2, h2, path2, param2, query2, frag2 = urllib.parse.urlparse(fp.url) 1076 if s2 == scheme and h2 == host: 1077 parts = s2, netloc, path2, param2, query2, frag2 1078 fp.url = urllib.parse.urlunparse(parts) 1079 1080 return fp 1081 1082 1083# adding a timeout to avoid freezing package_index 1084open_with_auth = socket_timeout(_SOCKET_TIMEOUT)(open_with_auth) 1085 1086 1087def fix_sf_url(url): 1088 return url # backward compatibility 1089 1090 1091def local_open(url): 1092 """Read a local path, with special support for directories""" 1093 scheme, server, path, param, query, frag = urllib.parse.urlparse(url) 1094 filename = urllib.request.url2pathname(path) 1095 if os.path.isfile(filename): 1096 return urllib.request.urlopen(url) 1097 elif path.endswith('/') and os.path.isdir(filename): 1098 files = [] 1099 for f in os.listdir(filename): 1100 filepath = os.path.join(filename, f) 1101 if f == 'index.html': 1102 with open(filepath, 'r') as fp: 1103 body = fp.read() 1104 break 1105 elif os.path.isdir(filepath): 1106 f += '/' 1107 files.append('<a href="{name}">{name}</a>'.format(name=f)) 1108 else: 1109 tmpl = ( 1110 "<html><head><title>{url}</title>" 1111 "</head><body>{files}</body></html>") 1112 body = tmpl.format(url=url, files='\n'.join(files)) 1113 status, message = 200, "OK" 1114 else: 1115 status, message, body = 404, "Path not found", "Not found" 1116 1117 headers = {'content-type': 'text/html'} 1118 body_stream = six.StringIO(body) 1119 return urllib.error.HTTPError(url, status, message, headers, body_stream) 1120