"""PyPI and direct package downloading"""
import sys
import os
import re
import io
import shutil
import socket
import subprocess
import base64
import hashlib
import itertools
import warnings
import configparser
import html
import http.client
import urllib.parse
import urllib.request
import urllib.error
from functools import wraps

import setuptools
from pkg_resources import (
    CHECKOUT_DIST, Distribution, BINARY_DIST, normalize_path, SOURCE_DIST,
    Environment, find_distributions, safe_name, safe_version,
    to_filename, Requirement, DEVELOP_DIST, EGG_DIST, parse_version,
)
from distutils import log
from distutils.errors import DistutilsError
from fnmatch import translate
from setuptools.wheel import Wheel
from setuptools.extern.more_itertools import unique_everseen


EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.+!]+)$')
HREF = re.compile(r"""href\s*=\s*['"]?([^'"> ]+)""", re.I)
PYPI_MD5 = re.compile(
    r'<a href="([^"#]+)">([^<]+)</a>\n\s+\(<a (?:title="MD5 hash"\n\s+)'
    r'href="[^?]+\?:action=show_md5&digest=([0-9a-f]{32})">md5</a>\)'
)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match
EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split()

__all__ = [
    'PackageIndex', 'distros_for_url', 'parse_bdist_wininst',
    'interpret_distro_name',
]

_SOCKET_TIMEOUT = 15

_tmpl = "setuptools/{setuptools.__version__} Python-urllib/{py_major}"
user_agent = _tmpl.format(
    py_major='{}.{}'.format(*sys.version_info), setuptools=setuptools)


def parse_requirement_arg(spec):
    """Parse `spec` as a requirement, raising DistutilsError if invalid."""
    try:
        return Requirement.parse(spec)
    except ValueError as e:
        raise DistutilsError(
            "Not a URL, existing file, or requirement spec: %r" % (spec,)
        ) from e


def parse_bdist_wininst(name):
    """Return (base, py_ver, plat) for a possible bdist_wininst .exe name

    All three items are ``None`` when `name` is not recognized; `py_ver`
    is ``None`` when the name carries no ``-pyX.Y`` suffix.
    """

    lower = name.lower()
    base, py_ver, plat = None, None, None

    if lower.endswith('.exe'):
        if lower.endswith('.win32.exe'):
            base = name[:-10]
            plat = 'win32'
        elif lower.startswith('.win32-py', -16):
            py_ver = name[-7:-4]
            base = name[:-16]
            plat = 'win32'
        elif lower.endswith('.win-amd64.exe'):
            base = name[:-14]
            plat = 'win-amd64'
        elif lower.startswith('.win-amd64-py', -20):
            py_ver = name[-7:-4]
            base = name[:-20]
            plat = 'win-amd64'
    return base, py_ver, plat


def egg_info_for_url(url):
    """Return (basename, fragment) extracted from `url`"""
    parts = urllib.parse.urlparse(url)
    scheme, server, path, parameters, query, fragment = parts
    base = urllib.parse.unquote(path.split('/')[-1])
    if server == 'sourceforge.net' and base == 'download':  # XXX Yuck
        base = urllib.parse.unquote(path.split('/')[-2])
    if '#' in base:
        # an escaped '#' in the path only shows up after unquoting
        base, fragment = base.split('#', 1)
    return base, fragment


def distros_for_url(url, metadata=None):
    """Yield egg or source distribution objects that might be found at a URL"""
    base, fragment = egg_info_for_url(url)
    for dist in distros_for_location(url, base, metadata):
        yield dist
    if fragment:
        match = EGG_FRAGMENT.match(fragment)
        if match:
            for dist in interpret_distro_name(
                url, match.group(1), metadata, precedence=CHECKOUT_DIST
            ):
                yield dist


def distros_for_location(location, basename, metadata=None):
    """Yield egg or source distribution objects based on basename"""
    if basename.endswith('.egg.zip'):
        basename = basename[:-4]  # strip the .zip
    if basename.endswith('.egg') and '-' in basename:
        # only one, unambiguous interpretation
        return [Distribution.from_location(location, basename, metadata)]
    if basename.endswith('.whl') and '-' in basename:
        wheel = Wheel(basename)
        if not wheel.is_compatible():
            return []
        return [Distribution(
            location=location,
            project_name=wheel.project_name,
            version=wheel.version,
            # Increase priority over eggs.
            precedence=EGG_DIST + 1,
        )]
    if basename.endswith('.exe'):
        win_base, py_ver, platform = parse_bdist_wininst(basename)
        if win_base is not None:
            return interpret_distro_name(
                location, win_base, metadata, py_ver, BINARY_DIST, platform
            )
    # Try source distro extensions (.zip, .tgz, etc.)
    #
    for ext in EXTENSIONS:
        if basename.endswith(ext):
            basename = basename[:-len(ext)]
            return interpret_distro_name(location, basename, metadata)
    return []  # no extension matched


def distros_for_filename(filename, metadata=None):
    """Yield possible egg or source distribution objects based on a filename"""
    return distros_for_location(
        normalize_path(filename), os.path.basename(filename), metadata
    )


def interpret_distro_name(
    location, basename, metadata, py_version=None, precedence=SOURCE_DIST,
    platform=None
):
    """Generate alternative interpretations of a source distro name

    Note: if `location` is a filesystem filename, you should call
    ``pkg_resources.normalize_path()`` on it before passing it to this
    routine!
    """
    # Generate alternative interpretations of a source distro name
    # Because some packages are ambiguous as to name/versions split
    # e.g. "adns-python-1.1.0", "egenix-mx-commercial", etc.
    # So, we generate each possible interpretation (e.g. "adns, python-1.1.0"
    # "adns-python, 1.1.0", and "adns-python-1.1.0, no version").  In practice,
    # the spurious interpretations should be ignored, because in the event
    # there's also an "adns" package, the spurious "python-1.1.0" version will
    # compare lower than any numeric version number, and is therefore unlikely
    # to match a request for it.  It's still a potential problem, though, and
    # in the long run PyPI and the distutils should go for "safe" names and
    # versions in distribution archive names (sdist and bdist).

    parts = basename.split('-')
    if not py_version and any(re.match(r'py\d\.\d$', p) for p in parts[2:]):
        # it is a bdist_dumb, not an sdist -- bail out
        return

    for p in range(1, len(parts) + 1):
        yield Distribution(
            location, metadata, '-'.join(parts[:p]), '-'.join(parts[p:]),
            py_version=py_version, precedence=precedence,
            platform=platform
        )


def unique_values(func):
    """
    Wrap a function returning an iterable such that the resulting iterable
    only ever yields unique items.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        return unique_everseen(func(*args, **kwargs))

    return wrapper


REL = re.compile(r"""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
# this line is here to fix emacs' cruddy broken syntax highlighting


@unique_values
def find_external_links(url, page):
    """Find rel="homepage" and rel="download" links in `page`, yielding URLs"""

    for rel_match in REL.finditer(page):
        tag, rel = rel_match.groups()
        rels = set(map(str.strip, rel.lower().split(',')))
        if 'homepage' in rels or 'download' in rels:
            for href_match in HREF.finditer(tag):
                yield urllib.parse.urljoin(
                    url, htmldecode(href_match.group(1)))

    for tag in ("<th>Home Page", "<th>Download URL"):
        pos = page.find(tag)
        if pos != -1:
            match = HREF.search(page, pos)
            if match:
                yield urllib.parse.urljoin(url, htmldecode(match.group(1)))


class ContentChecker:
    """
    A null content checker that defines the interface for checking content
    """

    def feed(self, block):
        """
        Feed a block of data to the hash.
        """
        return

    def is_valid(self):
        """
        Check the hash. Return False if validation fails.
        """
        return True

    def report(self, reporter, template):
        """
        Call reporter with information about the checker (hash name)
        substituted into the template.
        """
        return


class HashChecker(ContentChecker):
    """A ContentChecker that compares a hashlib digest with `expected`."""

    pattern = re.compile(
        r'(?P<hash_name>sha1|sha224|sha384|sha256|sha512|md5)='
        r'(?P<expected>[a-f0-9]+)'
    )

    def __init__(self, hash_name, expected):
        self.hash_name = hash_name
        self.hash = hashlib.new(hash_name)
        self.expected = expected

    @classmethod
    def from_url(cls, url):
        "Construct a (possibly null) ContentChecker from a URL"
        fragment = urllib.parse.urlparse(url)[-1]
        if not fragment:
            return ContentChecker()
        match = cls.pattern.search(fragment)
        if not match:
            return ContentChecker()
        return cls(**match.groupdict())

    def feed(self, block):
        self.hash.update(block)

    def is_valid(self):
        return self.hash.hexdigest() == self.expected

    def report(self, reporter, template):
        msg = template % self.hash_name
        return reporter(msg)


class PackageIndex(Environment):
    """A distribution index that scans web pages for download URLs"""

    def __init__(
        self, index_url="https://pypi.org/simple/", hosts=('*',),
        ca_bundle=None, verify_ssl=True, *args, **kw
    ):
        """Set up scan bookkeeping for the index rooted at `index_url`.

        `hosts` is a sequence of fnmatch-style patterns naming the hosts
        that may be contacted.  `ca_bundle` and `verify_ssl` are accepted
        but not used by this method.
        """
        super().__init__(*args, **kw)
        # the index URL is always stored with a trailing slash
        if index_url.endswith('/'):
            self.index_url = index_url
        else:
            self.index_url = index_url + "/"
        self.scanned_urls = {}
        self.fetched_urls = {}
        self.package_pages = {}
        self.allows = re.compile('|'.join(map(translate, hosts))).match
        self.to_scan = []
        self.opener = urllib.request.urlopen

    def add(self, dist):
        """Add `dist` to the index unless its version cannot be parsed."""
        # ignore invalid versions
        try:
            parse_version(dist.version)
        except Exception:
            return
        return super().add(dist)

    # FIXME: 'PackageIndex.process_url' is too complex (14)
    def process_url(self, url, retrieve=False):  # noqa: C901
        """Evaluate a URL as a possible download, and maybe retrieve it"""
        if url in self.scanned_urls and not retrieve:
            return
        self.scanned_urls[url] = True
        if not URL_SCHEME(url):
            self.process_filename(url)
            return
        else:
            dists = list(distros_for_url(url))
            if dists:
                if not self.url_ok(url):
                    return
                self.debug("Found link: %s", url)

        if dists or not retrieve or url in self.fetched_urls:
            list(map(self.add, dists))
            return  # don't need the actual page

        if not self.url_ok(url):
            self.fetched_urls[url] = True
            return

        self.info("Reading %s", url)
        self.fetched_urls[url] = True  # prevent multiple fetch attempts
        tmpl = "Download error on %s: %%s -- Some packages may not be found!"
        # The resulting string is %-formatted a second time by the logger,
        # so any literal '%' in the URL (e.g. '%20') must be doubled here.
        f = self.open_url(url, tmpl % url.replace('%', '%%'))
        if f is None:
            return
        if isinstance(f, urllib.error.HTTPError) and f.code == 401:
            self.info("Authentication error: %s" % f.msg)
        self.fetched_urls[f.url] = True
        if 'html' not in f.headers.get('content-type', '').lower():
            f.close()  # not html, we can't process it
            return

        base = f.url  # handle redirects
        page = f.read()
        if not isinstance(page, str):
            # In Python 3 and got bytes but want str.
            if isinstance(f, urllib.error.HTTPError):
                # Errors have no charset, assume latin1:
                charset = 'latin-1'
            else:
                charset = f.headers.get_param('charset') or 'latin-1'
            page = page.decode(charset, "ignore")
        f.close()
        for match in HREF.finditer(page):
            link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
            self.process_url(link)
        if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
            # called for its side effects; the rewritten page is not needed
            self.process_index(url, page)

    def process_filename(self, fn, nested=False):
        """Register distributions found at a local file or directory `fn`."""
        # process filenames or directories
        if not os.path.exists(fn):
            self.warn("Not found: %s", fn)
            return

        if os.path.isdir(fn) and not nested:
            path = os.path.realpath(fn)
            for item in os.listdir(path):
                self.process_filename(os.path.join(path, item), True)

        dists = distros_for_filename(fn)
        if dists:
            self.debug("Found: %s", fn)
            list(map(self.add, dists))

    def url_ok(self, url, fatal=False):
        """Return True if `url` is a file: URL or names an allowed host.

        Otherwise warn and return None, or raise DistutilsError when
        `fatal` is true.
        """
        s = URL_SCHEME(url)
        is_file = s and s.group(1).lower() == 'file'
        if is_file or self.allows(urllib.parse.urlparse(url)[1]):
            return True
        msg = (
            "\nNote: Bypassing %s (disallowed host; see "
            "http://bit.ly/2hrImnY for details).\n")
        if fatal:
            raise DistutilsError(msg % url)
        else:
            self.warn(msg, url)

    def scan_egg_links(self, search_path):
        """Scan every ``.egg-link`` file in the directories of `search_path`."""
        dirs = filter(os.path.isdir, search_path)
        egg_links = (
            (path, entry)
            for path in dirs
            for entry in os.listdir(path)
            if entry.endswith('.egg-link')
        )
        list(itertools.starmap(self.scan_egg_link, egg_links))

    def scan_egg_link(self, path, entry):
        """Add the distributions referenced by one ``.egg-link`` file."""
        with open(os.path.join(path, entry)) as raw_lines:
            # filter non-empty lines
            lines = list(filter(None, map(str.strip, raw_lines)))

        if len(lines) != 2:
            # format is not recognized; punt
            return

        egg_path, setup_path = lines

        for dist in find_distributions(os.path.join(path, egg_path)):
            dist.location = os.path.join(path, *lines)
            dist.precedence = SOURCE_DIST
            self.add(dist)

    def _scan(self, link):
        """Record `link` if it is a package page; return (pkg, ver) names.

        Returns ``(None, None)`` when `link` is not a package page of this
        index.
        """
        # Process a URL to see if it's for a package page
        NO_MATCH_SENTINEL = None, None
        if not link.startswith(self.index_url):
            return NO_MATCH_SENTINEL

        parts = list(map(
            urllib.parse.unquote, link[len(self.index_url):].split('/')
        ))
        if len(parts) != 2 or '#' in parts[1]:
            return NO_MATCH_SENTINEL

        # it's a package page, sanitize and index it
        pkg = safe_name(parts[0])
        ver = safe_version(parts[1])
        self.package_pages.setdefault(pkg.lower(), {})[link] = True
        return to_filename(pkg), to_filename(ver)

    def process_index(self, url, page):
        """Process the contents of a PyPI page"""

        # process an index page into the package-page index
        for match in HREF.finditer(page):
            try:
                self._scan(urllib.parse.urljoin(url, htmldecode(match.group(1))))
            except ValueError:
                pass

        pkg, ver = self._scan(url)  # ensure this page is in the page index
        if not pkg:
            return ""  # no sense double-scanning non-package pages

        # process individual package page
        for new_url in find_external_links(url, page):
            # Process the found URL
            base, frag = egg_info_for_url(new_url)
            if base.endswith('.py') and not frag:
                if ver:
                    new_url += '#egg=%s-%s' % (pkg, ver)
                else:
                    self.need_version_info(url)
            self.scan_url(new_url)

        return PYPI_MD5.sub(
            lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2), page
        )

    def need_version_info(self, url):
        """Trigger a full index scan for a page linking unversioned .py files."""
        self.scan_all(
            "Page at %s links to .py file(s) without version info; an index "
            "scan is required.", url
        )

    def scan_all(self, msg=None, *args):
        """Scan the index root, warning with `msg` first if not yet fetched."""
        if self.index_url not in self.fetched_urls:
            if msg:
                self.warn(msg, *args)
            self.info(
                "Scanning index of all packages (this may take a while)"
            )
        self.scan_url(self.index_url)

    def find_packages(self, requirement):
        """Scan the index pages that may describe `requirement`."""
        self.scan_url(self.index_url + requirement.unsafe_name + '/')

        if not self.package_pages.get(requirement.key):
            # Fall back to safe version of the name
            self.scan_url(self.index_url + requirement.project_name + '/')

        if not self.package_pages.get(requirement.key):
            # We couldn't find the target package, so search the index page too
            self.not_found_in_index(requirement)

        for url in list(self.package_pages.get(requirement.key, ())):
            # scan each page that might be related to the desired package
            self.scan_url(url)

    def obtain(self, requirement, installer=None):
        """Return an indexed distribution matching `requirement`.

        Falls back to the base class ``obtain`` when none matches.
        """
        self.prescan()
        self.find_packages(requirement)
        for dist in self[requirement.key]:
            if dist in requirement:
                return dist
            self.debug("%s does not match %s", requirement, dist)
        return super().obtain(requirement, installer)

    def check_hash(self, checker, filename, tfp):
        """
        checker is a ContentChecker

        On a failed check, `tfp` is closed, `filename` is removed and
        DistutilsError is raised.
        """
        checker.report(
            self.debug,
            "Validating %%s checksum for %s" % filename)
        if not checker.is_valid():
            tfp.close()
            os.unlink(filename)
            raise DistutilsError(
                "%s validation failed for %s; "
                "possible download problem?"
                % (checker.hash.name, os.path.basename(filename))
            )

    def add_find_links(self, urls):
        """Add `urls` to the list that will be prescanned for searches"""
        for url in urls:
            if (
                self.to_scan is None  # if we have already "gone online"
                or not URL_SCHEME(url)  # or it's a local file/directory
                or url.startswith('file:')
                or list(distros_for_url(url))  # or a direct package link
            ):
                # then go ahead and process it now
                self.scan_url(url)
            else:
                # otherwise, defer retrieval till later
                self.to_scan.append(url)

    def prescan(self):
        """Scan urls scheduled for prescanning (e.g. --find-links)"""
        if self.to_scan:
            list(map(self.scan_url, self.to_scan))
        self.to_scan = None  # from now on, go ahead and process immediately

    def not_found_in_index(self, requirement):
        """Report a missing index page for `requirement`, then scan the index."""
        if self[requirement.key]:  # we've seen at least one distro
            meth, msg = self.info, "Couldn't retrieve index page for %r"
        else:  # no distros seen for this name, might be misspelled
            meth, msg = (
                self.warn,
                "Couldn't find index page for %r (maybe misspelled?)")
        meth(msg, requirement.unsafe_name)
        self.scan_all()

    def download(self, spec, tmpdir):
        """Locate and/or download `spec` to `tmpdir`, returning a local path

        `spec` may be a ``Requirement`` object, or a string containing a URL,
        an existing local filename, or a project/version requirement spec
        (i.e. the string form of a ``Requirement`` object).  If it is the URL
        of a .py file with an unambiguous ``#egg=name-version`` tag (i.e., one
        that escapes ``-`` as ``_`` throughout), a trivial ``setup.py`` is
        automatically created alongside the downloaded file.

        If `spec` is a ``Requirement`` object or a string containing a
        project/version requirement spec, this method returns the location of
        a matching distribution (possibly after downloading it to `tmpdir`).
        If `spec` is a locally existing file or directory name, it is simply
        returned unchanged.  If `spec` is a URL, it is downloaded to a subpath
        of `tmpdir`, and the local filename is returned.  Various errors may be
        raised if a problem occurs during downloading.
        """
        if not isinstance(spec, Requirement):
            scheme = URL_SCHEME(spec)
            if scheme:
                # It's a url, download it to tmpdir
                found = self._download_url(scheme.group(1), spec, tmpdir)
                base, fragment = egg_info_for_url(spec)
                if base.endswith('.py'):
                    found = self.gen_setup(found, fragment, tmpdir)
                return found
            elif os.path.exists(spec):
                # Existing file or directory, just return it
                return spec
            else:
                spec = parse_requirement_arg(spec)
        return getattr(self.fetch_distribution(spec, tmpdir), 'location', None)

    def fetch_distribution(  # noqa: C901  # is too complex (14)  # FIXME
            self, requirement, tmpdir, force_scan=False, source=False,
            develop_ok=False, local_index=None):
        """Obtain a distribution suitable for fulfilling `requirement`

        `requirement` must be a ``pkg_resources.Requirement`` instance.
        If necessary, or if the `force_scan` flag is set, the requirement is
        searched for in the (online) package index as well as the locally
        installed packages.  If a distribution matching `requirement` is found,
        the returned distribution's ``location`` is the value you would have
        gotten from calling the ``download()`` method with the matching
        distribution's URL or filename.  If no matching distribution is found,
        ``None`` is returned.

        If the `source` flag is set, only source distributions and source
        checkout links will be considered.  Unless the `develop_ok` flag is
        set, development and system eggs (i.e., those using the ``.egg-info``
        format) will be ignored.
        """
        # process a Requirement
        self.info("Searching for %s", requirement)
        skipped = {}
        dist = None

        def find(req, env=None):
            if env is None:
                env = self
            # Find a matching distribution; may be called more than once

            for dist in env[req.key]:

                if dist.precedence == DEVELOP_DIST and not develop_ok:
                    if dist not in skipped:
                        self.warn(
                            "Skipping development or system egg: %s", dist,
                        )
                        skipped[dist] = 1
                    continue

                test = (
                    dist in req
                    and (dist.precedence <= SOURCE_DIST or not source)
                )
                if test:
                    loc = self.download(dist.location, tmpdir)
                    dist.download_location = loc
                    if os.path.exists(dist.download_location):
                        return dist

        if force_scan:
            self.prescan()
            self.find_packages(requirement)
            dist = find(requirement)

        if not dist and local_index is not None:
            dist = find(requirement, local_index)

        if dist is None:
            if self.to_scan is not None:
                self.prescan()
            dist = find(requirement)

        if dist is None and not force_scan:
            self.find_packages(requirement)
            dist = find(requirement)

        if dist is None:
            self.warn(
                "No local packages or working download links found for %s%s",
                "a source distribution of " if source else "",
                requirement,
            )
        else:
            self.info("Best match: %s", dist)
            return dist.clone(location=dist.download_location)

    def fetch(self, requirement, tmpdir, force_scan=False, source=False):
        """Obtain a file suitable for fulfilling `requirement`

        DEPRECATED; use the ``fetch_distribution()`` method now instead.  For
        backward compatibility, this routine is identical but returns the
        ``location`` of the downloaded distribution instead of a distribution
        object.
        """
        dist = self.fetch_distribution(requirement, tmpdir, force_scan, source)
        if dist is not None:
            return dist.location
        return None

    def gen_setup(self, filename, fragment, tmpdir):
        """Write a trivial ``setup.py`` into `tmpdir` for a lone .py file.

        `fragment` must be an unambiguous ``egg=name-version`` tag;
        DistutilsError is raised otherwise.  Returns the path of the .py
        file, which is copied into `tmpdir` if it is not already there.
        """
        match = EGG_FRAGMENT.match(fragment)
        if match:
            dists = [
                d for d in
                interpret_distro_name(filename, match.group(1), None)
                if d.version
            ]
        else:
            dists = []

        if len(dists) == 1:  # unambiguous ``#egg`` fragment
            basename = os.path.basename(filename)

            # Make sure the file has been downloaded to the temp dir.
            if os.path.dirname(filename) != tmpdir:
                dst = os.path.join(tmpdir, basename)
                if not (os.path.exists(dst) and os.path.samefile(filename, dst)):
                    shutil.copy2(filename, dst)
                    filename = dst

            with open(os.path.join(tmpdir, 'setup.py'), 'w') as file:
                file.write(
                    "from setuptools import setup\n"
                    "setup(name=%r, version=%r, py_modules=[%r])\n"
                    % (
                        dists[0].project_name, dists[0].version,
                        os.path.splitext(basename)[0]
                    )
                )
            return filename

        elif match:
            raise DistutilsError(
                "Can't unambiguously interpret project/version identifier %r; "
                "any dashes in the name or version should be escaped using "
                "underscores. %r" % (fragment, dists)
            )
        else:
            raise DistutilsError(
                "Can't process plain .py files without an '#egg=name-version'"
                " suffix to enable automatic setup script generation."
            )

    dl_blocksize = 8192

    def _download_to(self, url, filename):
        """Stream `url` into `filename`, verify its hash, return the headers."""
        self.info("Downloading %s", url)
        # Download the file
        fp = None
        try:
            checker = HashChecker.from_url(url)
            fp = self.open_url(url)
            if isinstance(fp, urllib.error.HTTPError):
                raise DistutilsError(
                    "Can't download %s: %s %s" % (url, fp.code, fp.msg)
                )
            headers = fp.info()
            blocknum = 0
            bs = self.dl_blocksize
            size = -1
            if "content-length" in headers:
                # Some servers return multiple Content-Length headers :(
                sizes = headers.get_all('Content-Length')
                size = max(map(int, sizes))
                self.reporthook(url, filename, blocknum, bs, size)
            with open(filename, 'wb') as tfp:
                while True:
                    block = fp.read(bs)
                    if block:
                        checker.feed(block)
                        tfp.write(block)
                        blocknum += 1
                        self.reporthook(url, filename, blocknum, bs, size)
                    else:
                        break
                self.check_hash(checker, filename, tfp)
            return headers
        finally:
            if fp:
                fp.close()

    def reporthook(self, url, filename, blocknum, blksize, size):
        """Download progress hook; does nothing by default."""
        pass  # no-op

    # FIXME:
    def open_url(self, url, warning=None):  # noqa: C901  # is too complex (12)
        """Open `url`, returning a response object (or an HTTPError).

        When a download problem occurs and `warning` (a %-format string
        taking the error detail) is given, the problem is logged and None
        is returned; without `warning`, DistutilsError is raised.
        """
        if url.startswith('file:'):
            return local_open(url)
        try:
            return open_with_auth(url, self.opener)
        except (ValueError, http.client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            if warning:
                self.warn(warning, msg)
            else:
                raise DistutilsError('%s %s' % (url, msg)) from v
        except urllib.error.HTTPError as v:
            return v
        except urllib.error.URLError as v:
            if warning:
                self.warn(warning, v.reason)
            else:
                raise DistutilsError("Download error for %s: %s"
                                     % (url, v.reason)) from v
        except http.client.BadStatusLine as v:
            if warning:
                self.warn(warning, v.line)
            else:
                raise DistutilsError(
                    '%s returned a bad status line. The server might be '
                    'down, %s' %
                    (url, v.line)
                ) from v
        except (http.client.HTTPException, socket.error) as v:
            if warning:
                self.warn(warning, v)
            else:
                raise DistutilsError("Download error for %s: %s"
                                     % (url, v)) from v

    def _download_url(self, scheme, url, tmpdir):
        """Fetch `url` into `tmpdir` using the handler for `scheme`.

        Returns the local path.  Raises DistutilsError if the name derived
        from `url` would be written outside `tmpdir` or the host is not
        allowed.
        """
        # Determine download filename
        #
        name, fragment = egg_info_for_url(url)
        if name:
            while '..' in name:
                name = name.replace('..', '.').replace('\\', '_')
        else:
            name = "__downloaded__"  # default if URL has no path contents

        if name.endswith('.egg.zip'):
            name = name[:-4]  # strip the extra .zip before download

        filename = os.path.join(tmpdir, name)

        # The name is unquoted URL text, so an escaped '/' can make it an
        # absolute path, which os.path.join would let replace tmpdir.
        if not filename.startswith(str(tmpdir)):
            raise DistutilsError("Invalid filename %s" % filename)

        # Download the file
        #
        if scheme == 'svn' or scheme.startswith('svn+'):
            return self._download_svn(url, filename)
        elif scheme == 'git' or scheme.startswith('git+'):
            return self._download_git(url, filename)
        elif scheme.startswith('hg+'):
            return self._download_hg(url, filename)
        elif scheme == 'file':
            return urllib.request.url2pathname(urllib.parse.urlparse(url)[2])
        else:
            self.url_ok(url, True)  # raises error if not allowed
            return self._attempt_download(url, filename)

    def scan_url(self, url):
        """Process `url`, retrieving the page if necessary."""
        self.process_url(url, True)

    def _attempt_download(self, url, filename):
        """Download `url`; hand HTML responses to ``_download_html``."""
        headers = self._download_to(url, filename)
        if 'html' in headers.get('content-type', '').lower():
            return self._download_html(url, headers, filename)
        else:
            return filename

    def _download_html(self, url, headers, filename):
        """Handle a download that turned out to be an HTML page.

        The saved page is removed.  If its first non-blank line looks like
        a subversion index page, a checkout is attempted; otherwise
        DistutilsError is raised.
        """
        with open(filename) as file:
            # only the first non-blank line is inspected
            first_line = next((line for line in file if line.strip()), '')
        os.unlink(filename)
        # Check for a subversion index page
        if re.search(r'<title>([^- ]+ - )?Revision \d+:', first_line):
            # it's a subversion index page:
            return self._download_svn(url, filename)
        raise DistutilsError("Unexpected HTML page found at " + url)

    def _run_vcs_command(self, argv):
        """Run an external VCS command given as an argument list.

        No shell is involved, so URL-derived text cannot be interpreted as
        shell syntax.  Returns the exit status, or None (after a warning)
        if the program could not be started.
        """
        try:
            return subprocess.call(argv)
        except OSError as exc:
            self.warn("Unable to run %s: %s", argv[0], exc)
            return None

    def _download_svn(self, url, filename):
        """Check out the subversion repository at `url` into `filename`."""
        warnings.warn("SVN download support is deprecated", UserWarning)
        url = url.split('#', 1)[0]  # remove any fragment for svn's sake
        creds = []
        if url.lower().startswith('svn:') and '@' in url:
            scheme, netloc, path, p, q, f = urllib.parse.urlparse(url)
            if not netloc and path.startswith('//') and '/' in path[2:]:
                netloc, path = path[2:].split('/', 1)
                auth, host = _splituser(netloc)
                if auth:
                    if ':' in auth:
                        user, pw = auth.split(':', 1)
                        creds = ["--username=" + user, "--password=" + pw]
                    else:
                        creds = ["--username=" + auth]
                    netloc = host
                    # rebuild from the path component, not the whole URL
                    parts = scheme, netloc, path, p, q, f
                    url = urllib.parse.urlunparse(parts)
        self.info("Doing subversion checkout from %s to %s", url, filename)
        self._run_vcs_command(
            ["svn", "checkout"] + creds + ["-q", url, filename])
        return filename

    @staticmethod
    def _vcs_split_rev_from_url(url, pop_prefix=False):
        """Return (url, rev) with any ``vcs+`` prefix and ``@rev`` removed.

        The fragment is discarded; `rev` is None when the path has no
        ``@``.  `pop_prefix` is accepted but not used.
        """
        scheme, netloc, path, query, frag = urllib.parse.urlsplit(url)

        scheme = scheme.split('+', 1)[-1]

        # Some fragment identification fails
        path = path.split('#', 1)[0]

        rev = None
        if '@' in path:
            path, rev = path.rsplit('@', 1)

        # Also, discard fragment
        url = urllib.parse.urlunsplit((scheme, netloc, path, query, ''))

        return url, rev

    def _download_git(self, url, filename):
        """Clone the git repository at `url` into `filename`."""
        filename = filename.split('#', 1)[0]
        url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True)

        self.info("Doing git clone from %s to %s", url, filename)
        self._run_vcs_command(["git", "clone", "--quiet", url, filename])

        if rev is not None:
            self.info("Checking out %s", rev)
            self._run_vcs_command(
                ["git", "-C", filename, "checkout", "--quiet", rev])

        return filename

    def _download_hg(self, url, filename):
        """Clone the mercurial repository at `url` into `filename`."""
        filename = filename.split('#', 1)[0]
        url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True)

        self.info("Doing hg clone from %s to %s", url, filename)
        self._run_vcs_command(["hg", "clone", "--quiet", url, filename])

        if rev is not None:
            self.info("Updating to %s", rev)
            self._run_vcs_command(
                ["hg", "--cwd", filename, "up", "-C", "-r", rev, "-q"])

        return filename

    def debug(self, msg, *args):
        log.debug(msg, *args)

    def info(self, msg, *args):
        log.info(msg, *args)

    def warn(self, msg, *args):
        log.warn(msg, *args)


# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
entity_sub = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub


def decode_entity(match):
    """Return the unescaped text for one matched entity reference."""
    what = match.group(0)
    return html.unescape(what)


def htmldecode(text):
    """
    Decode HTML entities in the given text.

    >>> htmldecode(
    ...     'https://../package_name-0.1.2.tar.gz'
    ...     '?tokena=A&amp;tokenb=B">package_name-0.1.2.tar.gz')
    'https://../package_name-0.1.2.tar.gz?tokena=A&tokenb=B">package_name-0.1.2.tar.gz'
    """
    return entity_sub(decode_entity, text)


def socket_timeout(timeout=15):
    """Return a decorator that runs a function under a default socket timeout.

    The previous default timeout is restored when the call finishes.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(timeout)
            try:
                return func(*args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)

        return wrapper

    return decorator


def _encode_auth(auth):
    """
    Encode auth from a URL suitable for an HTTP header.
    >>> str(_encode_auth('username%3Apassword'))
    'dXNlcm5hbWU6cGFzc3dvcmQ='

    Long auth strings should not cause a newline to be inserted.
    >>> long_auth = 'username:' + 'password'*10
    >>> chr(10) in str(_encode_auth(long_auth))
    False
    """
    auth_s = urllib.parse.unquote(auth)
    # convert to bytes
    auth_bytes = auth_s.encode()
    encoded_bytes = base64.b64encode(auth_bytes)
    # convert back to a string
    encoded = encoded_bytes.decode()
    # remove any newline characters
    return encoded.replace('\n', '')


class Credential:
    """
    A username/password pair. Use like a namedtuple.
    """

    def __init__(self, username, password):
        self.username = username
        self.password = password

    def __iter__(self):
        yield self.username
        yield self.password

    def __str__(self):
        return '%(username)s:%(password)s' % vars(self)


class PyPIConfig(configparser.RawConfigParser):
    """Credentials read from the user's ``.pypirc`` file."""

    def __init__(self):
        """
        Load from ~/.pypirc
        """
        defaults = dict.fromkeys(['username', 'password', 'repository'], '')
        super().__init__(defaults)

        rc = os.path.join(os.path.expanduser('~'), '.pypirc')
        if os.path.exists(rc):
            self.read(rc)

    @property
    def creds_by_repository(self):
        """Map each non-empty ``repository`` value to its Credential."""
        sections_with_repositories = [
            section for section in self.sections()
            if self.get(section, 'repository').strip()
        ]

        return dict(map(self._get_repo_cred, sections_with_repositories))

    def _get_repo_cred(self, section):
        repo = self.get(section, 'repository').strip()
        return repo, Credential(
            self.get(section, 'username').strip(),
            self.get(section, 'password').strip(),
        )

    def find_credential(self, url):
        """
        If the URL indicated appears to be a repository defined in this
        config, return the credential for that repository.
        """
        for repository, cred in self.creds_by_repository.items():
            if url.startswith(repository):
                return cred


def open_with_auth(url, opener=urllib.request.urlopen):
    """Open a urllib2 request, handling HTTP authentication"""

    parsed = urllib.parse.urlparse(url)
    scheme, netloc, path, params, query, frag = parsed

    # Double scheme does not raise on macOS as revealed by a
    # failing test. We would expect "nonnumeric port". Refs #20.
    if netloc.endswith(':'):
        raise http.client.InvalidURL("nonnumeric port: ''")

    if scheme in ('http', 'https'):
        auth, address = _splituser(netloc)
    else:
        # `address` is used below whenever a .pypirc credential matches,
        # so it must be bound for every scheme.
        auth, address = None, netloc

    if not auth:
        cred = PyPIConfig().find_credential(url)
        if cred:
            auth = str(cred)
            info = cred.username, url
            log.info('Authenticating as %s for %s (from .pypirc)', *info)

    if auth:
        auth = "Basic " + _encode_auth(auth)
        parts = scheme, address, path, params, query, frag
        new_url = urllib.parse.urlunparse(parts)
        request = urllib.request.Request(new_url)
        request.add_header("Authorization", auth)
    else:
        request = urllib.request.Request(url)

    request.add_header('User-Agent', user_agent)
    fp = opener(request)

    if auth:
        # Put authentication info back into request URL if same host,
        # so that links found on the page will work
        s2, h2, path2, param2, query2, frag2 = urllib.parse.urlparse(fp.url)
        if s2 == scheme and h2 == address:
            parts = s2, netloc, path2, param2, query2, frag2
            fp.url = urllib.parse.urlunparse(parts)

    return fp


# copy of urllib.parse._splituser from Python 3.8
def _splituser(host):
    """splituser('user[:passwd]@host[:port]')
    --> 'user[:passwd]', 'host[:port]'."""
    user, delim, host = host.rpartition('@')
    return (user if delim else None), host


# adding a timeout to avoid freezing package_index
open_with_auth = socket_timeout(_SOCKET_TIMEOUT)(open_with_auth)


def fix_sf_url(url):
    """Return `url` unchanged."""
    return url  # backward compatibility


def local_open(url):
    """Read a local path, with special support for directories

    A regular file is opened through ``urllib.request.urlopen``.  A
    directory URL ending in ``/`` produces an HTML page: the directory's
    ``index.html`` if it has one, otherwise a generated list of links to
    its entries.  Anything else produces a 404 page.  Generated pages are
    returned as ``urllib.error.HTTPError`` objects.
    """
    url_path = urllib.parse.urlparse(url)[2]
    local_path = urllib.request.url2pathname(url_path)

    if os.path.isfile(local_path):
        return urllib.request.urlopen(url)

    if not (url_path.endswith('/') and os.path.isdir(local_path)):
        status, message, body = 404, "Path not found", "Not found"
    else:
        status, message = 200, "OK"
        entries = os.listdir(local_path)
        if 'index.html' in entries:
            # an index page takes precedence over a generated listing
            with open(os.path.join(local_path, 'index.html'), 'r') as fp:
                body = fp.read()
        else:
            links = []
            for entry in entries:
                # sub-directories are linked with a trailing slash
                if os.path.isdir(os.path.join(local_path, entry)):
                    entry += '/'
                links.append('<a href="{name}">{name}</a>'.format(name=entry))
            tmpl = (
                "<html><head><title>{url}</title>"
                "</head><body>{files}</body></html>")
            body = tmpl.format(url=url, files='\n'.join(links))

    return urllib.error.HTTPError(
        url, status, message, {'content-type': 'text/html'},
        io.StringIO(body),
    )