# (web-viewer navigation chrome removed: Home / Line# / Scopes / Navigate / Raw / Download)
1"""PyPI and direct package downloading"""
2import sys
3import os
4import re
5import io
6import shutil
7import socket
8import base64
9import hashlib
10import itertools
11import warnings
12import configparser
13import html
14import http.client
15import urllib.parse
16import urllib.request
17import urllib.error
18from functools import wraps
19
20import setuptools
21from pkg_resources import (
22    CHECKOUT_DIST, Distribution, BINARY_DIST, normalize_path, SOURCE_DIST,
23    Environment, find_distributions, safe_name, safe_version,
24    to_filename, Requirement, DEVELOP_DIST, EGG_DIST, parse_version,
25)
26from distutils import log
27from distutils.errors import DistutilsError
28from fnmatch import translate
29from setuptools.wheel import Wheel
30from setuptools.extern.more_itertools import unique_everseen
31
32
# "#egg=name-version" URL fragments used to tag checkout links.
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.+!]+)$')
# href attribute values inside HTML tags.
HREF = re.compile(r"""href\s*=\s*['"]?([^'"> ]+)""", re.I)
# Legacy PyPI markup that pairs a download link with its MD5 digest link.
PYPI_MD5 = re.compile(
    r'<a href="([^"#]+)">([^<]+)</a>\n\s+\(<a (?:title="MD5 hash"\n\s+)'
    r'href="[^?]+\?:action=show_md5&amp;digest=([0-9a-f]{32})">md5</a>\)'
)
# Matches a URL scheme prefix such as "http:" or "git+ssh:".
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match
# Recognized source-archive suffixes, checked in this order.
EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split()

__all__ = [
    'PackageIndex', 'distros_for_url', 'parse_bdist_wininst',
    'interpret_distro_name',
]

# Default socket timeout (seconds) used for downloads.
_SOCKET_TIMEOUT = 15

# User-Agent header sent with index and download requests; the format call
# pulls __version__ off the setuptools module via attribute access.
_tmpl = "setuptools/{setuptools.__version__} Python-urllib/{py_major}"
user_agent = _tmpl.format(
    py_major='{}.{}'.format(*sys.version_info), setuptools=setuptools)
52
53
def parse_requirement_arg(spec):
    """Parse `spec` as a requirement string, raising DistutilsError if invalid."""
    try:
        req = Requirement.parse(spec)
    except ValueError as err:
        tmpl = "Not a URL, existing file, or requirement spec: %r"
        raise DistutilsError(tmpl % (spec,)) from err
    return req
61
62
def parse_bdist_wininst(name):
    """Return (base,pyversion) or (None,None) for possible .exe name"""

    lower = name.lower()
    base = py_ver = plat = None

    if lower.endswith('.exe'):
        # the four filename shapes produced by bdist_wininst
        if lower.endswith('.win32.exe'):
            base, plat = name[:-10], 'win32'
        elif lower.startswith('.win32-py', -16):
            # e.g. "...win32-py2.7.exe"; version sits just before ".exe"
            base, py_ver, plat = name[:-16], name[-7:-4], 'win32'
        elif lower.endswith('.win-amd64.exe'):
            base, plat = name[:-14], 'win-amd64'
        elif lower.startswith('.win-amd64-py', -20):
            base, py_ver, plat = name[:-20], name[-7:-4], 'win-amd64'
    return base, py_ver, plat
85
86
def egg_info_for_url(url):
    """Split a download URL into (basename, fragment)."""
    scheme, server, path, parameters, query, fragment = urllib.parse.urlparse(url)
    segments = path.split('/')
    base = urllib.parse.unquote(segments[-1])
    if server == 'sourceforge.net' and base == 'download':  # XXX Yuck
        # sourceforge URLs end in "/download"; the real name is one up
        base = urllib.parse.unquote(segments[-2])
    if '#' in base:
        # a fragment hidden inside the (quoted) basename wins
        base, fragment = base.split('#', 1)
    return base, fragment
96
97
def distros_for_url(url, metadata=None):
    """Yield egg or source distribution objects that might be found at a URL"""
    base, fragment = egg_info_for_url(url)
    yield from distros_for_location(url, base, metadata)
    if not fragment:
        return
    # an "#egg=name-version" fragment marks a source-checkout link
    match = EGG_FRAGMENT.match(fragment)
    if match:
        yield from interpret_distro_name(
            url, match.group(1), metadata, precedence=CHECKOUT_DIST
        )
110
111
def distros_for_location(location, basename, metadata=None):
    """Yield egg or source distribution objects based on basename"""
    if basename.endswith('.egg.zip'):
        # a zipped egg: treat it as the egg it contains
        basename = basename[:-4]
    is_versioned = '-' in basename
    if basename.endswith('.egg') and is_versioned:
        # an egg filename has only one, unambiguous interpretation
        return [Distribution.from_location(location, basename, metadata)]
    if basename.endswith('.whl') and is_versioned:
        wheel = Wheel(basename)
        if not wheel.is_compatible():
            return []
        dist = Distribution(
            location=location,
            project_name=wheel.project_name,
            version=wheel.version,
            precedence=EGG_DIST + 1,  # rank wheels above eggs
        )
        return [dist]
    if basename.endswith('.exe'):
        win_base, py_ver, platform = parse_bdist_wininst(basename)
        if win_base is not None:
            return interpret_distro_name(
                location, win_base, metadata, py_ver, BINARY_DIST, platform
            )
    # finally, try the source-archive extensions (.zip, .tgz, etc.)
    for ext in EXTENSIONS:
        if basename.endswith(ext):
            return interpret_distro_name(
                location, basename[:-len(ext)], metadata
            )
    return []  # no extension matched
143
144
def distros_for_filename(filename, metadata=None):
    """Yield possible egg or source distribution objects based on a filename"""
    location = normalize_path(filename)
    basename = os.path.basename(filename)
    return distros_for_location(location, basename, metadata)
150
151
def interpret_distro_name(
        location, basename, metadata, py_version=None, precedence=SOURCE_DIST,
        platform=None
):
    """Generate alternative interpretations of a source distro name

    Note: if `location` is a filesystem filename, you should call
    ``pkg_resources.normalize_path()`` on it before passing it to this
    routine!
    """
    # Some archive names are ambiguous about where the project name ends and
    # the version begins (e.g. "adns-python-1.1.0", "egenix-mx-commercial"),
    # so every possible split point is yielded ("adns" + "python-1.1.0",
    # "adns-python" + "1.1.0", "adns-python-1.1.0" + no version).  Spurious
    # interpretations compare lower than any real numeric version, so they
    # are unlikely to ever satisfy a requirement; still, safe sdist/bdist
    # naming on the index side is the real long-term fix.

    parts = basename.split('-')
    if not py_version and any(re.match(r'py\d\.\d$', p) for p in parts[2:]):
        # a "pyX.Y" token means this is a bdist_dumb, not an sdist -- bail out
        return

    for split_at in range(1, len(parts) + 1):
        yield Distribution(
            location, metadata,
            '-'.join(parts[:split_at]), '-'.join(parts[split_at:]),
            py_version=py_version, precedence=precedence, platform=platform
        )
185
186
def unique_values(func):
    """
    Decorator: wrap an iterable-returning `func` so that duplicate items are
    dropped, preserving first-seen order.
    """

    @wraps(func)
    def deduped(*args, **kwargs):
        return unique_everseen(func(*args, **kwargs))

    return deduped
198
199
# Matches "<tag ... rel='...'>", capturing the whole tag and the rel value.
REL = re.compile(r"""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
# this line is here to fix emacs' cruddy broken syntax highlighting
202
203
@unique_values
def find_external_links(url, page):
    """Find rel="homepage" and rel="download" links in `page`, yielding URLs"""

    # links declared via rel= attributes
    for match in REL.finditer(page):
        tag, rel = match.groups()
        rels = {r.strip() for r in rel.lower().split(',')}
        if rels & {'homepage', 'download'}:
            for href in HREF.finditer(tag):
                yield urllib.parse.urljoin(url, htmldecode(href.group(1)))

    # links under the old-style table headings
    for heading in ("<th>Home Page", "<th>Download URL"):
        pos = page.find(heading)
        if pos == -1:
            continue
        match = HREF.search(page, pos)
        if match:
            yield urllib.parse.urljoin(url, htmldecode(match.group(1)))
221
222
class ContentChecker:
    """
    A no-op content checker that defines the interface for checking content.
    """

    def feed(self, block):
        """
        Accept a block of data; the null checker ignores it.
        """

    def is_valid(self):
        """
        Report validity; the null checker never fails, so always True.
        """
        return True

    def report(self, reporter, template):
        """
        Do nothing; the null checker has no hash name to report.
        """
246
247
class HashChecker(ContentChecker):
    # "<algorithm>=<hexdigest>" fragments, e.g. "md5=..." or "sha256=..."
    pattern = re.compile(
        r'(?P<hash_name>sha1|sha224|sha384|sha256|sha512|md5)='
        r'(?P<expected>[a-f0-9]+)'
    )

    def __init__(self, hash_name, expected):
        self.hash_name = hash_name
        self.hash = hashlib.new(hash_name)
        self.expected = expected

    @classmethod
    def from_url(cls, url):
        "Construct a (possibly null) ContentChecker from a URL"
        fragment = urllib.parse.urlparse(url)[-1]
        match = cls.pattern.search(fragment) if fragment else None
        if match is None:
            # no recognizable digest in the URL: fall back to the null checker
            return ContentChecker()
        return cls(**match.groupdict())

    def feed(self, block):
        self.hash.update(block)

    def is_valid(self):
        return self.hash.hexdigest() == self.expected

    def report(self, reporter, template):
        return reporter(template % self.hash_name)
279
280
281class PackageIndex(Environment):
282    """A distribution index that scans web pages for download URLs"""
283
284    def __init__(
285            self, index_url="https://pypi.org/simple/", hosts=('*',),
286            ca_bundle=None, verify_ssl=True, *args, **kw
287    ):
288        super().__init__(*args, **kw)
289        self.index_url = index_url + "/" [:not index_url.endswith('/')]
290        self.scanned_urls = {}
291        self.fetched_urls = {}
292        self.package_pages = {}
293        self.allows = re.compile('|'.join(map(translate, hosts))).match
294        self.to_scan = []
295        self.opener = urllib.request.urlopen
296
297    def add(self, dist):
298        # ignore invalid versions
299        try:
300            parse_version(dist.version)
301        except Exception:
302            return
303        return super().add(dist)
304
    # FIXME: 'PackageIndex.process_url' is too complex (14)
    def process_url(self, url, retrieve=False):  # noqa: C901
        """Evaluate a URL as a possible download, and maybe retrieve it"""
        if url in self.scanned_urls and not retrieve:
            return
        self.scanned_urls[url] = True
        if not URL_SCHEME(url):
            # no scheme at all: treat as a local filename or directory
            self.process_filename(url)
            return
        else:
            dists = list(distros_for_url(url))
            if dists:
                if not self.url_ok(url):
                    return
                self.debug("Found link: %s", url)

        if dists or not retrieve or url in self.fetched_urls:
            list(map(self.add, dists))
            return  # don't need the actual page

        if not self.url_ok(url):
            self.fetched_urls[url] = True
            return

        self.info("Reading %s", url)
        self.fetched_urls[url] = True  # prevent multiple fetch attempts
        tmpl = "Download error on %s: %%s -- Some packages may not be found!"
        f = self.open_url(url, tmpl % url)
        if f is None:
            return
        if isinstance(f, urllib.error.HTTPError) and f.code == 401:
            self.info("Authentication error: %s" % f.msg)
        # also mark the final (post-redirect) URL as fetched
        self.fetched_urls[f.url] = True
        if 'html' not in f.headers.get('content-type', '').lower():
            f.close()  # not html, we can't process it
            return

        base = f.url  # handle redirects
        page = f.read()
        if not isinstance(page, str):
            # got bytes but want str: decode using the response charset
            if isinstance(f, urllib.error.HTTPError):
                # Errors have no charset, assume latin1:
                charset = 'latin-1'
            else:
                charset = f.headers.get_param('charset') or 'latin-1'
            page = page.decode(charset, "ignore")
        f.close()
        # recursively scan every link found on the page
        for match in HREF.finditer(page):
            link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
            self.process_url(link)
        # index pages get extra processing (package-page map, MD5 rewriting)
        if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
            page = self.process_index(url, page)
358
359    def process_filename(self, fn, nested=False):
360        # process filenames or directories
361        if not os.path.exists(fn):
362            self.warn("Not found: %s", fn)
363            return
364
365        if os.path.isdir(fn) and not nested:
366            path = os.path.realpath(fn)
367            for item in os.listdir(path):
368                self.process_filename(os.path.join(path, item), True)
369
370        dists = distros_for_filename(fn)
371        if dists:
372            self.debug("Found: %s", fn)
373            list(map(self.add, dists))
374
375    def url_ok(self, url, fatal=False):
376        s = URL_SCHEME(url)
377        is_file = s and s.group(1).lower() == 'file'
378        if is_file or self.allows(urllib.parse.urlparse(url)[1]):
379            return True
380        msg = (
381            "\nNote: Bypassing %s (disallowed host; see "
382            "http://bit.ly/2hrImnY for details).\n")
383        if fatal:
384            raise DistutilsError(msg % url)
385        else:
386            self.warn(msg, url)
387
388    def scan_egg_links(self, search_path):
389        dirs = filter(os.path.isdir, search_path)
390        egg_links = (
391            (path, entry)
392            for path in dirs
393            for entry in os.listdir(path)
394            if entry.endswith('.egg-link')
395        )
396        list(itertools.starmap(self.scan_egg_link, egg_links))
397
398    def scan_egg_link(self, path, entry):
399        with open(os.path.join(path, entry)) as raw_lines:
400            # filter non-empty lines
401            lines = list(filter(None, map(str.strip, raw_lines)))
402
403        if len(lines) != 2:
404            # format is not recognized; punt
405            return
406
407        egg_path, setup_path = lines
408
409        for dist in find_distributions(os.path.join(path, egg_path)):
410            dist.location = os.path.join(path, *lines)
411            dist.precedence = SOURCE_DIST
412            self.add(dist)
413
414    def _scan(self, link):
415        # Process a URL to see if it's for a package page
416        NO_MATCH_SENTINEL = None, None
417        if not link.startswith(self.index_url):
418            return NO_MATCH_SENTINEL
419
420        parts = list(map(
421            urllib.parse.unquote, link[len(self.index_url):].split('/')
422        ))
423        if len(parts) != 2 or '#' in parts[1]:
424            return NO_MATCH_SENTINEL
425
426        # it's a package page, sanitize and index it
427        pkg = safe_name(parts[0])
428        ver = safe_version(parts[1])
429        self.package_pages.setdefault(pkg.lower(), {})[link] = True
430        return to_filename(pkg), to_filename(ver)
431
432    def process_index(self, url, page):
433        """Process the contents of a PyPI page"""
434
435        # process an index page into the package-page index
436        for match in HREF.finditer(page):
437            try:
438                self._scan(urllib.parse.urljoin(url, htmldecode(match.group(1))))
439            except ValueError:
440                pass
441
442        pkg, ver = self._scan(url)  # ensure this page is in the page index
443        if not pkg:
444            return ""  # no sense double-scanning non-package pages
445
446        # process individual package page
447        for new_url in find_external_links(url, page):
448            # Process the found URL
449            base, frag = egg_info_for_url(new_url)
450            if base.endswith('.py') and not frag:
451                if ver:
452                    new_url += '#egg=%s-%s' % (pkg, ver)
453                else:
454                    self.need_version_info(url)
455            self.scan_url(new_url)
456
457        return PYPI_MD5.sub(
458            lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2), page
459        )
460
461    def need_version_info(self, url):
462        self.scan_all(
463            "Page at %s links to .py file(s) without version info; an index "
464            "scan is required.", url
465        )
466
467    def scan_all(self, msg=None, *args):
468        if self.index_url not in self.fetched_urls:
469            if msg:
470                self.warn(msg, *args)
471            self.info(
472                "Scanning index of all packages (this may take a while)"
473            )
474        self.scan_url(self.index_url)
475
476    def find_packages(self, requirement):
477        self.scan_url(self.index_url + requirement.unsafe_name + '/')
478
479        if not self.package_pages.get(requirement.key):
480            # Fall back to safe version of the name
481            self.scan_url(self.index_url + requirement.project_name + '/')
482
483        if not self.package_pages.get(requirement.key):
484            # We couldn't find the target package, so search the index page too
485            self.not_found_in_index(requirement)
486
487        for url in list(self.package_pages.get(requirement.key, ())):
488            # scan each page that might be related to the desired package
489            self.scan_url(url)
490
491    def obtain(self, requirement, installer=None):
492        self.prescan()
493        self.find_packages(requirement)
494        for dist in self[requirement.key]:
495            if dist in requirement:
496                return dist
497            self.debug("%s does not match %s", requirement, dist)
498        return super(PackageIndex, self).obtain(requirement, installer)
499
500    def check_hash(self, checker, filename, tfp):
501        """
502        checker is a ContentChecker
503        """
504        checker.report(
505            self.debug,
506            "Validating %%s checksum for %s" % filename)
507        if not checker.is_valid():
508            tfp.close()
509            os.unlink(filename)
510            raise DistutilsError(
511                "%s validation failed for %s; "
512                "possible download problem?"
513                % (checker.hash.name, os.path.basename(filename))
514            )
515
516    def add_find_links(self, urls):
517        """Add `urls` to the list that will be prescanned for searches"""
518        for url in urls:
519            if (
520                self.to_scan is None  # if we have already "gone online"
521                or not URL_SCHEME(url)  # or it's a local file/directory
522                or url.startswith('file:')
523                or list(distros_for_url(url))  # or a direct package link
524            ):
525                # then go ahead and process it now
526                self.scan_url(url)
527            else:
528                # otherwise, defer retrieval till later
529                self.to_scan.append(url)
530
531    def prescan(self):
532        """Scan urls scheduled for prescanning (e.g. --find-links)"""
533        if self.to_scan:
534            list(map(self.scan_url, self.to_scan))
535        self.to_scan = None  # from now on, go ahead and process immediately
536
537    def not_found_in_index(self, requirement):
538        if self[requirement.key]:  # we've seen at least one distro
539            meth, msg = self.info, "Couldn't retrieve index page for %r"
540        else:  # no distros seen for this name, might be misspelled
541            meth, msg = (
542                self.warn,
543                "Couldn't find index page for %r (maybe misspelled?)")
544        meth(msg, requirement.unsafe_name)
545        self.scan_all()
546
547    def download(self, spec, tmpdir):
548        """Locate and/or download `spec` to `tmpdir`, returning a local path
549
550        `spec` may be a ``Requirement`` object, or a string containing a URL,
551        an existing local filename, or a project/version requirement spec
552        (i.e. the string form of a ``Requirement`` object).  If it is the URL
553        of a .py file with an unambiguous ``#egg=name-version`` tag (i.e., one
554        that escapes ``-`` as ``_`` throughout), a trivial ``setup.py`` is
555        automatically created alongside the downloaded file.
556
557        If `spec` is a ``Requirement`` object or a string containing a
558        project/version requirement spec, this method returns the location of
559        a matching distribution (possibly after downloading it to `tmpdir`).
560        If `spec` is a locally existing file or directory name, it is simply
561        returned unchanged.  If `spec` is a URL, it is downloaded to a subpath
562        of `tmpdir`, and the local filename is returned.  Various errors may be
563        raised if a problem occurs during downloading.
564        """
565        if not isinstance(spec, Requirement):
566            scheme = URL_SCHEME(spec)
567            if scheme:
568                # It's a url, download it to tmpdir
569                found = self._download_url(scheme.group(1), spec, tmpdir)
570                base, fragment = egg_info_for_url(spec)
571                if base.endswith('.py'):
572                    found = self.gen_setup(found, fragment, tmpdir)
573                return found
574            elif os.path.exists(spec):
575                # Existing file or directory, just return it
576                return spec
577            else:
578                spec = parse_requirement_arg(spec)
579        return getattr(self.fetch_distribution(spec, tmpdir), 'location', None)
580
    def fetch_distribution(  # noqa: C901  # is too complex (14)  # FIXME
            self, requirement, tmpdir, force_scan=False, source=False,
            develop_ok=False, local_index=None):
        """Obtain a distribution suitable for fulfilling `requirement`

        `requirement` must be a ``pkg_resources.Requirement`` instance.
        If necessary, or if the `force_scan` flag is set, the requirement is
        searched for in the (online) package index as well as the locally
        installed packages.  If a distribution matching `requirement` is found,
        the returned distribution's ``location`` is the value you would have
        gotten from calling the ``download()`` method with the matching
        distribution's URL or filename.  If no matching distribution is found,
        ``None`` is returned.

        If the `source` flag is set, only source distributions and source
        checkout links will be considered.  Unless the `develop_ok` flag is
        set, development and system eggs (i.e., those using the ``.egg-info``
        format) will be ignored.
        """
        # process a Requirement
        self.info("Searching for %s", requirement)
        skipped = {}  # dists already warned about, so each is reported once
        dist = None

        def find(req, env=None):
            if env is None:
                env = self
            # Find a matching distribution; may be called more than once

            for dist in env[req.key]:

                if dist.precedence == DEVELOP_DIST and not develop_ok:
                    if dist not in skipped:
                        self.warn(
                            "Skipping development or system egg: %s", dist,
                        )
                        skipped[dist] = 1
                    continue

                # accept only matches; binary dists are excluded when
                # `source` is requested
                test = (
                    dist in req
                    and (dist.precedence <= SOURCE_DIST or not source)
                )
                if test:
                    loc = self.download(dist.location, tmpdir)
                    dist.download_location = loc
                    if os.path.exists(dist.download_location):
                        return dist

        # Phase 1: forced online scan, if requested
        if force_scan:
            self.prescan()
            self.find_packages(requirement)
            dist = find(requirement)

        # Phase 2: the caller-supplied local index, if any
        if not dist and local_index is not None:
            dist = find(requirement, local_index)

        # Phase 3: whatever we already know, after any pending prescan
        if dist is None:
            if self.to_scan is not None:
                self.prescan()
            dist = find(requirement)

        # Phase 4: go online as a last resort (unless already done above)
        if dist is None and not force_scan:
            self.find_packages(requirement)
            dist = find(requirement)

        if dist is None:
            self.warn(
                "No local packages or working download links found for %s%s",
                (source and "a source distribution of " or ""),
                requirement,
            )
        else:
            self.info("Best match: %s", dist)
            return dist.clone(location=dist.download_location)
656
657    def fetch(self, requirement, tmpdir, force_scan=False, source=False):
658        """Obtain a file suitable for fulfilling `requirement`
659
660        DEPRECATED; use the ``fetch_distribution()`` method now instead.  For
661        backward compatibility, this routine is identical but returns the
662        ``location`` of the downloaded distribution instead of a distribution
663        object.
664        """
665        dist = self.fetch_distribution(requirement, tmpdir, force_scan, source)
666        if dist is not None:
667            return dist.location
668        return None
669
670    def gen_setup(self, filename, fragment, tmpdir):
671        match = EGG_FRAGMENT.match(fragment)
672        dists = match and [
673            d for d in
674            interpret_distro_name(filename, match.group(1), None) if d.version
675        ] or []
676
677        if len(dists) == 1:  # unambiguous ``#egg`` fragment
678            basename = os.path.basename(filename)
679
680            # Make sure the file has been downloaded to the temp dir.
681            if os.path.dirname(filename) != tmpdir:
682                dst = os.path.join(tmpdir, basename)
683                if not (os.path.exists(dst) and os.path.samefile(filename, dst)):
684                    shutil.copy2(filename, dst)
685                    filename = dst
686
687            with open(os.path.join(tmpdir, 'setup.py'), 'w') as file:
688                file.write(
689                    "from setuptools import setup\n"
690                    "setup(name=%r, version=%r, py_modules=[%r])\n"
691                    % (
692                        dists[0].project_name, dists[0].version,
693                        os.path.splitext(basename)[0]
694                    )
695                )
696            return filename
697
698        elif match:
699            raise DistutilsError(
700                "Can't unambiguously interpret project/version identifier %r; "
701                "any dashes in the name or version should be escaped using "
702                "underscores. %r" % (fragment, dists)
703            )
704        else:
705            raise DistutilsError(
706                "Can't process plain .py files without an '#egg=name-version'"
707                " suffix to enable automatic setup script generation."
708            )
709
    # size of the chunks read per iteration while streaming a download
    dl_blocksize = 8192

    def _download_to(self, url, filename):
        """Stream `url` into `filename`, verifying any checksum in the URL.

        Returns the response headers; raises DistutilsError on an HTTP error
        response or (via check_hash) on checksum mismatch.
        """
        self.info("Downloading %s", url)
        # Download the file
        fp = None
        try:
            # the URL fragment may carry an "algo=digest" to verify against
            checker = HashChecker.from_url(url)
            fp = self.open_url(url)
            if isinstance(fp, urllib.error.HTTPError):
                raise DistutilsError(
                    "Can't download %s: %s %s" % (url, fp.code, fp.msg)
                )
            headers = fp.info()
            blocknum = 0
            bs = self.dl_blocksize
            size = -1
            if "content-length" in headers:
                # Some servers return multiple Content-Length headers :(
                sizes = headers.get_all('Content-Length')
                size = max(map(int, sizes))
                self.reporthook(url, filename, blocknum, bs, size)
            with open(filename, 'wb') as tfp:
                while True:
                    block = fp.read(bs)
                    if block:
                        checker.feed(block)
                        tfp.write(block)
                        blocknum += 1
                        self.reporthook(url, filename, blocknum, bs, size)
                    else:
                        break
                # validate while the file handle is still open
                self.check_hash(checker, filename, tfp)
            return headers
        finally:
            if fp:
                fp.close()
747
748    def reporthook(self, url, filename, blocknum, blksize, size):
749        pass  # no-op
750
    # FIXME:
    def open_url(self, url, warning=None):  # noqa: C901  # is too complex (12)
        """Open `url`, returning a file-like response object.

        ``file:`` URLs are opened locally.  On failure: if `warning` (a
        format string) is given, the problem is logged and None is returned;
        otherwise a DistutilsError is raised.  HTTP error responses are
        returned as-is so callers can inspect their status code.
        """
        if url.startswith('file:'):
            return local_open(url)
        try:
            return open_with_auth(url, self.opener)
        except (ValueError, http.client.InvalidURL) as v:
            # malformed URL
            msg = ' '.join([str(arg) for arg in v.args])
            if warning:
                self.warn(warning, msg)
            else:
                raise DistutilsError('%s %s' % (url, msg)) from v
        except urllib.error.HTTPError as v:
            # hand the error response back; callers test for HTTPError
            return v
        except urllib.error.URLError as v:
            if warning:
                self.warn(warning, v.reason)
            else:
                raise DistutilsError("Download error for %s: %s"
                                     % (url, v.reason)) from v
        except http.client.BadStatusLine as v:
            if warning:
                self.warn(warning, v.line)
            else:
                raise DistutilsError(
                    '%s returned a bad status line. The server might be '
                    'down, %s' %
                    (url, v.line)
                ) from v
        except (http.client.HTTPException, socket.error) as v:
            if warning:
                self.warn(warning, v)
            else:
                raise DistutilsError("Download error for %s: %s"
                                     % (url, v)) from v
786
787    def _download_url(self, scheme, url, tmpdir):
788        # Determine download filename
789        #
790        name, fragment = egg_info_for_url(url)
791        if name:
792            while '..' in name:
793                name = name.replace('..', '.').replace('\\', '_')
794        else:
795            name = "__downloaded__"  # default if URL has no path contents
796
797        if name.endswith('.egg.zip'):
798            name = name[:-4]  # strip the extra .zip before download
799
800        filename = os.path.join(tmpdir, name)
801
802        # Download the file
803        #
804        if scheme == 'svn' or scheme.startswith('svn+'):
805            return self._download_svn(url, filename)
806        elif scheme == 'git' or scheme.startswith('git+'):
807            return self._download_git(url, filename)
808        elif scheme.startswith('hg+'):
809            return self._download_hg(url, filename)
810        elif scheme == 'file':
811            return urllib.request.url2pathname(urllib.parse.urlparse(url)[2])
812        else:
813            self.url_ok(url, True)  # raises error if not allowed
814            return self._attempt_download(url, filename)
815
    def scan_url(self, url):
        """Scan *url* for links by delegating to ``process_url`` with its
        second flag set (NOTE(review): presumably "retrieve the page";
        confirm against process_url's signature, outside this view)."""
        self.process_url(url, True)
818
819    def _attempt_download(self, url, filename):
820        headers = self._download_to(url, filename)
821        if 'html' in headers.get('content-type', '').lower():
822            return self._download_html(url, headers, filename)
823        else:
824            return filename
825
826    def _download_html(self, url, headers, filename):
827        file = open(filename)
828        for line in file:
829            if line.strip():
830                # Check for a subversion index page
831                if re.search(r'<title>([^- ]+ - )?Revision \d+:', line):
832                    # it's a subversion index page:
833                    file.close()
834                    os.unlink(filename)
835                    return self._download_svn(url, filename)
836                break  # not an index page
837        file.close()
838        os.unlink(filename)
839        raise DistutilsError("Unexpected HTML page found at " + url)
840
841    def _download_svn(self, url, filename):
842        warnings.warn("SVN download support is deprecated", UserWarning)
843        url = url.split('#', 1)[0]  # remove any fragment for svn's sake
844        creds = ''
845        if url.lower().startswith('svn:') and '@' in url:
846            scheme, netloc, path, p, q, f = urllib.parse.urlparse(url)
847            if not netloc and path.startswith('//') and '/' in path[2:]:
848                netloc, path = path[2:].split('/', 1)
849                auth, host = _splituser(netloc)
850                if auth:
851                    if ':' in auth:
852                        user, pw = auth.split(':', 1)
853                        creds = " --username=%s --password=%s" % (user, pw)
854                    else:
855                        creds = " --username=" + auth
856                    netloc = host
857                    parts = scheme, netloc, url, p, q, f
858                    url = urllib.parse.urlunparse(parts)
859        self.info("Doing subversion checkout from %s to %s", url, filename)
860        os.system("svn checkout%s -q %s %s" % (creds, url, filename))
861        return filename
862
863    @staticmethod
864    def _vcs_split_rev_from_url(url, pop_prefix=False):
865        scheme, netloc, path, query, frag = urllib.parse.urlsplit(url)
866
867        scheme = scheme.split('+', 1)[-1]
868
869        # Some fragment identification fails
870        path = path.split('#', 1)[0]
871
872        rev = None
873        if '@' in path:
874            path, rev = path.rsplit('@', 1)
875
876        # Also, discard fragment
877        url = urllib.parse.urlunsplit((scheme, netloc, path, query, ''))
878
879        return url, rev
880
881    def _download_git(self, url, filename):
882        filename = filename.split('#', 1)[0]
883        url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True)
884
885        self.info("Doing git clone from %s to %s", url, filename)
886        os.system("git clone --quiet %s %s" % (url, filename))
887
888        if rev is not None:
889            self.info("Checking out %s", rev)
890            os.system("git -C %s checkout --quiet %s" % (
891                filename,
892                rev,
893            ))
894
895        return filename
896
897    def _download_hg(self, url, filename):
898        filename = filename.split('#', 1)[0]
899        url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True)
900
901        self.info("Doing hg clone from %s to %s", url, filename)
902        os.system("hg clone --quiet %s %s" % (url, filename))
903
904        if rev is not None:
905            self.info("Updating to %s", rev)
906            os.system("hg --cwd %s up -C -r %s -q" % (
907                filename,
908                rev,
909            ))
910
911        return filename
912
    def debug(self, msg, *args):
        """Log *msg* (with lazy %-style *args*) via distutils.log at DEBUG level."""
        log.debug(msg, *args)
915
    def info(self, msg, *args):
        """Log *msg* (with lazy %-style *args*) via distutils.log at INFO level."""
        log.info(msg, *args)
918
    def warn(self, msg, *args):
        """Log *msg* (with lazy %-style *args*) via distutils.log at WARN level."""
        log.warn(msg, *args)
921
922
# This pattern matches a character entity reference: a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference.
# The trailing ';' is optional, tolerating lenient real-world HTML.
entity_sub = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
926
927
def decode_entity(match):
    """Return the unescaped text for a matched HTML entity reference."""
    return html.unescape(match.group(0))
931
932
def htmldecode(text):
    """
    Decode HTML entities in the given text.

    >>> htmldecode(
    ...     'https://../package_name-0.1.2.tar.gz'
    ...     '?tokena=A&amp;tokenb=B">package_name-0.1.2.tar.gz')
    'https://../package_name-0.1.2.tar.gz?tokena=A&tokenb=B">package_name-0.1.2.tar.gz'
    """
    # Replace each entity reference matched by ``entity_sub`` with its
    # unescaped text; everything between matches passes through unchanged.
    return entity_sub(decode_entity, text)
943
944
def socket_timeout(timeout=15):
    """Decorator factory: run the wrapped callable with the global socket
    default timeout set to *timeout* seconds.

    The previous default timeout is restored afterwards, even if the
    wrapped callable raises.
    """
    def decorator(func):
        # Fix: preserve the wrapped function's metadata; ``wraps`` is
        # already imported at the top of this module but was unused.
        @wraps(func)
        def wrapper(*args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(timeout)
            try:
                return func(*args, **kwargs)
            finally:
                # Always restore, even when func raised.
                socket.setdefaulttimeout(old_timeout)

        return wrapper

    return decorator
958
959
960def _encode_auth(auth):
961    """
962    Encode auth from a URL suitable for an HTTP header.
963    >>> str(_encode_auth('username%3Apassword'))
964    'dXNlcm5hbWU6cGFzc3dvcmQ='
965
966    Long auth strings should not cause a newline to be inserted.
967    >>> long_auth = 'username:' + 'password'*10
968    >>> chr(10) in str(_encode_auth(long_auth))
969    False
970    """
971    auth_s = urllib.parse.unquote(auth)
972    # convert to bytes
973    auth_bytes = auth_s.encode()
974    encoded_bytes = base64.b64encode(auth_bytes)
975    # convert back to a string
976    encoded = encoded_bytes.decode()
977    # strip the trailing carriage return
978    return encoded.replace('\n', '')
979
980
class Credential:
    """
    A username/password pair. Use like a namedtuple.
    """

    def __init__(self, username, password):
        self.username = username
        self.password = password

    def __iter__(self):
        # Unpackable as (username, password), namedtuple-style.
        return iter((self.username, self.password))

    def __str__(self):
        return '%(username)s:%(password)s' % vars(self)
996
997
class PyPIConfig(configparser.RawConfigParser):
    """Repository credentials loaded from the user's ``.pypirc``."""

    def __init__(self):
        """
        Load from ~/.pypirc
        """
        # Empty-string fallbacks so .get() never raises for our keys.
        defaults = dict.fromkeys(['username', 'password', 'repository'], '')
        super().__init__(defaults)

        rc = os.path.join(os.path.expanduser('~'), '.pypirc')
        if os.path.exists(rc):
            self.read(rc)

    @property
    def creds_by_repository(self):
        """Mapping of repository URL -> Credential for every section
        that declares a non-blank repository."""
        return dict(
            self._get_repo_cred(section)
            for section in self.sections()
            if self.get(section, 'repository').strip()
        )

    def _get_repo_cred(self, section):
        """Return the (repository, Credential) pair for *section*."""
        repo = self.get(section, 'repository').strip()
        cred = Credential(
            self.get(section, 'username').strip(),
            self.get(section, 'password').strip(),
        )
        return repo, cred

    def find_credential(self, url):
        """
        If the URL indicated appears to be a repository defined in this
        config, return the credential for that repository.
        """
        for repository, cred in self.creds_by_repository.items():
            if url.startswith(repository):
                return cred
1034
1035
def open_with_auth(url, opener=urllib.request.urlopen):
    """Open a urllib2 request, handling HTTP authentication.

    Credentials are taken from the URL's ``user[:pass]@`` component or,
    failing that, from a matching repository entry in ``~/.pypirc``.
    When auth is used, it is re-inserted into the response URL (same
    scheme and host only) so links found on the page keep working.
    """

    parsed = urllib.parse.urlparse(url)
    scheme, netloc, path, params, query, frag = parsed

    # Double scheme does not raise on macOS as revealed by a
    # failing test. We would expect "nonnumeric port". Refs #20.
    if netloc.endswith(':'):
        raise http.client.InvalidURL("nonnumeric port: ''")

    if scheme in ('http', 'https'):
        auth, address = _splituser(netloc)
    else:
        # Fix: 'address' was previously left unbound on this branch,
        # raising NameError below if a .pypirc credential matched a
        # non-HTTP(S) URL.
        auth, address = None, netloc

    if not auth:
        cred = PyPIConfig().find_credential(url)
        if cred:
            auth = str(cred)
            info = cred.username, url
            log.info('Authenticating as %s for %s (from .pypirc)', *info)

    if auth:
        auth = "Basic " + _encode_auth(auth)
        # Rebuild the URL without the user:pass@ part before requesting.
        parts = scheme, address, path, params, query, frag
        new_url = urllib.parse.urlunparse(parts)
        request = urllib.request.Request(new_url)
        request.add_header("Authorization", auth)
    else:
        request = urllib.request.Request(url)

    request.add_header('User-Agent', user_agent)
    fp = opener(request)

    if auth:
        # Put authentication info back into request URL if same host,
        # so that links found on the page will work
        s2, h2, path2, param2, query2, frag2 = urllib.parse.urlparse(fp.url)
        if s2 == scheme and h2 == address:
            parts = s2, netloc, path2, param2, query2, frag2
            fp.url = urllib.parse.urlunparse(parts)

    return fp
1080
1081
1082# copy of urllib.parse._splituser from Python 3.8
1083def _splituser(host):
1084    """splituser('user[:passwd]@host[:port]')
1085    --> 'user[:passwd]', 'host[:port]'."""
1086    user, delim, host = host.rpartition('@')
1087    return (user if delim else None), host
1088
1089
# Rebind open_with_auth so every call runs under the module-wide socket
# default timeout (_SOCKET_TIMEOUT), avoiding indefinite hangs that would
# freeze package_index.
open_with_auth = socket_timeout(_SOCKET_TIMEOUT)(open_with_auth)
1092
1093
def fix_sf_url(url):
    """Return *url* unchanged; retained only so old callers keep working."""
    return url  # backward compatibility
1096
1097
def local_open(url):
    """Read a local path, with special support for directories"""
    scheme, server, path, param, query, frag = urllib.parse.urlparse(url)
    filename = urllib.request.url2pathname(path)

    if os.path.isfile(filename):
        # Plain file: let urllib serve it directly.
        return urllib.request.urlopen(url)

    if path.endswith('/') and os.path.isdir(filename):
        status, message = 200, "OK"
        body = None
        links = []
        for entry in os.listdir(filename):
            entry_path = os.path.join(filename, entry)
            if entry == 'index.html':
                # An explicit index page wins over the generated listing.
                with open(entry_path, 'r') as fp:
                    body = fp.read()
                break
            if os.path.isdir(entry_path):
                entry += '/'
            links.append('<a href="{name}">{name}</a>'.format(name=entry))
        if body is None:
            # No index.html found: synthesize a listing page.
            tmpl = (
                "<html><head><title>{url}</title>"
                "</head><body>{files}</body></html>")
            body = tmpl.format(url=url, files='\n'.join(links))
    else:
        status, message, body = 404, "Path not found", "Not found"

    headers = {'content-type': 'text/html'}
    body_stream = io.StringIO(body)
    return urllib.error.HTTPError(url, status, message, headers, body_stream)
1127