• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""PyPI and direct package downloading"""
2import sys
3import os
4import re
5import shutil
6import socket
7import base64
8import hashlib
9import itertools
10from functools import wraps
11
12from setuptools.extern import six
13from setuptools.extern.six.moves import urllib, http_client, configparser, map
14
15import setuptools
16from pkg_resources import (
17    CHECKOUT_DIST, Distribution, BINARY_DIST, normalize_path, SOURCE_DIST,
18    Environment, find_distributions, safe_name, safe_version,
19    to_filename, Requirement, DEVELOP_DIST, EGG_DIST,
20)
21from setuptools import ssl_support
22from distutils import log
23from distutils.errors import DistutilsError
24from fnmatch import translate
25from setuptools.py27compat import get_all_headers
26from setuptools.py33compat import unescape
27from setuptools.wheel import Wheel
28
# Matches an "#egg=project-version" URL fragment naming a distribution.
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.+!]+)$')
# Extracts the target of an HTML href attribute (quoted or bare).
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
# this is here to fix emacs' cruddy broken syntax highlighting
# Matches legacy PyPI "show_md5" anchors so process_index can rewrite them
# into plain "#md5=..." download fragments.
PYPI_MD5 = re.compile(
    '<a href="([^"#]+)">([^<]+)</a>\n\\s+\\(<a (?:title="MD5 hash"\n\\s+)'
    'href="[^?]+\\?:action=show_md5&amp;digest=([0-9a-f]{32})">md5</a>\\)'
)
# Matches a URL scheme prefix such as "http:" or "svn+ssh:".
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match
# Recognized source-archive extensions, tried longest-first where relevant.
EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split()

__all__ = [
    'PackageIndex', 'distros_for_url', 'parse_bdist_wininst',
    'interpret_distro_name',
]
43
# Timeout (seconds) applied to socket operations during downloads.
_SOCKET_TIMEOUT = 15

_tmpl = "setuptools/{setuptools.__version__} Python-urllib/{py_major}"
# Build "major.minor" from sys.version_info rather than slicing sys.version:
# the slice sys.version[:3] truncates "3.10.4" to "3.1" on Python 3.10+.
user_agent = _tmpl.format(
    py_major='{}.{}'.format(*sys.version_info),
    setuptools=setuptools,
)
48
49
def parse_requirement_arg(spec):
    """Coerce `spec` to a Requirement, mapping parse errors to DistutilsError."""
    try:
        req = Requirement.parse(spec)
    except ValueError:
        raise DistutilsError(
            "Not a URL, existing file, or requirement spec: %r" % (spec,)
        )
    return req
57
58
def parse_bdist_wininst(name):
    """Return (base,pyversion) or (None,None) for possible .exe name

    Recognizes bdist_wininst installer names such as
    ``pkg-1.0.win32.exe`` and ``pkg-1.0.win-amd64-py2.7.exe``.
    """
    lower = name.lower()
    if not lower.endswith('.exe'):
        return None, None, None
    if lower.endswith('.win32.exe'):
        return name[:-10], None, 'win32'
    if lower.startswith('.win32-py', -16):
        # e.g. "...win32-py2.7.exe": version sits in the last 7 chars
        return name[:-16], name[-7:-4], 'win32'
    if lower.endswith('.win-amd64.exe'):
        return name[:-14], None, 'win-amd64'
    if lower.startswith('.win-amd64-py', -20):
        return name[:-20], name[-7:-4], 'win-amd64'
    return None, None, None
81
82
def egg_info_for_url(url):
    """Return (basename, fragment) parsed out of a download URL."""
    parsed = urllib.parse.urlparse(url)
    segments = parsed.path.split('/')
    base = urllib.parse.unquote(segments[-1])
    # SourceForge mirrors end real file paths with a literal "/download"
    if parsed.netloc == 'sourceforge.net' and base == 'download':  # XXX Yuck
        base = urllib.parse.unquote(segments[-2])
    fragment = parsed.fragment
    if '#' in base:
        base, fragment = base.split('#', 1)
    return base, fragment
92
93
def distros_for_url(url, metadata=None):
    """Yield egg or source distribution objects that might be found at a URL"""
    base, fragment = egg_info_for_url(url)
    for dist in distros_for_location(url, base, metadata):
        yield dist
    if not fragment:
        return
    match = EGG_FRAGMENT.match(fragment)
    if not match:
        return
    # an "#egg=name-version" fragment marks a source-checkout link
    for dist in interpret_distro_name(
            url, match.group(1), metadata, precedence=CHECKOUT_DIST):
        yield dist
106
107
def distros_for_location(location, basename, metadata=None):
    """Yield egg or source distribution objects based on basename"""
    if basename.endswith('.egg.zip'):
        # a zipped egg: drop ".zip" and treat it as a plain egg below
        basename = basename[:-4]
    if basename.endswith('.egg') and '-' in basename:
        # an egg name carries its full identity; single interpretation
        return [Distribution.from_location(location, basename, metadata)]
    if basename.endswith('.whl') and '-' in basename:
        wheel = Wheel(basename)
        if not wheel.is_compatible():
            return []
        return [Distribution(
            location=location,
            project_name=wheel.project_name,
            version=wheel.version,
            # rank wheels above eggs
            precedence=EGG_DIST + 1,
        )]
    if basename.endswith('.exe'):
        win_base, py_ver, platform = parse_bdist_wininst(basename)
        if win_base is not None:
            return interpret_distro_name(
                location, win_base, metadata, py_ver, BINARY_DIST, platform
            )
    # finally, try the known source-archive extensions (.zip, .tgz, etc.)
    for ext in EXTENSIONS:
        if basename.endswith(ext):
            return interpret_distro_name(
                location, basename[:-len(ext)], metadata
            )
    return []  # no extension matched
139
140
def distros_for_filename(filename, metadata=None):
    """Yield possible egg or source distribution objects based on a filename"""
    location = normalize_path(filename)
    basename = os.path.basename(filename)
    return distros_for_location(location, basename, metadata)
146
147
def interpret_distro_name(
        location, basename, metadata, py_version=None, precedence=SOURCE_DIST,
        platform=None
):
    """Generate alternative interpretations of a source distro name

    Note: if `location` is a filesystem filename, you should call
    ``pkg_resources.normalize_path()`` on it before passing it to this
    routine!
    """
    # A name such as "adns-python-1.1.0" is ambiguous as to where the
    # project name ends and the version begins, so yield every possible
    # split: ("adns", "python-1.1.0"), ("adns-python", "1.1.0"),
    # ("adns-python-1.1.0", "").  Spurious interpretations compare lower
    # than any real numeric version and are therefore unlikely to match a
    # request; long term, archive names should use "safe" names/versions.
    parts = basename.split('-')
    if not py_version and any(re.match(r'py\d\.\d$', p) for p in parts[2:]):
        # a "pyX.Y" component marks a bdist_dumb, not an sdist -- bail out
        return

    for split in range(1, len(parts) + 1):
        yield Distribution(
            location, metadata,
            '-'.join(parts[:split]), '-'.join(parts[split:]),
            py_version=py_version, precedence=precedence,
            platform=platform
        )
181
182
# From Python 2.7 docs
def unique_everseen(iterable, key=None):
    "List unique elements, preserving order. Remember all elements ever seen."
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen = set()
    for element in iterable:
        # dedupe on the element itself, or on key(element) when a key is given
        marker = element if key is None else key(element)
        if marker not in seen:
            seen.add(marker)
            yield element
200
201
def unique_values(func):
    """
    Wrap a function returning an iterable such that the resulting iterable
    only ever yields unique items.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # deduplicate lazily while preserving the original yield order
        return unique_everseen(func(*args, **kwargs))

    return wrapper
213
214
# Matches an HTML tag carrying a rel= attribute, capturing the tag body and
# the rel value (used to find rel="homepage"/"download" links).
REL = re.compile(r"""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
# this line is here to fix emacs' cruddy broken syntax highlighting
217
218
@unique_values
def find_external_links(url, page):
    """Find rel="homepage" and rel="download" links in `page`, yielding URLs"""

    for rel_match in REL.finditer(page):
        tag, rel = rel_match.groups()
        rels = set(map(str.strip, rel.lower().split(',')))
        if 'homepage' in rels or 'download' in rels:
            for href in HREF.finditer(tag):
                yield urllib.parse.urljoin(url, htmldecode(href.group(1)))

    # legacy PyPI table rows: the link follows a fixed header cell
    for marker in ("<th>Home Page", "<th>Download URL"):
        pos = page.find(marker)
        if pos == -1:
            continue
        href = HREF.search(page, pos)
        if href:
            yield urllib.parse.urljoin(url, htmldecode(href.group(1)))
236
237
class ContentChecker(object):
    """
    A null content checker that defines the interface for checking content
    """

    def feed(self, block):
        """Feed a block of data to the hash (no-op for the null checker)."""

    def is_valid(self):
        """Check the hash; the null checker always validates."""
        return True

    def report(self, reporter, template):
        """
        Call reporter with information about the checker (hash name)
        substituted into the template.  The null checker reports nothing.
        """
261
262
class HashChecker(ContentChecker):
    """Validates downloaded content against a digest given in a URL fragment."""

    # e.g. "#sha256=deadbeef..." appended to a download URL
    pattern = re.compile(
        r'(?P<hash_name>sha1|sha224|sha384|sha256|sha512|md5)='
        r'(?P<expected>[a-f0-9]+)'
    )

    def __init__(self, hash_name, expected):
        self.hash_name = hash_name
        self.hash = hashlib.new(hash_name)
        self.expected = expected

    @classmethod
    def from_url(cls, url):
        "Construct a (possibly null) ContentChecker from a URL"
        fragment = urllib.parse.urlparse(url)[-1]
        match = cls.pattern.search(fragment) if fragment else None
        if match is None:
            # no recognizable digest in the URL: fall back to the null checker
            return ContentChecker()
        return cls(**match.groupdict())

    def feed(self, block):
        self.hash.update(block)

    def is_valid(self):
        return self.hash.hexdigest() == self.expected

    def report(self, reporter, template):
        return reporter(template % self.hash_name)
294
295
296class PackageIndex(Environment):
297    """A distribution index that scans web pages for download URLs"""
298
299    def __init__(
300            self, index_url="https://pypi.org/simple/", hosts=('*',),
301            ca_bundle=None, verify_ssl=True, *args, **kw
302    ):
303        Environment.__init__(self, *args, **kw)
304        self.index_url = index_url + "/" [:not index_url.endswith('/')]
305        self.scanned_urls = {}
306        self.fetched_urls = {}
307        self.package_pages = {}
308        self.allows = re.compile('|'.join(map(translate, hosts))).match
309        self.to_scan = []
310        use_ssl = (
311            verify_ssl
312            and ssl_support.is_available
313            and (ca_bundle or ssl_support.find_ca_bundle())
314        )
315        if use_ssl:
316            self.opener = ssl_support.opener_for(ca_bundle)
317        else:
318            self.opener = urllib.request.urlopen
319
    def process_url(self, url, retrieve=False):
        """Evaluate a URL as a possible download, and maybe retrieve it"""
        if url in self.scanned_urls and not retrieve:
            return
        self.scanned_urls[url] = True
        if not URL_SCHEME(url):
            # no scheme: treat it as a local file or directory
            self.process_filename(url)
            return
        else:
            dists = list(distros_for_url(url))
            if dists:
                if not self.url_ok(url):
                    return
                self.debug("Found link: %s", url)

        # a direct distribution link, a scan-only pass, or an already-fetched
        # page does not require downloading the page body
        if dists or not retrieve or url in self.fetched_urls:
            list(map(self.add, dists))
            return  # don't need the actual page

        if not self.url_ok(url):
            self.fetched_urls[url] = True
            return

        self.info("Reading %s", url)
        self.fetched_urls[url] = True  # prevent multiple fetch attempts
        tmpl = "Download error on %s: %%s -- Some packages may not be found!"
        f = self.open_url(url, tmpl % url)
        if f is None:
            return
        self.fetched_urls[f.url] = True
        if 'html' not in f.headers.get('content-type', '').lower():
            f.close()  # not html, we can't process it
            return

        base = f.url  # handle redirects
        page = f.read()
        if not isinstance(page, str):
            # In Python 3 and got bytes but want str.
            if isinstance(f, urllib.error.HTTPError):
                # Errors have no charset, assume latin1:
                charset = 'latin-1'
            else:
                charset = f.headers.get_param('charset') or 'latin-1'
            page = page.decode(charset, "ignore")
        f.close()
        # recursively scan every link found on the page
        for match in HREF.finditer(page):
            link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
            self.process_url(link)
        # only index pages that actually resolved get fed to process_index
        if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
            page = self.process_index(url, page)
370
371    def process_filename(self, fn, nested=False):
372        # process filenames or directories
373        if not os.path.exists(fn):
374            self.warn("Not found: %s", fn)
375            return
376
377        if os.path.isdir(fn) and not nested:
378            path = os.path.realpath(fn)
379            for item in os.listdir(path):
380                self.process_filename(os.path.join(path, item), True)
381
382        dists = distros_for_filename(fn)
383        if dists:
384            self.debug("Found: %s", fn)
385            list(map(self.add, dists))
386
387    def url_ok(self, url, fatal=False):
388        s = URL_SCHEME(url)
389        is_file = s and s.group(1).lower() == 'file'
390        if is_file or self.allows(urllib.parse.urlparse(url)[1]):
391            return True
392        msg = (
393            "\nNote: Bypassing %s (disallowed host; see "
394            "http://bit.ly/2hrImnY for details).\n")
395        if fatal:
396            raise DistutilsError(msg % url)
397        else:
398            self.warn(msg, url)
399
400    def scan_egg_links(self, search_path):
401        dirs = filter(os.path.isdir, search_path)
402        egg_links = (
403            (path, entry)
404            for path in dirs
405            for entry in os.listdir(path)
406            if entry.endswith('.egg-link')
407        )
408        list(itertools.starmap(self.scan_egg_link, egg_links))
409
410    def scan_egg_link(self, path, entry):
411        with open(os.path.join(path, entry)) as raw_lines:
412            # filter non-empty lines
413            lines = list(filter(None, map(str.strip, raw_lines)))
414
415        if len(lines) != 2:
416            # format is not recognized; punt
417            return
418
419        egg_path, setup_path = lines
420
421        for dist in find_distributions(os.path.join(path, egg_path)):
422            dist.location = os.path.join(path, *lines)
423            dist.precedence = SOURCE_DIST
424            self.add(dist)
425
    def process_index(self, url, page):
        """Process the contents of a PyPI page"""

        def scan(link):
            # Process a URL to see if it's for a package page
            if link.startswith(self.index_url):
                parts = list(map(
                    urllib.parse.unquote, link[len(self.index_url):].split('/')
                ))
                if len(parts) == 2 and '#' not in parts[1]:
                    # it's a package page, sanitize and index it
                    pkg = safe_name(parts[0])
                    ver = safe_version(parts[1])
                    self.package_pages.setdefault(pkg.lower(), {})[link] = True
                    return to_filename(pkg), to_filename(ver)
            return None, None

        # process an index page into the package-page index
        for match in HREF.finditer(page):
            try:
                scan(urllib.parse.urljoin(url, htmldecode(match.group(1))))
            except ValueError:
                pass

        pkg, ver = scan(url)  # ensure this page is in the page index
        if pkg:
            # process individual package page
            for new_url in find_external_links(url, page):
                # Process the found URL
                base, frag = egg_info_for_url(new_url)
                if base.endswith('.py') and not frag:
                    # a bare .py link needs an "#egg" tag to carry its version
                    if ver:
                        new_url += '#egg=%s-%s' % (pkg, ver)
                    else:
                        self.need_version_info(url)
                self.scan_url(new_url)

            # rewrite legacy PyPI md5 anchors into "#md5=..." fragments
            return PYPI_MD5.sub(
                lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2), page
            )
        else:
            return ""  # no sense double-scanning non-package pages
468
    def need_version_info(self, url):
        # A bare .py link cannot be trusted without version info; force a
        # full index scan to locate versioned distributions instead.
        self.scan_all(
            "Page at %s links to .py file(s) without version info; an index "
            "scan is required.", url
        )
474
    def scan_all(self, msg=None, *args):
        # Scan the top-level index page; `msg`/`args` optionally log a
        # warning explaining why the full scan became necessary.
        if self.index_url not in self.fetched_urls:
            if msg:
                self.warn(msg, *args)
            self.info(
                "Scanning index of all packages (this may take a while)"
            )
        self.scan_url(self.index_url)
483
484    def find_packages(self, requirement):
485        self.scan_url(self.index_url + requirement.unsafe_name + '/')
486
487        if not self.package_pages.get(requirement.key):
488            # Fall back to safe version of the name
489            self.scan_url(self.index_url + requirement.project_name + '/')
490
491        if not self.package_pages.get(requirement.key):
492            # We couldn't find the target package, so search the index page too
493            self.not_found_in_index(requirement)
494
495        for url in list(self.package_pages.get(requirement.key, ())):
496            # scan each page that might be related to the desired package
497            self.scan_url(url)
498
    def obtain(self, requirement, installer=None):
        # Environment hook: scan the index, then return the first matching
        # distribution, logging each candidate that fails to match.
        self.prescan()
        self.find_packages(requirement)
        for dist in self[requirement.key]:
            if dist in requirement:
                return dist
            self.debug("%s does not match %s", requirement, dist)
        return super(PackageIndex, self).obtain(requirement, installer)
507
508    def check_hash(self, checker, filename, tfp):
509        """
510        checker is a ContentChecker
511        """
512        checker.report(
513            self.debug,
514            "Validating %%s checksum for %s" % filename)
515        if not checker.is_valid():
516            tfp.close()
517            os.unlink(filename)
518            raise DistutilsError(
519                "%s validation failed for %s; "
520                "possible download problem?"
521                % (checker.hash.name, os.path.basename(filename))
522            )
523
524    def add_find_links(self, urls):
525        """Add `urls` to the list that will be prescanned for searches"""
526        for url in urls:
527            if (
528                self.to_scan is None  # if we have already "gone online"
529                or not URL_SCHEME(url)  # or it's a local file/directory
530                or url.startswith('file:')
531                or list(distros_for_url(url))  # or a direct package link
532            ):
533                # then go ahead and process it now
534                self.scan_url(url)
535            else:
536                # otherwise, defer retrieval till later
537                self.to_scan.append(url)
538
539    def prescan(self):
540        """Scan urls scheduled for prescanning (e.g. --find-links)"""
541        if self.to_scan:
542            list(map(self.scan_url, self.to_scan))
543        self.to_scan = None  # from now on, go ahead and process immediately
544
545    def not_found_in_index(self, requirement):
546        if self[requirement.key]:  # we've seen at least one distro
547            meth, msg = self.info, "Couldn't retrieve index page for %r"
548        else:  # no distros seen for this name, might be misspelled
549            meth, msg = (
550                self.warn,
551                "Couldn't find index page for %r (maybe misspelled?)")
552        meth(msg, requirement.unsafe_name)
553        self.scan_all()
554
555    def download(self, spec, tmpdir):
556        """Locate and/or download `spec` to `tmpdir`, returning a local path
557
558        `spec` may be a ``Requirement`` object, or a string containing a URL,
559        an existing local filename, or a project/version requirement spec
560        (i.e. the string form of a ``Requirement`` object).  If it is the URL
561        of a .py file with an unambiguous ``#egg=name-version`` tag (i.e., one
562        that escapes ``-`` as ``_`` throughout), a trivial ``setup.py`` is
563        automatically created alongside the downloaded file.
564
565        If `spec` is a ``Requirement`` object or a string containing a
566        project/version requirement spec, this method returns the location of
567        a matching distribution (possibly after downloading it to `tmpdir`).
568        If `spec` is a locally existing file or directory name, it is simply
569        returned unchanged.  If `spec` is a URL, it is downloaded to a subpath
570        of `tmpdir`, and the local filename is returned.  Various errors may be
571        raised if a problem occurs during downloading.
572        """
573        if not isinstance(spec, Requirement):
574            scheme = URL_SCHEME(spec)
575            if scheme:
576                # It's a url, download it to tmpdir
577                found = self._download_url(scheme.group(1), spec, tmpdir)
578                base, fragment = egg_info_for_url(spec)
579                if base.endswith('.py'):
580                    found = self.gen_setup(found, fragment, tmpdir)
581                return found
582            elif os.path.exists(spec):
583                # Existing file or directory, just return it
584                return spec
585            else:
586                spec = parse_requirement_arg(spec)
587        return getattr(self.fetch_distribution(spec, tmpdir), 'location', None)
588
    def fetch_distribution(
            self, requirement, tmpdir, force_scan=False, source=False,
            develop_ok=False, local_index=None):
        """Obtain a distribution suitable for fulfilling `requirement`

        `requirement` must be a ``pkg_resources.Requirement`` instance.
        If necessary, or if the `force_scan` flag is set, the requirement is
        searched for in the (online) package index as well as the locally
        installed packages.  If a distribution matching `requirement` is found,
        the returned distribution's ``location`` is the value you would have
        gotten from calling the ``download()`` method with the matching
        distribution's URL or filename.  If no matching distribution is found,
        ``None`` is returned.

        If the `source` flag is set, only source distributions and source
        checkout links will be considered.  Unless the `develop_ok` flag is
        set, development and system eggs (i.e., those using the ``.egg-info``
        format) will be ignored.
        """
        # process a Requirement
        self.info("Searching for %s", requirement)
        skipped = {}
        dist = None

        def find(req, env=None):
            if env is None:
                env = self
            # Find a matching distribution; may be called more than once

            for dist in env[req.key]:

                if dist.precedence == DEVELOP_DIST and not develop_ok:
                    # warn about each development/system egg only once
                    if dist not in skipped:
                        self.warn(
                            "Skipping development or system egg: %s", dist,
                        )
                        skipped[dist] = 1
                    continue

                test = (
                    dist in req
                    and (dist.precedence <= SOURCE_DIST or not source)
                )
                if test:
                    # only accept the candidate once its download succeeded
                    loc = self.download(dist.location, tmpdir)
                    dist.download_location = loc
                    if os.path.exists(dist.download_location):
                        return dist

        if force_scan:
            self.prescan()
            self.find_packages(requirement)
            dist = find(requirement)

        if not dist and local_index is not None:
            dist = find(requirement, local_index)

        if dist is None:
            # retry after processing any deferred --find-links URLs
            if self.to_scan is not None:
                self.prescan()
            dist = find(requirement)

        if dist is None and not force_scan:
            # last resort: scan the index pages for the package
            self.find_packages(requirement)
            dist = find(requirement)

        if dist is None:
            self.warn(
                "No local packages or working download links found for %s%s",
                (source and "a source distribution of " or ""),
                requirement,
            )
        else:
            self.info("Best match: %s", dist)
            return dist.clone(location=dist.download_location)
664
665    def fetch(self, requirement, tmpdir, force_scan=False, source=False):
666        """Obtain a file suitable for fulfilling `requirement`
667
668        DEPRECATED; use the ``fetch_distribution()`` method now instead.  For
669        backward compatibility, this routine is identical but returns the
670        ``location`` of the downloaded distribution instead of a distribution
671        object.
672        """
673        dist = self.fetch_distribution(requirement, tmpdir, force_scan, source)
674        if dist is not None:
675            return dist.location
676        return None
677
    def gen_setup(self, filename, fragment, tmpdir):
        """Generate a trivial setup.py beside a downloaded bare ``.py`` file.

        Requires `fragment` to carry an unambiguous ``#egg=name-version``
        tag; raises DistutilsError otherwise.  Returns the (possibly
        relocated) filename.
        """
        match = EGG_FRAGMENT.match(fragment)
        # keep only interpretations of the egg tag that include a version
        dists = match and [
            d for d in
            interpret_distro_name(filename, match.group(1), None) if d.version
        ] or []

        if len(dists) == 1:  # unambiguous ``#egg`` fragment
            basename = os.path.basename(filename)

            # Make sure the file has been downloaded to the temp dir.
            if os.path.dirname(filename) != tmpdir:
                dst = os.path.join(tmpdir, basename)
                from setuptools.command.easy_install import samefile
                if not samefile(filename, dst):
                    shutil.copy2(filename, dst)
                    filename = dst

            with open(os.path.join(tmpdir, 'setup.py'), 'w') as file:
                file.write(
                    "from setuptools import setup\n"
                    "setup(name=%r, version=%r, py_modules=[%r])\n"
                    % (
                        dists[0].project_name, dists[0].version,
                        os.path.splitext(basename)[0]
                    )
                )
            return filename

        elif match:
            raise DistutilsError(
                "Can't unambiguously interpret project/version identifier %r; "
                "any dashes in the name or version should be escaped using "
                "underscores. %r" % (fragment, dists)
            )
        else:
            raise DistutilsError(
                "Can't process plain .py files without an '#egg=name-version'"
                " suffix to enable automatic setup script generation."
            )
718
    # chunk size (bytes) used when streaming downloads
    dl_blocksize = 8192

    def _download_to(self, url, filename):
        """Stream `url` into `filename`, verifying any URL-fragment checksum.

        Returns the response headers.  Raises DistutilsError if the server
        answered with an HTTP error or the checksum does not match.
        """
        self.info("Downloading %s", url)
        # Download the file
        fp = None
        try:
            checker = HashChecker.from_url(url)
            fp = self.open_url(url)
            if isinstance(fp, urllib.error.HTTPError):
                raise DistutilsError(
                    "Can't download %s: %s %s" % (url, fp.code, fp.msg)
                )
            headers = fp.info()
            blocknum = 0
            bs = self.dl_blocksize
            size = -1
            if "content-length" in headers:
                # Some servers return multiple Content-Length headers :(
                sizes = get_all_headers(headers, 'Content-Length')
                size = max(map(int, sizes))
                self.reporthook(url, filename, blocknum, bs, size)
            with open(filename, 'wb') as tfp:
                while True:
                    block = fp.read(bs)
                    if block:
                        checker.feed(block)
                        tfp.write(block)
                        blocknum += 1
                        self.reporthook(url, filename, blocknum, bs, size)
                    else:
                        break
                # validate before the file handle is closed
                self.check_hash(checker, filename, tfp)
            return headers
        finally:
            if fp:
                fp.close()
756
    def reporthook(self, url, filename, blocknum, blksize, size):
        # Progress callback invoked by _download_to; subclasses may override.
        pass  # no-op
759
    def open_url(self, url, warning=None):
        """Open `url`, mapping transport failures to DistutilsError.

        If `warning` is given, failures are logged via ``self.warn(warning,
        detail)`` and None is returned instead of raising.  Note that HTTP
        error responses are returned as the HTTPError object itself, not
        raised.
        """
        if url.startswith('file:'):
            return local_open(url)
        try:
            return open_with_auth(url, self.opener)
        except (ValueError, http_client.InvalidURL) as v:
            msg = ' '.join([str(arg) for arg in v.args])
            if warning:
                self.warn(warning, msg)
            else:
                raise DistutilsError('%s %s' % (url, msg))
        except urllib.error.HTTPError as v:
            # an error response is still a response; let callers inspect it
            return v
        except urllib.error.URLError as v:
            if warning:
                self.warn(warning, v.reason)
            else:
                raise DistutilsError("Download error for %s: %s"
                                     % (url, v.reason))
        except http_client.BadStatusLine as v:
            if warning:
                self.warn(warning, v.line)
            else:
                raise DistutilsError(
                    '%s returned a bad status line. The server might be '
                    'down, %s' %
                    (url, v.line)
                )
        except (http_client.HTTPException, socket.error) as v:
            if warning:
                self.warn(warning, v)
            else:
                raise DistutilsError("Download error for %s: %s"
                                     % (url, v))
794
    def _download_url(self, scheme, url, tmpdir):
        """Download `url` (whose scheme is `scheme`) into `tmpdir`.

        Dispatches svn/git/hg URLs to the VCS helpers; otherwise performs a
        plain download.  Returns the resulting local path.
        """
        # Determine download filename
        #
        name, fragment = egg_info_for_url(url)
        if name:
            while '..' in name:
                # defend against path traversal in server-supplied names
                name = name.replace('..', '.').replace('\\', '_')
        else:
            name = "__downloaded__"  # default if URL has no path contents

        if name.endswith('.egg.zip'):
            name = name[:-4]  # strip the extra .zip before download

        filename = os.path.join(tmpdir, name)

        # Download the file
        #
        if scheme == 'svn' or scheme.startswith('svn+'):
            return self._download_svn(url, filename)
        elif scheme == 'git' or scheme.startswith('git+'):
            return self._download_git(url, filename)
        elif scheme.startswith('hg+'):
            return self._download_hg(url, filename)
        elif scheme == 'file':
            # local file URL: no download needed, just translate the path
            return urllib.request.url2pathname(urllib.parse.urlparse(url)[2])
        else:
            self.url_ok(url, True)  # raises error if not allowed
            return self._attempt_download(url, filename)
823
    def scan_url(self, url):
        """Process *url* for links/distributions.

        Delegates to process_url with a truthy second argument —
        presumably "retrieve/follow now"; confirm against process_url's
        signature (defined earlier in this file).
        """
        self.process_url(url, True)
826
827    def _attempt_download(self, url, filename):
828        headers = self._download_to(url, filename)
829        if 'html' in headers.get('content-type', '').lower():
830            return self._download_html(url, headers, filename)
831        else:
832            return filename
833
834    def _download_html(self, url, headers, filename):
835        file = open(filename)
836        for line in file:
837            if line.strip():
838                # Check for a subversion index page
839                if re.search(r'<title>([^- ]+ - )?Revision \d+:', line):
840                    # it's a subversion index page:
841                    file.close()
842                    os.unlink(filename)
843                    return self._download_svn(url, filename)
844                break  # not an index page
845        file.close()
846        os.unlink(filename)
847        raise DistutilsError("Unexpected HTML page found at " + url)
848
849    def _download_svn(self, url, filename):
850        url = url.split('#', 1)[0]  # remove any fragment for svn's sake
851        creds = ''
852        if url.lower().startswith('svn:') and '@' in url:
853            scheme, netloc, path, p, q, f = urllib.parse.urlparse(url)
854            if not netloc and path.startswith('//') and '/' in path[2:]:
855                netloc, path = path[2:].split('/', 1)
856                auth, host = urllib.parse.splituser(netloc)
857                if auth:
858                    if ':' in auth:
859                        user, pw = auth.split(':', 1)
860                        creds = " --username=%s --password=%s" % (user, pw)
861                    else:
862                        creds = " --username=" + auth
863                    netloc = host
864                    parts = scheme, netloc, url, p, q, f
865                    url = urllib.parse.urlunparse(parts)
866        self.info("Doing subversion checkout from %s to %s", url, filename)
867        os.system("svn checkout%s -q %s %s" % (creds, url, filename))
868        return filename
869
870    @staticmethod
871    def _vcs_split_rev_from_url(url, pop_prefix=False):
872        scheme, netloc, path, query, frag = urllib.parse.urlsplit(url)
873
874        scheme = scheme.split('+', 1)[-1]
875
876        # Some fragment identification fails
877        path = path.split('#', 1)[0]
878
879        rev = None
880        if '@' in path:
881            path, rev = path.rsplit('@', 1)
882
883        # Also, discard fragment
884        url = urllib.parse.urlunsplit((scheme, netloc, path, query, ''))
885
886        return url, rev
887
    def _download_git(self, url, filename):
        """Clone a git repository from *url* into *filename* and return it.

        A ``@revision`` suffix in the URL selects the revision to check out
        after cloning.  Any ``#fragment`` in *filename* is discarded.
        """
        filename = filename.split('#', 1)[0]
        url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True)

        self.info("Doing git clone from %s to %s", url, filename)
        # NOTE(review): url/filename/rev are interpolated into shell
        # commands; a crafted URL could inject shell syntax.
        os.system("git clone --quiet %s %s" % (url, filename))

        if rev is not None:
            self.info("Checking out %s", rev)
            os.system("(cd %s && git checkout --quiet %s)" % (
                filename,
                rev,
            ))

        return filename
903
    def _download_hg(self, url, filename):
        """Clone a Mercurial repository from *url* into *filename*.

        A ``@revision`` suffix in the URL selects the revision to update to
        after cloning.  Any ``#fragment`` in *filename* is discarded.
        Returns *filename*.
        """
        filename = filename.split('#', 1)[0]
        url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True)

        self.info("Doing hg clone from %s to %s", url, filename)
        # NOTE(review): url/filename/rev are interpolated into shell
        # commands; a crafted URL could inject shell syntax.
        os.system("hg clone --quiet %s %s" % (url, filename))

        if rev is not None:
            self.info("Updating to %s", rev)
            os.system("(cd %s && hg up -C -r %s -q)" % (
                filename,
                rev,
            ))

        return filename
919
    def debug(self, msg, *args):
        """Log a %-formatted message at DEBUG level via distutils' log."""
        log.debug(msg, *args)
922
    def info(self, msg, *args):
        """Log a %-formatted message at INFO level via distutils' log."""
        log.info(msg, *args)
925
    def warn(self, msg, *args):
        """Log a %-formatted message at WARN level via distutils' log."""
        log.warn(msg, *args)
928
929
# This pattern matches a character entity reference: a decimal numeric
# reference ("&#65;"), a hexadecimal numeric reference ("&#x41;"), or a
# named reference ("&amp;").  The trailing semicolon is optional.
entity_sub = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
933
934
def decode_entity(match):
    """Decode a single entity-reference regex *match* to its character.

    BUGFIX: pass the full matched text (group(0), e.g. ``"&amp;"``) to
    unescape().  The previous code passed group(1) (``"amp"``), which
    unescape() does not recognize as an entity reference, so entities were
    replaced by their bare names instead of being decoded.
    """
    return unescape(match.group(0))
938
939
def htmldecode(text):
    """Decode HTML entities in the given text.

    Every entity reference matched by ``entity_sub`` is replaced with the
    result of ``decode_entity``; all other text is returned unchanged.
    """
    return entity_sub(decode_entity, text)
943
944
def socket_timeout(timeout=15):
    """Decorator factory: run the wrapped callable with the global socket
    default timeout set to *timeout* seconds, restoring the previous value
    afterwards — even if the call raises.
    """
    def _socket_timeout(func):
        # @wraps preserves the wrapped function's name/docstring; the
        # original inner function shadowed the decorator's own name and
        # reported itself as '_socket_timeout'.
        @wraps(func)
        def wrapper(*args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(timeout)
            try:
                return func(*args, **kwargs)
            finally:
                # Always restore the previous process-wide timeout.
                socket.setdefaulttimeout(old_timeout)

        return wrapper

    return _socket_timeout
958
959
960def _encode_auth(auth):
961    """
962    A function compatible with Python 2.3-3.3 that will encode
963    auth from a URL suitable for an HTTP header.
964    >>> str(_encode_auth('username%3Apassword'))
965    'dXNlcm5hbWU6cGFzc3dvcmQ='
966
967    Long auth strings should not cause a newline to be inserted.
968    >>> long_auth = 'username:' + 'password'*10
969    >>> chr(10) in str(_encode_auth(long_auth))
970    False
971    """
972    auth_s = urllib.parse.unquote(auth)
973    # convert to bytes
974    auth_bytes = auth_s.encode()
975    # use the legacy interface for Python 2.3 support
976    encoded_bytes = base64.encodestring(auth_bytes)
977    # convert back to a string
978    encoded = encoded_bytes.decode()
979    # strip the trailing carriage return
980    return encoded.replace('\n', '')
981
982
class Credential(object):
    """
    A username/password pair that unpacks like a 2-tuple and renders as
    ``"username:password"``.
    """

    def __init__(self, username, password):
        self.username = username
        self.password = password

    def __iter__(self):
        # Support ``user, pw = cred`` unpacking, namedtuple-style.
        return iter((self.username, self.password))

    def __str__(self):
        return '%(username)s:%(password)s' % vars(self)
998
999
class PyPIConfig(configparser.RawConfigParser):
    """A parsed view of the user's ``~/.pypirc``, exposing the credentials
    configured for each repository URL."""

    def __init__(self):
        """
        Load from ~/.pypirc
        """
        defaults = dict.fromkeys(['username', 'password', 'repository'], '')
        configparser.RawConfigParser.__init__(self, defaults)

        rc = os.path.join(os.path.expanduser('~'), '.pypirc')
        if os.path.exists(rc):
            self.read(rc)

    @property
    def creds_by_repository(self):
        """Map repository URL -> Credential, for every section that
        declares a non-blank repository."""
        return dict(
            self._get_repo_cred(section)
            for section in self.sections()
            if self.get(section, 'repository').strip()
        )

    def _get_repo_cred(self, section):
        # Build one (repository, Credential) pair from a config section.
        repo = self.get(section, 'repository').strip()
        cred = Credential(
            self.get(section, 'username').strip(),
            self.get(section, 'password').strip(),
        )
        return repo, cred

    def find_credential(self, url):
        """
        If the URL indicated appears to be a repository defined in this
        config, return the credential for that repository; otherwise None.
        """
        for repository, cred in self.creds_by_repository.items():
            if url.startswith(repository):
                return cred
        return None
1036
1037
def open_with_auth(url, opener=urllib.request.urlopen):
    """Open a urllib2 request, handling HTTP authentication.

    Credentials may come from the URL itself (``user:pass@host``, HTTP(S)
    only) or, failing that, from a matching repository entry in the user's
    ``.pypirc``.  When credentials are found, they are stripped from the
    request URL and sent as a Basic ``Authorization`` header instead.
    """

    scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

    # Double scheme does not raise on Mac OS X as revealed by a
    # failing test. We would expect "nonnumeric port". Refs #20.
    if netloc.endswith(':'):
        raise http_client.InvalidURL("nonnumeric port: ''")

    if scheme in ('http', 'https'):
        auth, host = urllib.parse.splituser(netloc)
    else:
        # BUGFIX: `host` must always be bound — previously a credential
        # found in .pypirc for a non-HTTP(S) URL raised NameError below.
        auth, host = None, netloc

    if not auth:
        cred = PyPIConfig().find_credential(url)
        if cred:
            auth = str(cred)
            info = cred.username, url
            log.info('Authenticating as %s for %s (from .pypirc)', *info)

    if auth:
        auth = "Basic " + _encode_auth(auth)
        # Request the URL without the embedded credentials.
        parts = scheme, host, path, params, query, frag
        new_url = urllib.parse.urlunparse(parts)
        request = urllib.request.Request(new_url)
        request.add_header("Authorization", auth)
    else:
        request = urllib.request.Request(url)

    request.add_header('User-Agent', user_agent)
    fp = opener(request)

    if auth:
        # Put authentication info back into request URL if same host,
        # so that links found on the page will work
        s2, h2, path2, param2, query2, frag2 = urllib.parse.urlparse(fp.url)
        if s2 == scheme and h2 == host:
            parts = s2, netloc, path2, param2, query2, frag2
            fp.url = urllib.parse.urlunparse(parts)

    return fp
1081
1082
# Wrap open_with_auth so every call runs under a bounded global socket
# timeout, preventing a hung server from freezing package_index.
open_with_auth = socket_timeout(_SOCKET_TIMEOUT)(open_with_auth)
1085
1086
def fix_sf_url(url):
    """Return *url* unchanged; retained only for backward compatibility
    with callers that still expect SourceForge URL rewriting."""
    return url
1089
1090
def local_open(url):
    """Read a local path, with special support for directories.

    Plain files are served through the stdlib opener.  A directory URL
    (trailing slash) yields its ``index.html`` if present, otherwise a
    synthesized HTML listing.  Everything else becomes a 404 response.
    The non-file cases are returned as HTTPError response objects.
    """
    scheme, server, path, param, query, frag = urllib.parse.urlparse(url)
    filename = urllib.request.url2pathname(path)
    if os.path.isfile(filename):
        return urllib.request.urlopen(url)

    if path.endswith('/') and os.path.isdir(filename):
        body = None
        entries = []
        for name in os.listdir(filename):
            subpath = os.path.join(filename, name)
            if name == 'index.html':
                # An existing index page wins over the generated listing.
                with open(subpath, 'r') as fp:
                    body = fp.read()
                break
            if os.path.isdir(subpath):
                name += '/'
            entries.append('<a href="{name}">{name}</a>'.format(name=name))
        if body is None:
            tmpl = (
                "<html><head><title>{url}</title>"
                "</head><body>{files}</body></html>")
            body = tmpl.format(url=url, files='\n'.join(entries))
        status, message = 200, "OK"
    else:
        status, message, body = 404, "Path not found", "Not found"

    headers = {'content-type': 'text/html'}
    body_stream = six.StringIO(body)
    return urllib.error.HTTPError(url, status, message, headers, body_stream)
1120