• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3.8
2
3import argparse
4import os
5import json
6
7from typing import Dict, Any
8from urllib.request import urlretrieve
9
# Command-line interface: choose how many of the listed packages to fetch,
# or request all of them.
argparser = argparse.ArgumentParser(
    description="Helper program to download PyPI packages",
    prog="download_pypi_packages",
)
argparser.add_argument(
    "-n",
    "--number",
    default=100,
    type=int,
    help="Number of packages to download",
)
argparser.add_argument(
    "-a",
    "--all",
    help="Download all packages listed in the json file",
    action="store_true",
)
20
21
def load_json(filename: str) -> Dict[Any, Any]:
    """Load and return the parsed contents of ``data/<filename>.json``.

    Args:
        filename: Base name of the JSON file, without the ``.json`` suffix.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    path = os.path.join("data", f"{filename}.json")
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which can mis-decode non-ASCII metadata on some platforms.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
26
27
def remove_json(filename: str) -> None:
    """Delete the file ``data/<filename>.json``."""
    json_file = f"{filename}.json"
    os.remove(os.path.join("data", json_file))
31
32
def download_package_json(package_name: str) -> None:
    """Fetch the PyPI JSON metadata for *package_name*.

    The response is written to ``data/<package_name>.json``.
    """
    destination = os.path.join("data", f"{package_name}.json")
    urlretrieve(f"https://pypi.org/pypi/{package_name}/json", destination)
36
37
def download_package_code(name: str, package_json: Dict[Any, Any]) -> None:
    """Download the source distribution listed in *package_json*.

    The first entry of ``package_json["urls"]`` whose ``"python_version"`` is
    ``"source"`` is retrieved and stored as ``data/pypi/<filename>``.

    Args:
        name: Package name; used only in the error message.
        package_json: Decoded package metadata containing a ``"urls"`` list.

    Raises:
        IndexError: If no entry in ``"urls"`` is a source distribution.
        KeyError: If an expected key is missing from the metadata.
    """
    source_info = next(
        (
            url_info
            for url_info in package_json["urls"]
            if url_info["python_version"] == "source"
        ),
        None,
    )
    if source_info is None:
        # Do not fall back to the last listed file (usually a wheel): report
        # the missing source with an exception type callers already handle.
        raise IndexError(f"No source distribution listed for {name}")
    filename = source_info["filename"]
    url = source_info["url"]
    urlretrieve(url, os.path.join("data", "pypi", filename))
47
48
def main() -> None:
    """Download source archives for the packages listed in the top-packages file.

    Reads ``data/top-pypi-packages-365-days.json``, selects either every row
    (``--all``) or the first ``--number`` rows, and for each selected package
    downloads its JSON metadata, downloads its source archive into
    ``data/pypi``, and finally removes the temporary metadata file.

    Raises:
        AssertionError: If ``--number`` is outside ``0..4000`` and ``--all``
            was not given.
    """
    args = argparser.parse_args()
    number_packages = args.number
    all_packages = args.all

    top_pypi_packages = load_json("top-pypi-packages-365-days")
    if all_packages:
        top_pypi_packages = top_pypi_packages["rows"]
    elif 0 <= number_packages <= 4000:
        top_pypi_packages = top_pypi_packages["rows"][:number_packages]
    else:
        raise AssertionError("Unknown value for NUMBER_OF_PACKAGES")

    # Reusing an existing download directory is fine.
    os.makedirs(os.path.join("data", "pypi"), exist_ok=True)

    for package in top_pypi_packages:
        package_name = package["project"]

        # flush=True: the message has no trailing newline, so without an
        # explicit flush it may not appear until the download has finished.
        print(f"Downloading JSON Data for {package_name}... ", end="", flush=True)
        download_package_json(package_name)
        print("Done")

        package_json = load_json(package_name)
        try:
            print(
                f"Downloading and compressing package {package_name} ... ",
                end="",
                flush=True,
            )
            download_package_code(package_name, package_json)
            print("Done")
        except (IndexError, KeyError):
            print(f"Could not locate source for {package_name}")
        finally:
            # The metadata file is only needed to find the download URL.
            remove_json(package_name)
84
85
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
88