#!/usr/bin/env python3.8
"""Helper program to download PyPI packages.

Reads the package list from ``data/top-pypi-packages-365-days.json`` (the
``"rows"`` array, each row carrying a ``"project"`` name), fetches each
project's metadata from ``https://pypi.org/pypi/<project>/json`` and downloads
the file marked as the source distribution into ``data/pypi/``.

All paths are relative to the current working directory.
"""

import argparse
import os
import json

from typing import Dict, Any
from urllib.request import urlretrieve

argparser = argparse.ArgumentParser(
    prog="download_pypi_packages",
    description="Helper program to download PyPI packages",
)
argparser.add_argument(
    "-n", "--number", type=int, default=100, help="Number of packages to download"
)
argparser.add_argument(
    "-a", "--all", action="store_true", help="Download all packages listed in the json file"
)


def load_json(filename: str) -> Dict[Any, Any]:
    """Parse and return the contents of ``data/<filename>.json``.

    Args:
        filename: Base name of the file, without directory or ``.json`` suffix.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(os.path.join("data", f"{filename}.json"), "r", encoding="utf-8") as f:
        return json.load(f)


def remove_json(filename: str) -> None:
    """Delete ``data/<filename>.json``.

    Raises:
        OSError: If the file does not exist or cannot be removed.
    """
    os.remove(os.path.join("data", f"{filename}.json"))


def download_package_json(package_name: str) -> None:
    """Download the PyPI metadata of *package_name* to ``data/<package_name>.json``."""
    url = f"https://pypi.org/pypi/{package_name}/json"
    urlretrieve(url, os.path.join("data", f"{package_name}.json"))


def download_package_code(name: str, package_json: Dict[Any, Any]) -> None:
    """Download the source distribution described by *package_json*.

    The first entry of ``package_json["urls"]`` whose ``"python_version"`` is
    ``"source"`` is saved under ``data/pypi/`` using the entry's ``"filename"``.

    Args:
        name: Project name; only used in the error message.
        package_json: Parsed metadata, as returned by :func:`load_json`.

    Raises:
        IndexError: If no entry is marked as a source distribution.
        KeyError: If an expected key is missing from the metadata.
    """
    source_info = next(
        (
            url_info
            for url_info in package_json["urls"]
            if url_info["python_version"] == "source"
        ),
        None,
    )
    if source_info is None:
        # Do not fall back to an arbitrary entry (e.g. the last wheel in the
        # list): report the missing source so the caller can skip the package.
        raise IndexError(f"no source distribution listed for {name}")

    filename = source_info["filename"]
    url = source_info["url"]
    urlretrieve(url, os.path.join("data", "pypi", filename))


def main() -> None:
    """Download the packages selected on the command line.

    Raises:
        AssertionError: If ``--number`` is outside ``0..4000`` and ``--all``
            was not given.
    """
    args = argparser.parse_args()
    number_packages = args.number
    all_packages = args.all

    top_pypi_packages = load_json("top-pypi-packages-365-days")
    if all_packages:
        top_pypi_packages = top_pypi_packages["rows"]
    elif 0 <= number_packages <= 4000:
        top_pypi_packages = top_pypi_packages["rows"][:number_packages]
    else:
        raise AssertionError("Unknown value for NUMBER_OF_PACKAGES")

    os.makedirs(os.path.join("data", "pypi"), exist_ok=True)

    for package in top_pypi_packages:
        package_name = package["project"]

        # flush=True so the progress text is visible while the download runs,
        # not only once the line is completed by "Done".
        print(f"Downloading JSON Data for {package_name}... ", end="", flush=True)
        download_package_json(package_name)
        print("Done")

        try:
            # Parsed inside the try block so the temporary metadata file is
            # removed even when it cannot be decoded.
            package_json = load_json(package_name)
            print(
                f"Downloading and compressing package {package_name} ... ",
                end="",
                flush=True,
            )
            download_package_code(package_name, package_json)
            print("Done")
        except (IndexError, KeyError):
            print(f"Could not locate source for {package_name}")
            continue
        finally:
            remove_json(package_name)


if __name__ == "__main__":
    main()