#!/usr/bin/env python3.8
"""Download the source distributions of popular PyPI packages.

The list of packages is read from ``data/top-pypi-packages-365-days.json``
(its ``"rows"`` entries each carry a ``"project"`` name).  For every selected
package the PyPI JSON metadata is fetched, the source distribution it lists
is saved under ``data/pypi/``, and the temporary metadata file is removed.
"""

import argparse
import os
import json

from typing import Dict, Any
from urllib.request import urlretrieve

# Upper bound accepted for --number.
# NOTE(review): presumably the size of the top-packages list -- confirm.
MAX_PACKAGES = 4000

argparser = argparse.ArgumentParser(
    prog="download_pypi_packages",
    description="Helper program to download PyPI packages",
)
argparser.add_argument(
    "-n", "--number", type=int, default=100, help="Number of packages to download"
)
argparser.add_argument(
    "-a",
    "--all",
    action="store_true",
    help="Download all packages listed in the json file",
)


def load_json(filename: str) -> Dict[Any, Any]:
    """Return the parsed contents of ``data/<filename>.json``."""
    with open(os.path.join("data", f"{filename}.json"), "r", encoding="utf-8") as f:
        return json.load(f)


def remove_json(filename: str) -> None:
    """Delete ``data/<filename>.json``."""
    os.remove(os.path.join("data", f"{filename}.json"))


def download_package_json(package_name: str) -> None:
    """Fetch the PyPI JSON metadata of *package_name* into ``data/``."""
    url = f"https://pypi.org/pypi/{package_name}/json"
    urlretrieve(url, os.path.join("data", f"{package_name}.json"))


def find_source_release(package_json: Dict[Any, Any]) -> Dict[Any, Any]:
    """Return the first ``"urls"`` entry describing a source distribution.

    Raises:
        KeyError: if ``"urls"`` (or an entry's ``"python_version"``) is missing.
        IndexError: if no entry has ``python_version == "source"``.
    """
    for url_info in package_json["urls"]:
        if url_info["python_version"] == "source":
            return url_info
    # Falling back to index -1 would silently pick the last file (usually a
    # wheel); report the absence of an sdist explicitly instead.
    raise IndexError("no source distribution listed")


def download_package_code(name: str, package_json: Dict[Any, Any]) -> None:
    """Download the source distribution described by *package_json*.

    The file is stored as ``data/pypi/<filename>``.  *name* is accepted for
    interface compatibility and is not used.

    Raises:
        IndexError, KeyError: if the metadata lists no source distribution.
    """
    source = find_source_release(package_json)
    urlretrieve(source["url"], os.path.join("data", "pypi", source["filename"]))


def main() -> None:
    """Parse the command line and download the selected packages."""
    args = argparser.parse_args()
    number_packages = args.number
    all_packages = args.all

    top_pypi_packages = load_json("top-pypi-packages-365-days")
    if all_packages:
        top_pypi_packages = top_pypi_packages["rows"]
    elif 0 <= number_packages <= MAX_PACKAGES:
        top_pypi_packages = top_pypi_packages["rows"][:number_packages]
    else:
        raise AssertionError("Unknown value for NUMBER_OF_PACKAGES")

    os.makedirs(os.path.join("data", "pypi"), exist_ok=True)

    for package in top_pypi_packages:
        package_name = package["project"]

        # Flush so the progress text is visible while the download blocks.
        print(f"Downloading JSON Data for {package_name}... ", end="", flush=True)
        download_package_json(package_name)
        print("Done")

        try:
            # Loaded inside the try so the metadata file is removed even if
            # it cannot be parsed.
            package_json = load_json(package_name)
            print(
                f"Downloading and compressing package {package_name} ... ",
                end="",
                flush=True,
            )
            download_package_code(package_name, package_json)
            print("Done")
        except (IndexError, KeyError):
            print(f"Could not locate source for {package_name}")
        finally:
            remove_json(package_name)


if __name__ == "__main__":
    main()