#!/usr/bin/env python3
"""Scrape crates.io for regex literals passed to ``Regex::new``.

The script walks a checkout of the crates.io index, picks every crate whose
most recent index entry is not yanked and lists ``regex`` among its
dependencies, downloads and unpacks that crate, and writes one
``consistent!(...)`` line per regex literal found in its ``.rs`` files to
the output file.
"""

from subprocess import call
import argparse
import datetime
import glob
import json
import os
import re
import shutil
import sys
import tempfile
import time

import urllib3

CRATES_IO_INDEX_GIT_LOC = "https://github.com/rust-lang/crates.io-index.git"
# Captures the (optionally raw) string literal handed to `Regex::new(...)`.
RE_REGEX = re.compile(r"Regex::new\((r?\".*?\")\)")
# Crates that are skipped without being downloaded.
KNOWN_UNMAINTAINED_CRATES = {"queryst-prime", "oozz"}

# if only requests was in the standard library...
urllib3.disable_warnings()
http = urllib3.PoolManager()


def argparser():
    """Build the command-line parser for the scraper.

    Returns:
        An ``argparse.ArgumentParser`` exposing ``--crates-index`` and
        ``--output-file``.
    """
    # The text is a description; passing it positionally would set `prog`
    # and garble the usage line.
    p = argparse.ArgumentParser(
        description="A script to scrape crates.io for regex.")
    p.add_argument("-c", "--crates-index", metavar="CRATES_INDEX_DIR",
                   help=("A directory where we can find crates.io-index "
                         + "(if this isn't set it will be automatically "
                         + "downloaded)."))
    p.add_argument("-o", "--output-file", metavar="OUTPUT",
                   default="crates_regex.rs",
                   help="The name of the output file to create.")
    return p


PRELUDE = """
// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py'
// on {date}.



""".lstrip()


def main():
    """Run the scraper: parse arguments, scan crates, write the output file.

    All downloads happen inside a temporary scratch directory. The scratch
    directory is removed and the original working directory restored even
    when the run fails part-way.
    """
    args = argparser().parse_args()
    # Resolve user-supplied paths before changing directory below.
    output_path = os.path.abspath(args.output_file)
    if args.crates_index:
        args.crates_index = os.path.abspath(args.crates_index)

    old_dir = os.getcwd()
    work_dir = tempfile.mkdtemp(prefix="scrape-crates-io")
    try:
        # Rust sources are UTF-8, so the generated file is written as UTF-8.
        with open(output_path, "w", encoding="utf-8") as out:
            out.write(PRELUDE.format(date=str(datetime.datetime.now())))

            # enter our scratch directory
            os.chdir(work_dir)

            # Only clone the index when the user did not point us at one.
            crates_index = args.crates_index or download_crates_index()

            for (name, vers) in iter_crates(crates_index):
                if name in KNOWN_UNMAINTAINED_CRATES:
                    continue

                with Crate(work_dir, name, vers) as c:
                    i = 0
                    for line in c.iter_lines():
                        for r in RE_REGEX.findall(line):
                            print((name, vers, r))
                            # Skip literals whose last character before the
                            # closing quote is a backslash; presumably the
                            # non-greedy match stopped at an escaped quote.
                            if len(r) >= 2 and r[-2] == "\\":
                                continue
                            out.write("// {}-{}: {}\n".format(name, vers, r))
                            out.write("consistent!({}_{}, {});\n\n".format(
                                name.replace("-", "_"), i, r))
                            out.flush()
                            i += 1
    finally:
        # Leave the scratch directory
        os.chdir(old_dir)
        shutil.rmtree(work_dir, ignore_errors=True)


def download_crates_index():
    """Clone the crates.io index into the current working directory.

    Returns:
        The relative path of the cloned checkout (``"crates.io-index"``).

    Raises:
        SystemExit: with status 1 when ``git clone`` fails.
    """
    if call(["git", "clone", CRATES_IO_INDEX_GIT_LOC]) != 0:
        print("Error cloning the crates.io index")
        sys.exit(1)
    return "crates.io-index"


def iter_crates(crates_index):
    """Yield ``(name, version)`` for every crate of interest in the index.

    Only the last non-blank line of each index file is inspected. A crate
    is yielded when that entry lists ``regex`` among its dependencies and
    is not yanked.

    Args:
        crates_index: Path to a checkout of the crates.io index.

    Yields:
        ``(name, vers)`` tuples taken from the index entry.
    """
    exclude = {"config.json", ".git"}
    for crate_index_file in iter_files(crates_index, exclude=exclude):
        with open(crate_index_file, encoding="utf-8") as f:
            entries = [line for line in f if line.strip()]
        if not entries:
            # Nothing to parse in an empty file.
            continue

        crate_info = json.loads(entries[-1])
        if not any(d["name"] == "regex" for d in crate_info["deps"]):
            continue

        if crate_info["yanked"]:
            continue
        yield (crate_info["name"], crate_info["vers"])


def iter_files(d, exclude=frozenset()):
    """Recursively yield the absolute path of every file below ``d``.

    Args:
        d: Directory to walk.
        exclude: Entry names (files or directories) to skip at any depth.

    Yields:
        Absolute file paths. Entries of each directory are visited in
        sorted order so the traversal is deterministic.
    """
    for x in sorted(os.listdir(d)):
        if x in exclude:
            continue

        fullfp = os.path.abspath(os.path.join(d, x))
        if os.path.isfile(fullfp):
            yield fullfp
        elif os.path.isdir(fullfp):
            yield from iter_files(fullfp, exclude)


class Crate(object):
    """Context manager that downloads and unpacks one crate.

    Entering downloads the crate tarball into ``work_dir`` and extracts it
    with ``tar`` into the current working directory; the source iterators
    look for the extracted tree under ``work_dir``. Leaving removes both
    the tarball and the extracted tree.
    """

    def __init__(self, work_dir, name, version):
        """Record the crate identity and derive its URL and tarball path.

        Args:
            work_dir: Directory in which the tarball is stored.
            name: Crate name.
            version: Crate version string.
        """
        self.name = name
        self.version = version
        self.url = ("https://crates.io/api/v1/crates/{name}/{version}/download"
                    .format(name=self.name, version=self.version))
        self.filename = "{}/{}-{}.tar.gz".format(
            work_dir, self.name, self.version)

    def __enter__(self):
        """Download the tarball (with one retry) and extract it.

        A failed download is not fatal: after the final attempt the crate
        is simply left unextracted, so the source iterators yield nothing.
        """
        max_retries = 1
        attempts = max_retries + 1
        downloaded = False
        for attempt in range(1, attempts + 1):
            print("[{}/{}] Downloading {}".format(
                attempt, attempts, self.url))
            r = None
            try:
                r = http.request("GET", self.url, preload_content=False)
                with open(self.filename, "wb") as f:
                    while True:
                        data = r.read(1024)
                        if not data:
                            break
                        f.write(data)
                downloaded = True
            except Exception:
                # Best effort: back off briefly before the next attempt.
                if attempt < attempts:
                    time.sleep(1)
            finally:
                if r is not None:
                    r.release_conn()

            if downloaded:
                break

        if downloaded:
            # NOTE: the archive comes from the network and is unpacked by
            # the system `tar` into the current working directory.
            call(["tar", "-xf", self.filename])

        return self

    def __exit__(self, ty, value, tb):
        """Remove the extracted tree and the tarball, ignoring failures."""
        # We are going to clean up the whole temp dir anyway, so
        # we don't really need to do this. It's nice to clean up
        # after ourselves though. Each step is attempted independently
        # so a missing directory does not leave the tarball behind.
        shutil.rmtree(self.filename[:-len(".tar.gz")], ignore_errors=True)
        try:
            os.remove(self.filename)
        except OSError:
            pass

    def iter_srcs(self):
        """Yield the path of every ``.rs`` file in the extracted crate."""
        crate_dir = self.filename[:-len(".tar.gz")]
        # Escape the prefix so glob metacharacters in the path are literal,
        # and ask for a recursive match so `**` spans any directory depth.
        g = os.path.join(glob.escape(crate_dir), "**", "*.rs")
        for rsrc in glob.iglob(g, recursive=True):
            if os.path.isfile(rsrc):
                yield rsrc

    def iter_lines(self):
        """Yield every line of every ``.rs`` file in the extracted crate."""
        for src in self.iter_srcs():
            # Undecodable bytes are replaced rather than aborting the run.
            with open(src, encoding="utf-8", errors="replace") as f:
                for line in f:
                    yield line


if __name__ == "__main__":
    main()