• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2
3from subprocess import call
4import argparse
5import datetime
6import glob
7import json
8import os
9import re
10import shutil
11import tempfile
12import time
13import urllib3
14
# Git remote of the crates.io registry index (one JSON line per release).
CRATES_IO_INDEX_GIT_LOC = "https://github.com/rust-lang/crates.io-index.git"
# Captures the (possibly raw) string literal inside a Regex::new(...) call.
RE_REGEX = re.compile(r"Regex::new\((r?\".*?\")\)")
# Crates known to be abandoned/broken; the scraper skips them.
# (idiom fix: set literal instead of set([...]))
KNOWN_UNMAINTAINED_CRATES = {"queryst-prime", "oozz"}
18
# if only requests was in the standard library...
# Silence urllib3's TLS warnings and build one shared connection pool;
# `http` is used by Crate.__enter__ to download every crate tarball.
urllib3.disable_warnings()
http = urllib3.PoolManager()
22
23
def argparser():
    """Build and return the command-line parser for this script.

    Options:
      -c/--crates-index: existing checkout of crates.io-index (optional;
                         cloned automatically when omitted).
      -o/--output-file:  file to write the generated Rust source to.
    """
    # BUG FIX: the description used to be passed positionally, which
    # argparse interprets as `prog` (the program name shown in usage),
    # not as the help description.
    p = argparse.ArgumentParser(
        description="A script to scrape crates.io for regex.")
    p.add_argument("-c", "--crates-index", metavar="CRATES_INDEX_DIR",
                   help=("A directory where we can find crates.io-index "
                         "(if this isn't set it will be automatically "
                         "downloaded)."))
    p.add_argument("-o", "--output-file", metavar="OUTPUT",
                   default="crates_regex.rs",
                   help="The name of the output file to create.")
    return p
34
35
# Header stamped at the top of the generated file: a do-not-edit warning
# plus the generation date, followed by three blank lines.
PRELUDE = (
    "// DO NOT EDIT. Automatically generated by "
    "'scripts/scrape_crates_io.py'\n"
    "// on {date}.\n"
    "\n"
    "\n"
    "\n"
)
43
44
def main():
    """Scrape the newest release of every crate depending on `regex`,
    extract its `Regex::new(...)` string literals, and write them to the
    output file as `consistent!(...)` macro invocations."""
    args = argparser().parse_args()
    out = open(os.path.abspath(args.output_file), "w")
    out.write(PRELUDE.format(date=str(datetime.datetime.now())))
    if args.crates_index:
        # Resolve to an absolute path before we chdir below.
        args.crates_index = os.path.abspath(args.crates_index)

    # enter our scratch directory
    old_dir = os.getcwd()
    work_dir = tempfile.mkdtemp(prefix="scrape-crates-io")
    os.chdir(work_dir)

    try:
        # Use the provided index if given, otherwise clone a fresh one.
        # BUG FIX: the old condition was os.path.join(old_dir,
        # args.crates_index), which raised TypeError whenever
        # -c/--crates-index was omitted (joining with None) and was
        # always truthy otherwise.
        crates_index = (args.crates_index
                        if args.crates_index
                        else download_crates_index())

        for (name, vers) in iter_crates(crates_index):
            if name in KNOWN_UNMAINTAINED_CRATES:
                continue

            with Crate(work_dir, name, vers) as c:
                i = 0
                for line in c.iter_lines():
                    for r in RE_REGEX.findall(line):
                        print((name, vers, r))
                        # Skip literals whose last character before the
                        # closing quote is a backslash; they don't survive
                        # re-quoting inside the macro.
                        if len(r) >= 2 and r[-2] == "\\":
                            continue
                        out.write("// {}-{}: {}\n".format(name, vers, r))
                        out.write("consistent!({}_{}, {});\n\n".format(
                                    name.replace("-", "_"), i, r))
                        out.flush()
                        i += 1
    finally:
        # Leave the scratch directory and clean up even if scraping fails.
        os.chdir(old_dir)
        shutil.rmtree(work_dir)
        out.close()
82
83
def download_crates_index():
    """Clone the crates.io registry index into the current directory and
    return the name of the checkout; abort the script if cloning fails."""
    status = call(["git", "clone", CRATES_IO_INDEX_GIT_LOC])
    if status != 0:
        print("Error cloning the crates.io index")
        exit(1)
    return "crates.io-index"
89
90
def iter_crates(crates_index):
    """Yield (name, version) for the most recent, non-yanked release of
    every crate in `crates_index` that depends on `regex`.

    Each index file holds one JSON object per line, oldest release first,
    so the last line is the most recent release.
    """
    exclude = {"config.json", ".git"}
    for crate_index_file in iter_files(crates_index, exclude=exclude):
        with open(crate_index_file) as f:
            lines = f.readlines()
        # Robustness: a truncated/empty index file used to raise
        # IndexError; just skip it.
        if not lines:
            continue

        crate_info = json.loads(lines[-1])
        if not any(d["name"] == "regex" for d in crate_info["deps"]):
            continue

        if crate_info["yanked"]:
            continue
        yield (crate_info["name"], crate_info["vers"])
105
106
def iter_files(d, exclude=frozenset()):
    """Recursively yield absolute paths of all regular files under `d`,
    skipping any entry (file or directory) whose basename is in `exclude`.

    BUG FIX: the default was the mutable `set()`, a classic shared-default
    pitfall; `frozenset()` is behaviorally identical here and immutable.
    """
    for entry in os.listdir(d):
        if entry in exclude:
            continue

        # os.path.join instead of manual "/" concatenation.
        full = os.path.abspath(os.path.join(d, entry))
        if os.path.isfile(full):
            yield full
        elif os.path.isdir(full):
            yield from iter_files(full, exclude)
118
119
class Crate(object):
    """Context manager for one crate release.

    On entry the release tarball is downloaded from crates.io into
    `work_dir` and unpacked with `tar`; on exit the tarball and the
    unpacked tree are removed (best effort).
    """

    def __init__(self, work_dir, name, version):
        self.name = name
        self.version = version
        # crates.io download endpoint for this exact release.
        self.url = ("https://crates.io/api/v1/crates/{name}/{version}/download"
                    .format(name=self.name, version=self.version))
        self.filename = "{}/{}-{}.tar.gz".format(
                            work_dir, self.name, self.version)

    def __enter__(self):
        """Download and unpack the tarball, retrying failed downloads."""
        max_retries = 1
        retries = 0
        while retries < max_retries:
            retries += 1

            r = http.request("GET", self.url, preload_content=False)
            try:
                # BUG FIX: the message used max_retries + 1 and printed
                # "[1/2]" even though only max_retries attempts are made.
                print("[{}/{}] Downloading {}".format(
                        retries, max_retries, self.url))
                # Stream the body to disk in 1 KiB chunks.
                with open(self.filename, "wb") as f:
                    while True:
                        data = r.read(1024)
                        if not data:
                            break
                        f.write(data)
            except Exception:
                # Back off briefly, release the connection, and retry
                # (if any attempts remain).
                time.sleep(1)
                r.release_conn()
                continue

            r.release_conn()
            break

        call(["tar", "-xf", self.filename])

        return self

    def __exit__(self, ty, value, tb):
        # We are going to clean up the whole temp dir anyway, so
        # we don't really need to do this. Its nice to clean up
        # after ourselves though.
        try:
            shutil.rmtree(self.filename[:-len(".tar.gz")])
            os.remove(self.filename)
        except Exception:
            pass

    def iter_srcs(self):
        """Yield the path of every .rs file in the unpacked crate."""
        g = "{crate}/**/*.rs".format(crate=self.filename[:-len(".tar.gz")])
        for rsrc in glob.iglob(g):
            yield rsrc

    def iter_lines(self):
        """Yield every line of every Rust source file in the crate."""
        for src in self.iter_srcs():
            with open(src) as f:
                for line in f:
                    yield line
177
178
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
181