regex/scripts/scrape_crates_io.py

#!/usr/bin/env python3

from subprocess import call
import argparse
import datetime
import glob
import json
import os
import re
import shutil
import tempfile
import time
import urllib3

# Git URL of the crates.io index; cloned by download_crates_index().
CRATES_IO_INDEX_GIT_LOC = "https://github.com/rust-lang/crates.io-index.git"
# Captures the string-literal argument (optionally `r`-prefixed, quotes
# included) of a `Regex::new(...)` call that fits on a single line. The match
# is non-greedy, so it stops at the first `")` it finds.
RE_REGEX = re.compile(r"Regex::new\((r?\".*?\")\)")
# Crate names that main() skips without downloading.
KNOWN_UNMAINTAINED_CRATES = set(["queryst-prime", "oozz"])

# If only requests were in the standard library...
# Silence urllib3's warnings before creating the pool.
urllib3.disable_warnings()
# Connection pool used by Crate.__enter__ for every download.
http = urllib3.PoolManager()


def argparser():
    """Build the command-line parser for this script.

    Options:
        -c / --crates-index  directory holding a checkout of crates.io-index
                             (optional; defaults to None).
        -o / --output-file   name of the file to generate
                             (defaults to "crates_regex.rs").

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    # The text must be passed as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this sentence
    # where the program name belongs in the usage line.
    p = argparse.ArgumentParser(
        description="A script to scrape crates.io for regex.")
    p.add_argument("-c", "--crates-index", metavar="CRATES_INDEX_DIR",
                   help=("A directory where we can find crates.io-index "
                         + "(if this isn't set it will be automatically "
                         + "downloaded)."))
    p.add_argument("-o", "--output-file", metavar="OUTPUT",
                   default="crates_regex.rs",
                   help="The name of the output file to create.")
    return p


# Header written at the top of the generated file. `{date}` is filled in via
# str.format(); the two trailing blank lines separate the header from the
# entries that follow it.
PRELUDE = (
    "// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py'\n"
    "// on {date}.\n"
    "\n"
    "\n"
)


def main():
    """Scrape regex literals from every crate that depends on `regex`.

    Parses the command line, writes PRELUDE to the output file, and then, for
    each (name, version) yielded by iter_crates() that is not listed in
    KNOWN_UNMAINTAINED_CRATES, downloads the crate and scans its source lines
    with RE_REGEX. Every accepted match is appended to the output file as a
    comment line followed by a `consistent!(...)` invocation.

    All downloads happen inside a temporary scratch directory. The scratch
    directory is removed, the original working directory restored, and the
    output file closed even when scraping fails part-way through.
    """
    args = argparser().parse_args()

    # Resolve user-supplied paths now: they are relative to the directory the
    # script was started from, and we chdir away from it below.
    out_path = os.path.abspath(args.output_file)
    if args.crates_index:
        args.crates_index = os.path.abspath(args.crates_index)

    with open(out_path, "w") as out:
        out.write(PRELUDE.format(date=str(datetime.datetime.now())))

        # enter our scratch directory
        old_dir = os.getcwd()
        work_dir = tempfile.mkdtemp(prefix="scrape-crates-io")
        try:
            os.chdir(work_dir)

            # Clone the index only when the user did not point us at one.
            # (Testing os.path.join(...) here would always be truthy with -c
            # and raise TypeError without it.)
            crates_index = args.crates_index or download_crates_index()

            for (name, vers) in iter_crates(crates_index):
                if name in KNOWN_UNMAINTAINED_CRATES:
                    continue

                with Crate(work_dir, name, vers) as c:
                    # Per-crate counter that keeps the generated identifiers
                    # (`<crate_name>_<i>`) distinct within one crate.
                    i = 0
                    for line in c.iter_lines():
                        for r in RE_REGEX.findall(line):
                            print((name, vers, r))
                            # Skip literals whose closing quote is preceded by
                            # a backslash; presumably the match was cut short
                            # at an escaped quote.
                            if len(r) >= 2 and r[-2] == "\\":
                                continue
                            out.write("// {}-{}: {}\n".format(name, vers, r))
                            out.write("consistent!({}_{}, {});\n\n".format(
                                        name.replace("-", "_"), i, r))
                            # Flush per entry so partial results survive an
                            # interrupted run.
                            out.flush()
                            i += 1
        finally:
            # Leave the scratch directory
            os.chdir(old_dir)
            shutil.rmtree(work_dir)


def download_crates_index():
    """Clone the crates.io index into the current working directory.

    Returns:
        str: "crates.io-index", the directory created by the clone, relative
        to the current working directory.

    Raises:
        SystemExit: with status 1 when git cannot be started or the clone
        exits with a non-zero status.
    """
    # Imported locally: nothing else in this module needs sys, and the
    # builtin `exit` helper is only injected by `site` for interactive use.
    import sys

    try:
        status = call(["git", "clone", CRATES_IO_INDEX_GIT_LOC])
    except OSError as e:
        # Raised when the git executable itself cannot be launched.
        print("Error cloning the crates.io index: {}".format(e),
              file=sys.stderr)
        sys.exit(1)

    if status != 0:
        print("Error cloning the crates.io index", file=sys.stderr)
        sys.exit(1)
    return "crates.io-index"


def iter_crates(crates_index):
    """Yield (name, version) for crates whose newest release uses `regex`.

    Walks every file under `crates_index` (skipping anything named
    "config.json" or ".git"), parses the last non-blank line of each file as
    JSON, and yields `(crate_info["name"], crate_info["vers"])` when that
    entry lists a dependency named "regex" and is not yanked.

    Args:
        crates_index: path to a checkout of the crates.io index.

    Yields:
        tuple: (crate name, version string) of the last entry in each
        qualifying index file.

    Raises:
        ValueError: if the last non-blank line of a file is not valid JSON.
        KeyError: if that entry lacks "deps", "yanked", "name" or "vers".
    """
    exclude = set(["config.json", ".git"])
    for crate_index_file in iter_files(crates_index, exclude=exclude):
        # Keep only the last non-blank line instead of loading the whole
        # file; the file is closed again before we yield, so no handle is
        # held open while the caller downloads and scans the crate.
        most_recent = None
        with open(crate_index_file) as f:
            for line in f:
                if line.strip():
                    most_recent = line

        if most_recent is None:
            # Empty file: there is no release to look at.
            continue

        crate_info = json.loads(most_recent)
        if not any(d["name"] == "regex" for d in crate_info["deps"]):
            continue

        if crate_info["yanked"]:
            continue
        yield (crate_info["name"], crate_info["vers"])


def iter_files(d, exclude=frozenset()):
    """Recursively yield the absolute path of every regular file under `d`.

    Args:
        d: directory to walk.
        exclude: collection of entry names to skip. The check is made on the
            bare file or directory name at every level of the tree, so an
            excluded directory is never descended into.

    Yields:
        str: absolute paths of files, in os.listdir() order, with each
        directory's contents yielded where that directory appears.
    """
    # The default is an immutable frozenset so no state can ever be shared
    # between calls through the default argument.
    for x in os.listdir(d):
        if x in exclude:
            continue

        fullfp = os.path.abspath(os.path.join(d, x))
        if os.path.isfile(fullfp):
            yield fullfp
        elif os.path.isdir(fullfp):
            for f in iter_files(fullfp, exclude):
                yield f


class Crate(object):
    """One crate release, downloaded and unpacked for the span of a `with`.

    Entering the context downloads the release tarball from crates.io into
    `work_dir` and extracts it there with `tar`. Leaving the context removes
    both the tarball and the extracted tree. Download and extraction are
    best-effort: failures are reported on stdout and the crate then simply
    yields no sources.

    Attributes:
        name: crate name.
        version: crate version string.
        url: crates.io download URL for this release.
        filename: path of the tarball inside `work_dir`; the extracted tree
            is expected at the same path without the ".tar.gz" suffix.
    """

    # Total number of download attempts (the first try plus one retry). This
    # matches the "[n/2]" progress message.
    MAX_ATTEMPTS = 2

    def __init__(self, work_dir, name, version):
        """Record where the crate comes from and where it will be stored.

        Args:
            work_dir: directory that receives the tarball and extracted tree.
            name: crate name.
            version: crate version string.
        """
        self.name = name
        self.version = version
        self.url = ("https://crates.io/api/v1/crates/{name}/{version}/download"
                    .format(name=self.name, version=self.version))
        self.filename = "{}/{}-{}.tar.gz".format(
                            work_dir, self.name, self.version)

    def _crate_dir(self):
        """Return the directory the tarball is expected to unpack into."""
        return self.filename[:-len(".tar.gz")]

    def _download(self):
        """Stream `self.url` into `self.filename`, raising on any failure."""
        r = http.request("GET", self.url, preload_content=False)
        try:
            with open(self.filename, "wb") as f:
                while True:
                    data = r.read(1024)
                    if not data:
                        break
                    f.write(data)
            # Checked after draining the body so the connection goes back to
            # the pool in a clean state either way.
            if r.status != 200:
                raise IOError("unexpected HTTP status {}".format(r.status))
        finally:
            r.release_conn()

    def __enter__(self):
        """Download and extract the crate; return `self` even on failure."""
        for attempt in range(1, self.MAX_ATTEMPTS + 1):
            print("[{}/{}] Downloading {}".format(
                    attempt, self.MAX_ATTEMPTS, self.url))
            try:
                self._download()
            except Exception as e:
                # Retry boundary: any failure (network, HTTP status, disk)
                # is reported and the next attempt is made after a pause.
                print("Download of {} failed: {}".format(self.url, e))
                if attempt < self.MAX_ATTEMPTS:
                    time.sleep(1)
                continue

            # Extract next to the tarball rather than into whatever the
            # current directory happens to be, so iter_srcs() finds the tree.
            dest = os.path.dirname(self.filename) or "."
            if call(["tar", "-xf", self.filename, "-C", dest]) != 0:
                print("Failed to extract {}".format(self.filename))
            break

        return self

    def __exit__(self, ty, value, tb):
        """Remove the extracted tree and the tarball (best effort)."""
        # We are going to clean up the whole temp dir anyway, so
        # we don't really need to do this. It's nice to clean up
        # after ourselves though.
        #
        # The two removals are independent: a missing tree (e.g. after a
        # failed extraction) must not prevent deleting the tarball.
        shutil.rmtree(self._crate_dir(), ignore_errors=True)
        try:
            os.remove(self.filename)
        except OSError:
            # The tarball may never have been written.
            pass

    def iter_srcs(self):
        """Yield the path of every `.rs` file under the extracted crate."""
        g = "{crate}/**/*.rs".format(crate=self._crate_dir())
        # Without recursive=True, `**` behaves like `*` and only files
        # exactly one directory deep would match.
        for rsrc in glob.iglob(g, recursive=True):
            yield rsrc

    def iter_lines(self):
        """Yield every line of every Rust source file in the crate."""
        for src in self.iter_srcs():
            # Rust sources are UTF-8; decode explicitly instead of relying
            # on the locale, and replace stray bytes rather than aborting
            # the whole scrape on one bad file.
            with open(src, encoding="utf-8", errors="replace") as f:
                for line in f:
                    yield line


# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()