/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "src/sksl/lex/NFAtoDFA.h"
#include "src/sksl/lex/RegexParser.h"
#include "src/sksl/lex/TransitionTable.h"

#include <fstream>
#include <sstream>
#include <string>

/**
 * Processes a .lex file and produces .h and .cpp files which implement a lexical analyzer. The .lex
 * file is a text file with one token definition per line. Each line is of the form:
 *     <TOKEN_NAME> = <pattern>
 * where <pattern> is either a regular expression (e.g [0-9]) or a double-quoted literal string.
 */
// Banner emitted verbatim at the top of every generated .h/.cpp file so that
// readers know the output is machine-produced and covered by the same license.
static constexpr const char* HEADER = R"(/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
/*****************************************************************************************
 ******************** This file was generated by sksllex. Do not edit. *******************
 *****************************************************************************************/
)";

writeH(const DFA & dfa,const char * lexer,const char * token,const std::vector<std::string> & tokens,const char * hPath)34 static void writeH(const DFA& dfa, const char* lexer, const char* token,
35 const std::vector<std::string>& tokens, const char* hPath) {
36 std::ofstream out(hPath);
37 SkASSERT(out.good());
38 out << HEADER;
39 out << "#ifndef SKSL_" << lexer << "\n";
40 out << "#define SKSL_" << lexer << "\n";
41 out << "#include \"include/core/SkStringView.h\"\n";
42 out << "#include <cstddef>\n";
43 out << "#include <cstdint>\n";
44 out << "namespace SkSL {\n";
45 out << "\n";
46 out << "struct " << token << " {\n";
47 out << " enum class Kind {\n";
48 for (const std::string& t : tokens) {
49 out << " TK_" << t << ",\n";
50 }
51 out << " TK_NONE,";
52 out << R"(
53 };
54
55 )" << token << "() {}";
56
57 out << token << R"((Kind kind, int32_t offset, int32_t length, int32_t line)
58 : fKind(kind)
59 , fOffset(offset)
60 , fLength(length)
61 , fLine(line) {}
62
63 Kind fKind = Kind::TK_NONE;
64 int32_t fOffset = -1;
65 int32_t fLength = -1;
66 int32_t fLine = -1;
67 };
68
69 class )" << lexer << R"( {
70 public:
71 void start(skstd::string_view text) {
72 fText = text;
73 fOffset = 0;
74 fLine = 1;
75 }
76
77 )" << token << R"( next();
78
79 struct Checkpoint {
80 int32_t fOffset;
81 int32_t fLine;
82 };
83
84 Checkpoint getCheckpoint() const {
85 return {fOffset, fLine};
86 }
87
88 void rewindToCheckpoint(Checkpoint checkpoint) {
89 fOffset = checkpoint.fOffset;
90 fLine = checkpoint.fLine;
91 }
92
93 private:
94 skstd::string_view fText;
95 int32_t fOffset;
96 int32_t fLine;
97 };
98
99 } // namespace
100 #endif
101 )";
102 }
103
writeCPP(const DFA & dfa,const char * lexer,const char * token,const char * include,const char * cppPath)104 static void writeCPP(const DFA& dfa, const char* lexer, const char* token, const char* include,
105 const char* cppPath) {
106 std::ofstream out(cppPath);
107 SkASSERT(out.good());
108 out << HEADER;
109 out << "#include \"" << include << "\"\n";
110 out << "\n";
111 out << "namespace SkSL {\n";
112 out << "\n";
113
114 size_t states = 0;
115 for (const auto& row : dfa.fTransitions) {
116 states = std::max(states, row.size());
117 }
118 out << "using State = " << (states <= 256 ? "uint8_t" : "uint16_t") << ";\n";
119 // arbitrarily-chosen character which is greater than START_CHAR and should not appear in actual
120 // input
121 out << "static const uint8_t INVALID_CHAR = 18;";
122 out << "static const int8_t kMappings[" << dfa.fCharMappings.size() << "] = {\n ";
123 const char* separator = "";
124 for (int m : dfa.fCharMappings) {
125 out << separator << std::to_string(m);
126 separator = ", ";
127 }
128 out << "\n};\n";
129
130 WriteTransitionTable(out, dfa, states);
131
132 out << "static const int8_t kAccepts[" << states << "] = {";
133 for (size_t i = 0; i < states; ++i) {
134 if (i < dfa.fAccepts.size()) {
135 out << " " << dfa.fAccepts[i] << ",";
136 } else {
137 out << " " << INVALID << ",";
138 }
139 }
140 out << " };\n";
141 out << "\n";
142
143 out << token << " " << lexer << "::next() {";
144 out << R"(
145 // note that we cheat here: normally a lexer needs to worry about the case
146 // where a token has a prefix which is not itself a valid token - for instance,
147 // maybe we have a valid token 'while', but 'w', 'wh', etc. are not valid
148 // tokens. Our grammar doesn't have this property, so we can simplify the logic
149 // a bit.
150 int32_t startOffset = fOffset;
151 if (startOffset == (int32_t)fText.length()) {
152 return )" << token << "(" << token << R"(::Kind::TK_END_OF_FILE, startOffset, 0, fLine);
153 }
154 State state = 1;
155 for (;;) {
156 if (fOffset >= (int32_t)fText.length()) {
157 if (kAccepts[state] == -1) {
158 return Token(Token::Kind::TK_END_OF_FILE, startOffset, 0, fLine);
159 }
160 break;
161 }
162 uint8_t c = (uint8_t) fText[fOffset];
163 if (c <= 8 || c >= )" << dfa.fCharMappings.size() << R"() {
164 c = INVALID_CHAR;
165 }
166 State newState = get_transition(kMappings[c], state);
167 if (!newState) {
168 break;
169 }
170 state = newState;
171 ++fOffset;
172 if (c == '\n') {
173 ++fLine;
174 }
175 }
176 Token::Kind kind = ()" << token << R"(::Kind) kAccepts[state];
177 return )" << token << R"((kind, startOffset, fOffset - startOffset, fLine);
178 }
179
180 } // namespace
181 )";
182 }
183
process(const char * inPath,const char * lexer,const char * token,const char * hPath,const char * cppPath)184 static void process(const char* inPath, const char* lexer, const char* token, const char* hPath,
185 const char* cppPath) {
186 NFA nfa;
187 std::vector<std::string> tokens;
188 tokens.push_back("END_OF_FILE");
189 std::string line;
190 std::ifstream in(inPath);
191 while (std::getline(in, line)) {
192 if (line.length() == 0) {
193 continue;
194 }
195 if (line.length() >= 2 && line[0] == '/' && line[1] == '/') {
196 continue;
197 }
198 std::istringstream split(line);
199 std::string name, delimiter, pattern;
200 if (split >> name >> delimiter >> pattern) {
201 SkASSERT(split.eof());
202 SkASSERT(name != "");
203 SkASSERT(delimiter == "=");
204 SkASSERT(pattern != "");
205 tokens.push_back(name);
206 if (pattern[0] == '"') {
207 SkASSERT(pattern.size() > 2 && pattern[pattern.size() - 1] == '"');
208 RegexNode node = RegexNode(RegexNode::kChar_Kind, pattern[1]);
209 for (size_t i = 2; i < pattern.size() - 1; ++i) {
210 node = RegexNode(RegexNode::kConcat_Kind, node,
211 RegexNode(RegexNode::kChar_Kind, pattern[i]));
212 }
213 nfa.addRegex(node);
214 }
215 else {
216 nfa.addRegex(RegexParser().parse(pattern));
217 }
218 }
219 }
220 NFAtoDFA converter(&nfa);
221 DFA dfa = converter.convert();
222 writeH(dfa, lexer, token, tokens, hPath);
223 writeCPP(dfa, lexer, token, (std::string("src/sksl/SkSL") + lexer + ".h").c_str(), cppPath);
224 }
225
main(int argc,const char ** argv)226 int main(int argc, const char** argv) {
227 if (argc != 6) {
228 printf("usage: sksllex <input.lex> <lexername> <tokenname> <output.h> <output.cpp>\n");
229 exit(1);
230 }
231 process(argv[1], argv[2], argv[3], argv[4], argv[5]);
232 return 0;
233 }
234