import * as assert from 'node:assert';
import * as fs from 'node:fs';
import * as path from 'node:path';
import type { ParserError, Token } from 'parse5';
import { type Tokenizer, TokenizerMode, type TokenHandler } from 'parse5';
import { makeChunks } from './common.js';

export type HtmlLibToken = [string, string | null, ...unknown[]];

interface TokenError {
    code: string;
    line: number;
    col: number;
}

interface TokenSourceData {
    tokens: HtmlLibToken[];
    errors: TokenError[];
}

type TokenSourceCreator = (data: TokenizeHandler) => Tokenizer;

/** Receives events and immediately compares them against the expected values. We check the entire output again at the end. */
class TokenizeHandler implements TokenSourceData, TokenHandler {
    constructor(private testData: LoadedTest) {}

    private addToken(token: HtmlLibToken): void {
        assert.deepStrictEqual(token, this.testData.expected[this.tokens.length]);

        this.tokens.push(token);
    }

    onComment(token: Token.CommentToken): void {
        this.addToken(['Comment', token.data]);
    }
    onDoctype(token: Token.DoctypeToken): void {
        this.addToken(['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]);
    }
    onStartTag(token: Token.TagToken): void {
        const reformatedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value]));
        const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformatedAttrs];

        if (token.selfClosing) {
            startTagEntry.push(true);
        }

        this.addToken(startTagEntry);
    }
    onEndTag(token: Token.TagToken): void {
        // NOTE: parser feedback simulator can produce adjusted SVG
        // tag names for end tag tokens so we need to lower case it
        this.addToken(['EndTag', token.tagName.toLowerCase()]);
    }
    onEof(): void {
        this.sawEof = true;
    }
    onCharacter(token: Token.CharacterToken): void {
        const lastEntry = this.tokens[this.tokens.length - 1];

        if (lastEntry && lastEntry[0] === 'Character' && lastEntry[1] != null) {
            lastEntry[1] += token.chars;
        } else {
            this.tokens.push(['Character', token.chars]);
        }

        const actual = this.tokens[this.tokens.length - 1];
        const expected = this.testData.expected[this.tokens.length - 1];
        assert.strictEqual('Character', expected[0]);
        assert.ok(typeof actual[1] === 'string');
        assert.ok(expected[1]?.startsWith(actual[1]));
    }
    onNullCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }
    onWhitespaceCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }
    onParseError(err: ParserError): void {
        assert.ok(
            this.testData.expectedErrors.some(
                ({ code, line, col }) => code === err.code && line === err.startLine && col === err.startCol
            )
        );

        this.errors.push({
            code: err.code,
            line: err.startLine,
            col: err.startCol,
        });
    }

    public sawEof = false;
    public tokens: HtmlLibToken[] = [];
    public errors: TokenError[] = [];
}

function tokenize(createTokenSource: TokenSourceCreator, chunks: string[], testData: LoadedTest): TokenSourceData {
    const result = new TokenizeHandler(testData);
    const tokenizer = createTokenSource(result);

    // NOTE: set small waterline for testing purposes
    tokenizer.preprocessor.bufferWaterline = 8;
    tokenizer.state = testData.initialState;

    if (testData.lastStartTag) {
        tokenizer.lastStartTagName = testData.lastStartTag;
    }

    for (let i = 0; i < chunks.length; i++) {
        assert.ok(!result.sawEof);
        tokenizer.write(chunks[i], i === chunks.length - 1);
    }

    assert.ok(result.sawEof);
    assert.ok(!tokenizer.active);

    // Sort errors by line and column
    result.errors.sort((err1, err2) => err1.line - err2.line || err1.col - err2.col);

    return result;
}

function unicodeUnescape(str: string): string {
    return str.replace(/\\[Uu]\w{4}/g, (match: string) => String.fromCharCode(Number.parseInt(match.slice(2), 16)));
}

function unescapeDescrIO(testDescr: TestDescription): void {
    testDescr.input = unicodeUnescape(testDescr.input);

    for (const tokenEntry of testDescr.output) {
        //NOTE: unescape token tagName (for StartTag and EndTag tokens), comment data (for Comment token),
        //character token data (for Character token).
        if (tokenEntry[1]) {
            tokenEntry[1] = unicodeUnescape(tokenEntry[1]);
        }
    }
}

function getTokenizerSuitableStateName(testDataStateName: string): Tokenizer['state'] {
    const name = testDataStateName.slice(0, -6).replace(' ', '_').toUpperCase();
    return TokenizerMode[name as keyof typeof TokenizerMode];
}

interface TestDescription {
    initialStates: string[];
    doubleEscaped?: boolean;
    output: HtmlLibToken[];
    description: string;
    input: string;
    lastStartTag: string;
    errors?: TokenError[];
}

interface LoadedTest {
    idx: number;
    setName: string;
    name: string;
    input: string;
    expected: HtmlLibToken[];
    initialState: Tokenizer['state'];
    initialStateName: string;
    lastStartTag: string;
    expectedErrors: TokenError[];
}

function loadTests(dataDirPath: string): LoadedTest[] {
    const testSetFileNames = fs.readdirSync(dataDirPath);
    const tests: LoadedTest[] = [];
    let testIdx = 0;

    for (const fileName of testSetFileNames) {
        if (path.extname(fileName) !== '.test') {
            continue;
        }

        const filePath = path.join(dataDirPath, fileName);
        const testSetJson = fs.readFileSync(filePath).toString();
        const testSet = JSON.parse(testSetJson);
        const testDescrs: TestDescription[] = testSet.tests;

        if (!testDescrs) {
            continue;
        }

        const setName = fileName.replace('.test', '');

        for (const descr of testDescrs) {
            if (!descr.initialStates) {
                descr.initialStates = ['Data state'];
            }

            if (descr.doubleEscaped) {
                unescapeDescrIO(descr);
            }

            const expected = descr.output;

            for (const initialStateName of descr.initialStates) {
                tests.push({
                    idx: ++testIdx,
                    setName,
                    name: descr.description,
                    input: descr.input,
                    expected,
                    initialState: getTokenizerSuitableStateName(initialStateName),
                    initialStateName,
                    lastStartTag: descr.lastStartTag,
                    expectedErrors: descr.errors || [],
                });
            }
        }
    }

    return tests;
}

export function generateTokenizationTests(
    prefix: string,
    testSuite: string,
    createTokenSource: TokenSourceCreator
): void {
    for (const testData of loadTests(testSuite)) {
        const testName = `${prefix} - ${testData.idx}.${testData.setName} - ${testData.name} - Initial state: ${testData.initialStateName}`;

        it(testName, () => {
            const chunks = makeChunks(testData.input);
            const result = tokenize(createTokenSource, chunks, testData);

            assert.deepEqual(result.tokens, testData.expected, `Chunks: ${JSON.stringify(chunks)}`);
            assert.deepEqual(result.errors, testData.expectedErrors || []);
        });
    }
}
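
// Minimal usage sketch (an assumption for illustration, not part of this module):
// a test file calls `generateTokenizationTests` with a name prefix, a path to the
// html5lib tokenizer fixtures, and a factory that attaches the provided
// TokenizeHandler to a concrete tokenizer. The suite path and tokenizer options
// below are hypothetical placeholders.
//
//   generateTokenizationTests(
//       'Tokenizer',
//       'test/data/html5lib-tests/tokenizer', // hypothetical fixture directory
//       (handler) => new Tokenizer({ sourceCodeLocationInfo: true }, handler)
//   );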